diff --git a/projects/rocshmem/CMakeLists.txt b/projects/rocshmem/CMakeLists.txt index bce7de7cbb..59e9522d37 100644 --- a/projects/rocshmem/CMakeLists.txt +++ b/projects/rocshmem/CMakeLists.txt @@ -74,7 +74,7 @@ option(BUILD_UNIT_TESTS "Build the unit tests" ON) set(ROCM_PATH "" CACHE PATH "ROCm path to use") -configure_file(cmake/config.h.in config.h) +configure_file(cmake/rocshmem_config.h.in rocshmem_config.h) ############################################################################### # Validate user passed options @@ -179,7 +179,7 @@ target_include_directories( ${PROJECT_NAME} PUBLIC $ - $ # CONFIG.H + $ # rocshmem_config.h $ ) @@ -190,7 +190,7 @@ set_target_properties( ${PROJECT_NAME} PROPERTIES PUBLIC_HEADER - "${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp" + "${CMAKE_BINARY_DIR}/rocshmem_config.h" ) ############################################################################### @@ -386,6 +386,11 @@ install( COMPONENT bin ) +install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/ + DESTINATION ${INSTALL_INCLUDEDIR} + COMPONENT dev +) + install( EXPORT ${PROJECT_NAME}Targets diff --git a/projects/rocshmem/cmake/config.h.in b/projects/rocshmem/cmake/rocshmem_config.h.in similarity index 100% rename from projects/rocshmem/cmake/config.h.in rename to projects/rocshmem/cmake/rocshmem_config.h.in diff --git a/projects/rocshmem/examples/rocshmem_allreduce_test.cc b/projects/rocshmem/examples/rocshmem_allreduce_test.cc index fdd81fa8a2..c6f4d05dc9 100644 --- a/projects/rocshmem/examples/rocshmem_allreduce_test.cc +++ b/projects/rocshmem/examples/rocshmem_allreduce_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_alltoall_test.cc 
b/projects/rocshmem/examples/rocshmem_alltoall_test.cc index 36847abf3a..775580fba2 100644 --- a/projects/rocshmem/examples/rocshmem_alltoall_test.cc +++ b/projects/rocshmem/examples/rocshmem_alltoall_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_broadcast_test.cc b/projects/rocshmem/examples/rocshmem_broadcast_test.cc index 9958707893..4a630c75db 100644 --- a/projects/rocshmem/examples/rocshmem_broadcast_test.cc +++ b/projects/rocshmem/examples/rocshmem_broadcast_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_broadcast_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_getmem_test.cc b/projects/rocshmem/examples/rocshmem_getmem_test.cc index 9c4a419ddd..942b43068d 100644 --- a/projects/rocshmem/examples/rocshmem_getmem_test.cc +++ b/projects/rocshmem/examples/rocshmem_getmem_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_getmem_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_put_signal_test.cc b/projects/rocshmem/examples/rocshmem_put_signal_test.cc index da52562124..9e41bc4529 100644 --- a/projects/rocshmem/examples/rocshmem_put_signal_test.cc +++ b/projects/rocshmem/examples/rocshmem_put_signal_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_put_signal_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/include/rocshmem/rocshmem.hpp b/projects/rocshmem/include/rocshmem/rocshmem.hpp index 4dd33895ef..20fa559e72 100644 --- 
a/projects/rocshmem/include/rocshmem/rocshmem.hpp +++ b/projects/rocshmem/include/rocshmem/rocshmem.hpp @@ -26,7 +26,14 @@ #include #include -#include "config.h" +#include "rocshmem_config.h" +#include "rocshmem_common.hpp" +#include "rocshmem_RMA.hpp" +#include "rocshmem_AMO.hpp" +#include "rocshmem_SIG_OP.hpp" +#include "rocshmem_COLL.hpp" +#include "rocshmem_P2P_SYNC.hpp" +#include "rocshmem_RMA_X.hpp" /** * @file rocshmem.hpp * @brief Public header for rocSHMEM device and host libraries. @@ -43,106 +50,6 @@ namespace rocshmem { -#ifdef USE_FUNC_CALL -#define ATTR_NO_INLINE __attribute__((noinline)) -#else -#define ATTR_NO_INLINE -#endif - - -enum ROCSHMEM_STATUS { - ROCSHMEM_SUCCESS = 0, - ROCSHMEM_ERROR = 1, -}; - -enum ROCSHMEM_OP { - ROCSHMEM_SUM, - ROCSHMEM_MAX, - ROCSHMEM_MIN, - ROCSHMEM_PROD, - ROCSHMEM_AND, - ROCSHMEM_OR, - ROCSHMEM_XOR, - ROCSHMEM_REPLACE -}; - -enum ROCSHMEM_SIGNAL_OPS { - ROCSHMEM_SIGNAL_SET, - ROCSHMEM_SIGNAL_ADD, -}; - -/** - * @brief Types defined for rocshmem_wait() operations. - */ -enum rocshmem_cmps { - ROCSHMEM_CMP_EQ, - ROCSHMEM_CMP_NE, - ROCSHMEM_CMP_GT, - ROCSHMEM_CMP_GE, - ROCSHMEM_CMP_LT, - ROCSHMEM_CMP_LE, -}; - -enum rocshmem_thread_ops { - ROCSHMEM_THREAD_SINGLE, - ROCSHMEM_THREAD_FUNNELED, - ROCSHMEM_THREAD_WG_FUNNELED, - ROCSHMEM_THREAD_SERIALIZED, - ROCSHMEM_THREAD_MULTIPLE -}; - -/** - * @brief Bitwise flags to mask configuration parameters. 
- */ -enum rocshmem_team_configs { - ROCSHMEM_TEAM_DEFAULT_CONFIGS, - ROCSHMEM_TEAM_NUM_CONTEXTS -}; - -typedef struct { - int num_contexts; -} rocshmem_team_config_t; - -constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024; -constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024); -constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256; -constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256; -// Internally calls sync function, which matches barrier implementation -constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE; -constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1; -constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE; -constexpr size_t ROCSHMEM_SYNC_VALUE = 0; - -const int ROCSHMEM_CTX_ZERO = 0; -const int ROCSHMEM_CTX_NOSTORE = 1; -const int ROCSHMEM_CTX_SERIALIZED = 2; -const int ROCSHMEM_CTX_WG_PRIVATE = 4; -const int ROCSHMEM_CTX_SHARED = 8; - -/** - * @brief GPU side OpenSHMEM context created from each work-groups' - * rocshmem_wg_handle_t - */ -typedef struct { - void *ctx_opaque; - void *team_opaque; -} rocshmem_ctx_t; - -/** - * Shmem default context. - */ -extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT; - -/** - * Used internally to set default context. - */ -void set_internal_ctx(rocshmem_ctx_t *ctx); - -typedef uint64_t *rocshmem_team_t; -extern rocshmem_team_t ROCSHMEM_TEAM_WORLD; - -const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr; - /****************************************************************************** **************************** HOST INTERFACE ********************************** *****************************************************************************/ @@ -322,102 +229,6 @@ __host__ int rocshmem_team_split_strided(rocshmem_team_t parent_team, */ __host__ void rocshmem_team_destroy(rocshmem_team_t team); -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. 
The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into __host__ rocshmem_quiet() if remote completion is required. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, int pe); - -__host__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, - int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * _host__ rocshmem_quiet() if completion notification is required. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe); - -__host__ void rocshmem_putmem_nbi(void *dest, const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-thread - * granularity. 
However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, int pe); - -__host__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, - int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller will - * return as soon as the request is posted. The caller must call - * __host__ rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe); - -__host__ void rocshmem_getmem_nbi(void *dest, const void *source, - size_t nelems, int pe); - /** * @brief Guarantees order between messages in this context in accordance with * OpenSHMEM semantics. 
@@ -549,121 +360,6 @@ __device__ ATTR_NO_INLINE int rocshmem_wg_team_create_ctx( */ __device__ ATTR_NO_INLINE void rocshmem_wg_ctx_destroy(rocshmem_ctx_t *ctx); -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem(void *dest, const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. 
Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem(void *dest, const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller will - * return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. 
However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi(void *dest, - const void *source, - size_t nelems, int pe); - /** * @brief Guarantees order between messages in this context in accordance with * OpenSHMEM semantics. @@ -844,1858 +540,6 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system( __device__ ATTR_NO_INLINE void rocshmem_threadfence_system(); -/* - * MACRO DECLARE SHMEM_REDUCTION APIs - */ -#define REDUCTION_API_GEN(T, TNAME, Op_API) \ - __device__ ATTR_NO_INLINE int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nreduce); \ - __host__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nreduce); - -#define ARITH_REDUCTION_API_GEN(T, TNAME) \ - REDUCTION_API_GEN(T, TNAME, sum) \ - REDUCTION_API_GEN(T, TNAME, min) \ - REDUCTION_API_GEN(T, TNAME, max) \ - REDUCTION_API_GEN(T, TNAME, prod) - -#define BITWISE_REDUCTION_API_GEN(T, TNAME) \ - REDUCTION_API_GEN(T, TNAME, or) \ - REDUCTION_API_GEN(T, TNAME, and) \ - REDUCTION_API_GEN(T, TNAME, xor) - -#define INT_REDUCTION_API_GEN(T, TNAME) \ - ARITH_REDUCTION_API_GEN(T, TNAME) \ - BITWISE_REDUCTION_API_GEN(T, TNAME) - -#define FLOAT_REDUCTION_API_GEN(T, TNAME) ARITH_REDUCTION_API_GEN(T, TNAME) - -/* - * MACRO 
DECLARE SHMEM_BROADCAST APIs - */ -#define BROADCAST_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_broadcast( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nelem, int pe_root); /* NOLINT */ \ - __host__ void rocshmem_ctx_##TNAME##_broadcast( \ - rocshmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ - int pe_start, int log_pe_stride, int pe_size, \ - long *p_sync); /* NOLINT */ \ - __host__ void rocshmem_ctx_##TNAME##_broadcast( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nelem, int pe_root); /* NOLINT */ - -/* - * MACRO DECLARE SHMEM_ALLTOALL APIs - */ -#define ALLTOALL_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_alltoall( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nelem); /* NOLINT */ -/* - * MACRO DECLARE SHMEM_FCOLLECT APIs - */ -#define FCOLLECT_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_fcollect( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nelem); /* NOLINT */ - -/* - * MACRO DECLARE SHMEM_PUT APIs - */ -#define PUT_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put( \ - T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_put( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_put(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_P APIs - */ -#define P_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_p( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_p(T *dest, T value, \ - int pe); \ - __host__ void 
rocshmem_ctx_##TNAME##_p(rocshmem_ctx_t ctx, T *dest, \ - T value, int pe); \ - __host__ void rocshmem_##TNAME##_p(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_GET APIs - */ -#define GET_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get( \ - T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_get( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_get(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_G APIs - */ -#define G_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_g( \ - rocshmem_ctx_t ctx, const T *source, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_g(const T *source, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_g(rocshmem_ctx_t ctx, const T *source, \ - int pe); \ - __host__ T rocshmem_##TNAME##_g(const T *source, int pe); - -/* - * MACRO DECLARE SHMEM_PUT_NBI APIs - */ -#define PUT_NBI_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_nbi( \ - T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_put_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_put_nbi(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_GET_NBI APIs - */ -#define GET_NBI_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_nbi( \ - T *dest, const T *source, size_t nelems, int pe); \ 
- __host__ void rocshmem_ctx_##TNAME##_get_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_get_nbi(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_ADD APIs - */ -#define ATOMIC_FETCH_ADD_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_add( \ - T *dest, T value, int pe); \ - __host__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_add( \ - T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_COMPARE_SWAP APIs - */ -#define ATOMIC_COMPARE_SWAP_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ - rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_compare_swap( \ - T *dest, T cond, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ - rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_compare_swap(T *dest, T cond, T value, \ - int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_INC APIs - */ -#define ATOMIC_FETCH_INC_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_inc( \ - rocshmem_ctx_t ctx, T *dest, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, \ - int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_inc(rocshmem_ctx_t ctx, \ - T *dest, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH APIs - */ -#define ATOMIC_FETCH_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch( \ - rocshmem_ctx_t ctx, T *source, 
int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch(T *source, \ - int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch(rocshmem_ctx_t ctx, \ - T *source, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch(T *source, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_ADD APIs - */ -#define ATOMIC_ADD_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_add( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_add( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_add(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_add(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_SET APIs - */ -#define ATOMIC_SET_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_set( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_set( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_set(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_set(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_SWAP APIs - */ -#define ATOMIC_SWAP_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_swap( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_swap( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_swap(rocshmem_ctx_t ctx, T *dest, \ - T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_swap(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_AND APIs - */ -#define ATOMIC_FETCH_AND_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T 
rocshmem_##TNAME##_atomic_fetch_and( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_and(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_AND APIs - */ -#define ATOMIC_AND_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_and( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_and( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_and(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_and(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_OR APIs - */ -#define ATOMIC_FETCH_OR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_or( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_or( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_or( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_OR APIs - */ -#define ATOMIC_OR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_or( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_or( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_or(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_or(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_XOR APIs - */ -#define ATOMIC_FETCH_XOR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T 
rocshmem_##TNAME##_atomic_fetch_xor( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_xor(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_XOR APIs - */ -#define ATOMIC_XOR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_xor( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_xor( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_xor(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_xor(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_INC APIs - */ -#define ATOMIC_INC_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_inc( \ - rocshmem_ctx_t ctx, T *dest, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_inc(T *dest, \ - int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_inc(rocshmem_ctx_t ctx, \ - T *dest, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_inc(T *dest, int pe); - -/* - * MACRO DECLARE SHMEM_WAIT_UNTIL APIs - */ -#define WAIT_UNTIL_API_GEN(T, TNAME) \ - __device__ void rocshmem_##TNAME##_wait_until(T *ivars, \ - int cmp, \ - T val); \ - __device__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __device__ void rocshmem_##TNAME##_wait_until_all(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __device__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T val); \ - __device__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); \ - __device__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ - 
size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); \ - __device__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T* vals); \ - __host__ void rocshmem_##TNAME##_wait_until(T *ivars, \ - int cmp, \ - T val); \ - __host__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __host__ void rocshmem_##TNAME##_wait_until_all(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __host__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T val); \ - __host__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); \ - __host__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); \ - __host__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T* vals); - -/* - * MACRO DECLARE SHMEM_TEST APIs - */ -#define TEST_API_GEN(T, TNAME) \ - __device__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val); \ - __host__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val); - -/** - * @name SHMEM_REDUCTIONS - * @brief Perform an allreduce between PEs in the active set. The caller - * is blocked until the reduction completes. - * - * This function must be called as a work-group collective. - * - * @param[in] dest Destination address. Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nreduce Size of the buffer to participate in the reduction. - * @param[in] PE_start PE to start the reduction. - * @param[in] logPE_stride Stride of PEs participating in the reduction. 
- * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pWrk Temporary work buffer provided to rocSHMEM. Must - * be of size at least max(size/2 + 1, - ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE). - * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must - be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. - * @param[in] handle GPU side handle. - * - * @return void - */ -///@{ -INT_REDUCTION_API_GEN(int, int) -INT_REDUCTION_API_GEN(short, short) // NOLINT(runtime/int) -INT_REDUCTION_API_GEN(long, long) // NOLINT(runtime/int) -INT_REDUCTION_API_GEN(long long, longlong) // NOLINT(runtime/int) -FLOAT_REDUCTION_API_GEN(float, float) -FLOAT_REDUCTION_API_GEN(double, double) -// long double reduction fails. hipcc/device may not support long double. -// so disable it for now. -// FLOAT_REDUCTION_API_GEN(long double, longdouble) -///@} - -/** - * @name SHMEM_BROADCAST - * @brief Perform a broadcast between PEs in the active set. The caller - * is blocked until the broadcase completes. - * - * This function must be called as a work-group collective. - * - * @param[in] dest Destination address. Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nelement Size of the buffer to participate in the broadcast. - * @param[in] PE_root Zero-based ordinal of the PE, with respect to the - active set, from which the data is copied - * @param[in] PE_start PE to start the reduction. - * @param[in] logPE_stride Stride of PEs participating in the reduction. - * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must - be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. 
- * - * @return void - */ -///@{ -BROADCAST_API_GEN(float, float) -BROADCAST_API_GEN(double, double) -// BROADCAST_API_GEN(long double, longdouble) -BROADCAST_API_GEN(char, char) -BROADCAST_API_GEN(signed char, schar) -BROADCAST_API_GEN(short, short) // NOLINT(runtime/int) -BROADCAST_API_GEN(int, int) -BROADCAST_API_GEN(long, long) // NOLINT(runtime/int) -BROADCAST_API_GEN(long long, longlong) // NOLINT(runtime/int) -BROADCAST_API_GEN(unsigned char, uchar) -BROADCAST_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -BROADCAST_API_GEN(unsigned int, uint) -BROADCAST_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -BROADCAST_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_ALLTOALL - * @brief Exchanges a fixed amount of contiguous data blocks between all pairs - * of PEs participating in the collective routine. - * - * This function must be called as a work-group collective. - * - * @param[in] team The team participating in the collective. - * @param[in] dest Destination address. Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nelems Number of data blocks transferred per pair of PEs. 
- * - * @return void - */ -///@{ -ALLTOALL_API_GEN(float, float) -ALLTOALL_API_GEN(double, double) -// ALLTOALL_API_GEN(long double, longdouble) -ALLTOALL_API_GEN(char, char) -ALLTOALL_API_GEN(signed char, schar) -ALLTOALL_API_GEN(short, short) // NOLINT(runtime/int) -ALLTOALL_API_GEN(int, int) -ALLTOALL_API_GEN(long, long) // NOLINT(runtime/int) -ALLTOALL_API_GEN(long long, longlong) // NOLINT(runtime/int) -ALLTOALL_API_GEN(unsigned char, uchar) -ALLTOALL_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -ALLTOALL_API_GEN(unsigned int, uint) -ALLTOALL_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -ALLTOALL_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_FCOLLECT - * @brief Concatenates blocks of data from multiple PEs to an array in every - * PE participating in the collective routine. - * - * This function must be called as a work-group collective. - * - * @param[in] team The team participating in the collective. - * @param[in] dest Destination address. Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nelems Number of data blocks in source array. 
- * - * @return void - */ -///@{ -FCOLLECT_API_GEN(float, float) -FCOLLECT_API_GEN(double, double) -// FCOLLECT_API_GEN(long double, longdouble) -FCOLLECT_API_GEN(char, char) -FCOLLECT_API_GEN(signed char, schar) -FCOLLECT_API_GEN(short, short) // NOLINT(runtime/int) -FCOLLECT_API_GEN(int, int) -FCOLLECT_API_GEN(long, long) // NOLINT(runtime/int) -FCOLLECT_API_GEN(long long, longlong) // NOLINT(runtime/int) -FCOLLECT_API_GEN(unsigned char, uchar) -FCOLLECT_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -FCOLLECT_API_GEN(unsigned int, uint) -FCOLLECT_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -FCOLLECT_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_PUT - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_API_GEN(float, float) -PUT_API_GEN(double, double) -// PUT_API_GEN(long double, longdouble) -PUT_API_GEN(char, char) -PUT_API_GEN(signed char, schar) -PUT_API_GEN(short, short) // NOLINT(runtime/int) -PUT_API_GEN(int, int) -PUT_API_GEN(long, long) // NOLINT(runtime/int) -PUT_API_GEN(long long, longlong) // NOLINT(runtime/int) -PUT_API_GEN(unsigned char, uchar) -PUT_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -PUT_API_GEN(unsigned int, uint) -PUT_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -PUT_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_P - * @brief Writes a single value to \p dest at \p pe PE to \p dst at \p pe. - * The caller must call into rocshmem_quiet() if remote completion is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] value Value to write to dest at \p pe. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -P_API_GEN(float, float) -P_API_GEN(double, double) -// P_API_GEN(long double, longdouble) -P_API_GEN(char, char) -P_API_GEN(signed char, schar) -P_API_GEN(short, short) // NOLINT(runtime/int) -P_API_GEN(int, int) -P_API_GEN(long, long) // NOLINT(runtime/int) -P_API_GEN(long long, longlong) // NOLINT(runtime/int) -P_API_GEN(unsigned char, uchar) -P_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -P_API_GEN(unsigned int, uint) -P_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -P_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_GET - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_API_GEN(float, float) -GET_API_GEN(double, double) -// GET_API_GEN(long double, longdouble) -GET_API_GEN(char, char) -GET_API_GEN(signed char, schar) -GET_API_GEN(short, short) // NOLINT(runtime/int) -GET_API_GEN(int, int) -GET_API_GEN(long, long) // NOLINT(runtime/int) -GET_API_GEN(long long, longlong) // NOLINT(runtime/int) -GET_API_GEN(unsigned char, uchar) -GET_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -GET_API_GEN(unsigned int, uint) -GET_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -GET_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_G - * @brief reads and returns single value from \p source at \p pe. - * The calling work-group/thread will block until the operation completes. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] source sourcen address. Must be an address on the symmetric - * heap. - * @param[in] pe PE of the remote process. - * - * @return the value read from remote \p source at \p pe. 
- */ -///@{ -G_API_GEN(float, float) -G_API_GEN(double, double) -// G_API_GEN(long double, longdouble) -G_API_GEN(char, char) -G_API_GEN(signed char, schar) -G_API_GEN(short, short) // NOLINT(runtime/int) -G_API_GEN(int, int) -G_API_GEN(long, long) // NOLINT(runtime/int) -G_API_GEN(long long, longlong) // NOLINT(runtime/int) -G_API_GEN(unsigned char, uchar) -G_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -G_API_GEN(unsigned int, uint) -G_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -G_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_PUT_NBI - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_NBI_API_GEN(float, float) -PUT_NBI_API_GEN(double, double) -// PUT_NBI_API_GEN(long double, longdouble) -PUT_NBI_API_GEN(char, char) -PUT_NBI_API_GEN(signed char, schar) -PUT_NBI_API_GEN(short, short) // NOLINT(runtime/int) -PUT_NBI_API_GEN(int, int) -PUT_NBI_API_GEN(long, long) // NOLINT(runtime/int) -PUT_NBI_API_GEN(long long, longlong) // NOLINT(runtime/int) -PUT_NBI_API_GEN(unsigned char, uchar) -PUT_NBI_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -PUT_NBI_API_GEN(unsigned int, uint) -PUT_NBI_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -PUT_NBI_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_GET_NBI - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller will - * return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_NBI_API_GEN(float, float) -GET_NBI_API_GEN(double, double) -// GET_NBI_API_GEN(long double, longdouble) -GET_NBI_API_GEN(char, char) -GET_NBI_API_GEN(signed char, schar) -GET_NBI_API_GEN(short, short) // NOLINT(runtime/int) -GET_NBI_API_GEN(int, int) -GET_NBI_API_GEN(long, long) // NOLINT(runtime/int) -GET_NBI_API_GEN(long long, longlong) // NOLINT(runtime/int) -GET_NBI_API_GEN(unsigned char, uchar) -GET_NBI_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -GET_NBI_API_GEN(unsigned int, uint) -GET_NBI_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -GET_NBI_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_ADD - * @brief Atomically add the value \p val to \p dest on \p pe. The operation - * returns the older value of \p dest to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return The old value of \p dest before the \p val was added. - */ -///@{ -ATOMIC_FETCH_ADD_API_GEN(int, int) -ATOMIC_FETCH_ADD_API_GEN(long, long) -ATOMIC_FETCH_ADD_API_GEN(long long, longlong) -ATOMIC_FETCH_ADD_API_GEN(unsigned int, uint) -ATOMIC_FETCH_ADD_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_ADD_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_ADD_API_GEN(int32_t, int32) -ATOMIC_FETCH_ADD_API_GEN(int64_t, int64) -ATOMIC_FETCH_ADD_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_ADD_API_GEN(uint64_t, uint64) -ATOMIC_FETCH_ADD_API_GEN(size_t, size) -ATOMIC_FETCH_ADD_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_COMPARE_SWAP - * @brief Atomically compares if the value in \p dest with \p cond is equal - * then put \p val in \p dest. 
The operation returns the older value of \p dest - * to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] cond The value to be compare with. - * @param[in] val The value to be atomically swapped. - * @param[in] pe PE of the remote process. - * - * @return The old value of \p dest. - */ -///@{ -ATOMIC_COMPARE_SWAP_API_GEN(int, int) -ATOMIC_COMPARE_SWAP_API_GEN(long, long) -ATOMIC_COMPARE_SWAP_API_GEN(long long, longlong) -ATOMIC_COMPARE_SWAP_API_GEN(unsigned int, uint) -ATOMIC_COMPARE_SWAP_API_GEN(unsigned long, ulong) -ATOMIC_COMPARE_SWAP_API_GEN(unsigned long long, ulonglong) -ATOMIC_COMPARE_SWAP_API_GEN(int32_t, int32) -ATOMIC_COMPARE_SWAP_API_GEN(int64_t, int64) -ATOMIC_COMPARE_SWAP_API_GEN(uint32_t, uint32) -ATOMIC_COMPARE_SWAP_API_GEN(uint64_t, uint64) -ATOMIC_COMPARE_SWAP_API_GEN(size_t, size) -ATOMIC_COMPARE_SWAP_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_INC - * @brief Atomically add 1 to \p dest on \p pe. The operation - * returns the older value of \p dest to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] pe PE of the remote process. - * - * @return The old value of \p dest before it was incremented by 1. 
- */ -///@{ -ATOMIC_FETCH_INC_API_GEN(int, int) -ATOMIC_FETCH_INC_API_GEN(long, long) -ATOMIC_FETCH_INC_API_GEN(long long, longlong) -ATOMIC_FETCH_INC_API_GEN(unsigned int, uint) -ATOMIC_FETCH_INC_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_INC_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_INC_API_GEN(int32_t, int32) -ATOMIC_FETCH_INC_API_GEN(int64_t, int64) -ATOMIC_FETCH_INC_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_INC_API_GEN(uint64_t, uint64) -ATOMIC_FETCH_INC_API_GEN(size_t, size) -ATOMIC_FETCH_INC_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH - * @brief Atomically return the value of \p dest to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return The value of \p dest. - */ -///@{ -ATOMIC_FETCH_API_GEN(float, float) -ATOMIC_FETCH_API_GEN(double, double) -ATOMIC_FETCH_API_GEN(int, int) -ATOMIC_FETCH_API_GEN(long, long) -ATOMIC_FETCH_API_GEN(long long, longlong) -ATOMIC_FETCH_API_GEN(unsigned int, uint) -ATOMIC_FETCH_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_API_GEN(int32_t, int32) -ATOMIC_FETCH_API_GEN(int64_t, int64) -ATOMIC_FETCH_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_API_GEN(uint64_t, uint64) -ATOMIC_FETCH_API_GEN(size_t, size) -ATOMIC_FETCH_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_ADD - * @brief Atomically add the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. 
Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_ADD_API_GEN(int, int) -ATOMIC_ADD_API_GEN(long, long) -ATOMIC_ADD_API_GEN(long long, longlong) -ATOMIC_ADD_API_GEN(unsigned int, uint) -ATOMIC_ADD_API_GEN(unsigned long, ulong) -ATOMIC_ADD_API_GEN(unsigned long long, ulonglong) -ATOMIC_ADD_API_GEN(int32_t, int32) -ATOMIC_ADD_API_GEN(int64_t, int64) -ATOMIC_ADD_API_GEN(uint32_t, uint32) -ATOMIC_ADD_API_GEN(uint64_t, uint64) -ATOMIC_ADD_API_GEN(size_t, size) -ATOMIC_ADD_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_SET - * @brief Atomically set the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_SET_API_GEN(float, float) -ATOMIC_SET_API_GEN(double, double) -ATOMIC_SET_API_GEN(int, int) -ATOMIC_SET_API_GEN(long, long) -ATOMIC_SET_API_GEN(long long, longlong) -ATOMIC_SET_API_GEN(unsigned int, uint) -ATOMIC_SET_API_GEN(unsigned long, ulong) -ATOMIC_SET_API_GEN(unsigned long long, ulonglong) -ATOMIC_SET_API_GEN(int32_t, int32) -ATOMIC_SET_API_GEN(int64_t, int64) -ATOMIC_SET_API_GEN(uint32_t, uint32) -ATOMIC_SET_API_GEN(uint64_t, uint64) -ATOMIC_SET_API_GEN(size_t, size) -ATOMIC_SET_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_SWAP - * @brief Atomically swap the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. 
- * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_SWAP_API_GEN(float, float) -ATOMIC_SWAP_API_GEN(double, double) -ATOMIC_SWAP_API_GEN(int, int) -ATOMIC_SWAP_API_GEN(long, long) -ATOMIC_SWAP_API_GEN(long long, longlong) -ATOMIC_SWAP_API_GEN(unsigned int, uint) -ATOMIC_SWAP_API_GEN(unsigned long, ulong) -ATOMIC_SWAP_API_GEN(unsigned long long, ulonglong) -ATOMIC_SWAP_API_GEN(int32_t, int32) -ATOMIC_SWAP_API_GEN(int64_t, int64) -ATOMIC_SWAP_API_GEN(uint32_t, uint32) -ATOMIC_SWAP_API_GEN(uint64_t, uint64) -ATOMIC_SWAP_API_GEN(size_t, size) -ATOMIC_SWAP_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_AND - * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_FETCH_AND_API_GEN(unsigned int, uint) -ATOMIC_FETCH_AND_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_AND_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_AND_API_GEN(int32_t, int32) -ATOMIC_FETCH_AND_API_GEN(int64_t, int64) -ATOMIC_FETCH_AND_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_AND_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_AND - * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. 
Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_AND_API_GEN(unsigned int, uint) -ATOMIC_AND_API_GEN(unsigned long, ulong) -ATOMIC_AND_API_GEN(unsigned long long, ulonglong) -ATOMIC_AND_API_GEN(int32_t, int32) -ATOMIC_AND_API_GEN(int64_t, int64) -ATOMIC_AND_API_GEN(uint32_t, uint32) -ATOMIC_AND_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_OR - * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_FETCH_OR_API_GEN(unsigned int, uint) -ATOMIC_FETCH_OR_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_OR_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_OR_API_GEN(int32_t, int32) -ATOMIC_FETCH_OR_API_GEN(int64_t, int64) -ATOMIC_FETCH_OR_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_OR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_OR - * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. 
- * - * @return void - */ -///@{ -ATOMIC_OR_API_GEN(unsigned int, uint) -ATOMIC_OR_API_GEN(unsigned long, ulong) -ATOMIC_OR_API_GEN(unsigned long long, ulonglong) -ATOMIC_OR_API_GEN(int32_t, int32) -ATOMIC_OR_API_GEN(int64_t, int64) -ATOMIC_OR_API_GEN(uint32_t, uint32) -ATOMIC_OR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_XOR - * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_FETCH_XOR_API_GEN(unsigned int, uint) -ATOMIC_FETCH_XOR_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_XOR_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_XOR_API_GEN(int32_t, int32) -ATOMIC_FETCH_XOR_API_GEN(int64_t, int64) -ATOMIC_FETCH_XOR_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_XOR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_XOR - * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. 
- * - * @return void - */ -///@{ -ATOMIC_XOR_API_GEN(unsigned int, uint) -ATOMIC_XOR_API_GEN(unsigned long, ulong) -ATOMIC_XOR_API_GEN(unsigned long long, ulonglong) -ATOMIC_XOR_API_GEN(int32_t, int32) -ATOMIC_XOR_API_GEN(int64_t, int64) -ATOMIC_XOR_API_GEN(uint32_t, uint32) -ATOMIC_XOR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_INC - * @brief Atomically add 1 to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_INC_API_GEN(int, int) -ATOMIC_INC_API_GEN(long, long) -ATOMIC_INC_API_GEN(long long, longlong) -ATOMIC_INC_API_GEN(unsigned int, uint) -ATOMIC_INC_API_GEN(unsigned long, ulong) -ATOMIC_INC_API_GEN(unsigned long long, ulonglong) -ATOMIC_INC_API_GEN(int32_t, int32) -ATOMIC_INC_API_GEN(int64_t, int64) -ATOMIC_INC_API_GEN(uint32_t, uint32) -ATOMIC_INC_API_GEN(uint64_t, uint64) -ATOMIC_INC_API_GEN(size_t, size) -ATOMIC_INC_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_WAIT_UNTIL - * @brief Block the caller until the condition (* \p ptr \p cmps \p val) is - * true. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ivars Pointer to memory on the symmetric heap to wait for. - * @param[in] cmp Operation for the comparison. - * @param[in] val Value to compare the memory at \p ptr to. 
- * - * @return void - */ -///@{ -WAIT_UNTIL_API_GEN(float, float) -WAIT_UNTIL_API_GEN(double, double) -// WAIT_UNTIL_API_GEN(long double, longdouble) -WAIT_UNTIL_API_GEN(char, char) -WAIT_UNTIL_API_GEN(signed char, schar) -WAIT_UNTIL_API_GEN(short, short) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(int, int) -WAIT_UNTIL_API_GEN(long, long) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(long long, longlong) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(unsigned char, uchar) -WAIT_UNTIL_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(unsigned int, uint) -WAIT_UNTIL_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_TEST - * @brief test if the condition (* \p ptr \p cmps \p val) is - * true. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ivars Pointer to memory on the symmetric heap to wait for. - * @param[in] cmp Operation for the comparison. - * @param[in] val Value to compare the memory at \p ptr to. 
- * - * @return 1 if the evaluation is true else 0 - */ -///@{ -TEST_API_GEN(float, float) -TEST_API_GEN(double, double) -// TEST_API_GEN(long double, longdouble) -TEST_API_GEN(char, char) -TEST_API_GEN(signed char, schar) -TEST_API_GEN(short, short) // NOLINT(runtime/int) -TEST_API_GEN(int, int) -TEST_API_GEN(long, long) // NOLINT(runtime/int) -TEST_API_GEN(long long, longlong) // NOLINT(runtime/int) -TEST_API_GEN(unsigned char, uchar) -TEST_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -TEST_API_GEN(unsigned int, uint) -TEST_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -TEST_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/****************************************************************************** - ***************************** API EXTENSIONS ********************************* - *****************************************************************************/ - -/* - * MACRO DECLARE SHMEM_PUT APIs - */ -#define PUT_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_GET APIs - */ -#define GET_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_PUT_NBI APIs - */ -#define PUT_NBI_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_nbi_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_nbi_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/* - * MACRO 
DECLARE SHMEM_GET_NBI APIs - */ -#define GET_NBI_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_nbi_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_nbi_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a wave must participate in the - * call using the same parameters. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_wave(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-workgroup - * (WG) granularity. However, all threads in the workgroup must participate in - * the call using the same parameters. 
- * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a wave must collectively participate - * in the call using the same arguments - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_API_EXT_GEN(wave, float, float) -PUT_API_EXT_GEN(wave, double, double) -// PUT_API_EXT_GEN(wave, long double, longdouble) -PUT_API_EXT_GEN(wave, char, char) -PUT_API_EXT_GEN(wave, signed char, schar) -PUT_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, int, int) -PUT_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, unsigned char, uchar) -PUT_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, unsigned int, uint) -PUT_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-workgroub - * (WG) granularity. However, All threads in a WG must collectively participate - * in the call using the same arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_API_EXT_GEN(wg, float, float) -PUT_API_EXT_GEN(wg, double, double) -// PUT_API_EXT_GEN(wg, long double, longdouble) -PUT_API_EXT_GEN(wg, char, char) -PUT_API_EXT_GEN(wg, signed char, schar) -PUT_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, int, int) -PUT_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, unsigned char, uchar) -PUT_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, unsigned int, uint) -PUT_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a the wave must participate in the - * call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_wave(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). 
- * - * This function can be called from divergent control paths at per-workgroup - * (WG) granularity. However, all threads in the workgroup must participate - * in the call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must participate in the - * call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_API_EXT_GEN(wave, float, float) -GET_API_EXT_GEN(wave, double, double) -// GET_API_EXT_GEN(wave, long double, longdouble) -GET_API_EXT_GEN(wave, char, char) -GET_API_EXT_GEN(wave, signed char, schar) -GET_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, int, int) -GET_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, unsigned char, uchar) -GET_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, unsigned int, uint) -GET_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the workgroup must participate in - * the call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_API_EXT_GEN(wg, float, float) -GET_API_EXT_GEN(wg, double, double) -// GET_API_EXT_GEN(wg, long double, longdouble) -GET_API_EXT_GEN(wg, char, char) -GET_API_EXT_GEN(wg, signed char, schar) -GET_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, int, int) -GET_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, unsigned char, uchar) -GET_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, unsigned int, uint) -GET_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a wave must call in with the same - * parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wave(void *dest, - const void *source, - size_t nelems, - int pe); - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. 
The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -///@{ -PUT_NBI_API_EXT_GEN(wave, float, float) -PUT_NBI_API_EXT_GEN(wave, double, double) -// PUT_NBI_API_EXT_GEN(wave, long double, longdouble) -PUT_NBI_API_EXT_GEN(wave, char, char) -PUT_NBI_API_EXT_GEN(wave, signed char, schar) -PUT_NBI_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, int, int) -PUT_NBI_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, unsigned char, uchar) -PUT_NBI_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, unsigned int, uint) -PUT_NBI_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT -///@} - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. 
However, all threads in a WG must call in with the same - * parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wg( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the WG must call in with the sameo - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_NBI_API_EXT_GEN(wg, float, float) -PUT_NBI_API_EXT_GEN(wg, double, double) -// PUT_NBI_API_EXT_GEN(wg, long double, longdouble) -PUT_NBI_API_EXT_GEN(wg, char, char) -PUT_NBI_API_EXT_GEN(wg, signed char, schar) -PUT_NBI_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, int, int) -PUT_NBI_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, unsigned char, uchar) -PUT_NBI_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, unsigned int, uint) -PUT_NBI_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wave(void *dest, - const void *source, - size_t nelems, - int pe); - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. 
The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -///@{ -GET_NBI_API_EXT_GEN(wave, float, float) -GET_NBI_API_EXT_GEN(wave, double, double) -// GET_NBI_API_EXT_GEN(wave, long double, longdouble) -GET_NBI_API_EXT_GEN(wave, char, char) -GET_NBI_API_EXT_GEN(wave, signed char, schar) -GET_NBI_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, int, int) -GET_NBI_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, unsigned char, uchar) -GET_NBI_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, unsigned int, uint) -GET_NBI_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT -///@} - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the WG must call in with the same - * arguments. 
- * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wg( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the WG must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_NBI_API_EXT_GEN(wg, float, float) -GET_NBI_API_EXT_GEN(wg, double, double) -// GET_NBI_API_EXT_GEN(wg, long double, longdouble) -GET_NBI_API_EXT_GEN(wg, char, char) -GET_NBI_API_EXT_GEN(wg, signed char, schar) -GET_NBI_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, int, int) -GET_NBI_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, unsigned char, uchar) -GET_NBI_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, unsigned int, uint) -GET_NBI_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - - -/* - * ROCSHMEM Signalling Operations - */ -#define PUTMEM_SIGNAL_DEC(SUFFIX) \ - __device__ ATTR_NO_INLINE void rocshmem_putmem_signal##SUFFIX(void *dest, \ - const void *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \ - void *dest, \ - const void *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); - -#define PUT_SIGNAL_TYPED_DEC(T, TNAME, SUFFIX) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_signal##SUFFIX(rocshmem_ctx_t ctx, \ - T *dest, \ - const T *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_signal##SUFFIX(T *dest, \ - const T *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); - -#define PUT_SIGNAL_DEC(SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(float, float, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(double, double, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(char, char, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(signed char, schar, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(short, short, 
SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(int, int, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(long, long, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(long long, longlong, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned char, uchar, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned short, ushort, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned int, uint, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned long, ulong, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned long long, ulonglong, SUFFIX) - -#define SIGNALING_API_DEC(SUFFIX) \ - PUTMEM_SIGNAL_DEC(SUFFIX) \ - PUT_SIGNAL_DEC(SUFFIX) - -SIGNALING_API_DEC() -SIGNALING_API_DEC(_wg) -SIGNALING_API_DEC(_wave) -SIGNALING_API_DEC(_nbi) -SIGNALING_API_DEC(_nbi_wg) -SIGNALING_API_DEC(_nbi_wave) - -__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr); -__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr); -__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr); - } // namespace rocshmem #endif // LIBRARY_INCLUDE_ROCSHMEM_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_AMO.hpp b/projects/rocshmem/include/rocshmem/rocshmem_AMO.hpp new file mode 100644 index 0000000000..76fc47d52b --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_AMO.hpp @@ -0,0 +1,1581 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP + +namespace rocshmem { + +/** + * @name SHMEM_ATOMIC_FETCH + * @brief Atomically return the value of \p source to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation + * (rocshmem_ctx_* variants only). + * @param[in] source Source address of the value to fetch. Must be an + * address on the symmetric heap. + * @param[in] pe PE of the remote process. + * + * @return The value of \p source.
+ */ +__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_fetch( + rocshmem_ctx_t ctx, float *source, int pe); +__device__ ATTR_NO_INLINE float rocshmem_float_atomic_fetch( + float *source, int pe); +__host__ float rocshmem_ctx_float_atomic_fetch( + rocshmem_ctx_t ctx, float *source, int pe); +__host__ float rocshmem_float_atomic_fetch( + float *source, int pe); + +__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_fetch( + rocshmem_ctx_t ctx, double *source, int pe); +__device__ ATTR_NO_INLINE double rocshmem_double_atomic_fetch( + double *source, int pe); +__host__ double rocshmem_ctx_double_atomic_fetch( + rocshmem_ctx_t ctx, double *source, int pe); +__host__ double rocshmem_double_atomic_fetch( + double *source, int pe); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch( + rocshmem_ctx_t ctx, int *source, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch( + int *source, int pe); +__host__ int rocshmem_ctx_int_atomic_fetch( + rocshmem_ctx_t ctx, int *source, int pe); +__host__ int rocshmem_int_atomic_fetch( + int *source, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch( + rocshmem_ctx_t ctx, long *source, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch( + long *source, int pe); +__host__ long rocshmem_ctx_long_atomic_fetch( + rocshmem_ctx_t ctx, long *source, int pe); +__host__ long rocshmem_long_atomic_fetch( + long *source, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch( + rocshmem_ctx_t ctx, long long *source, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch( + long long *source, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_fetch( + rocshmem_ctx_t ctx, long long *source, int pe); +__host__ long long rocshmem_longlong_atomic_fetch( + long long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch( + rocshmem_ctx_t ctx, unsigned int *source, int pe); 
+__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch( + unsigned int *source, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch( + rocshmem_ctx_t ctx, unsigned int *source, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch( + unsigned int *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch( + unsigned long *source, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long *source, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch( + unsigned long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch( + unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch( + unsigned long long *source, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch( + rocshmem_ctx_t ctx, int32_t *source, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch( + int32_t *source, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch( + rocshmem_ctx_t ctx, int32_t *source, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch( + int32_t *source, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch( + rocshmem_ctx_t ctx, int64_t *source, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch( + int64_t *source, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch( + rocshmem_ctx_t ctx, int64_t *source, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch( + int64_t *source, int pe); + +__device__ 
ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch( + rocshmem_ctx_t ctx, uint32_t *source, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch( + uint32_t *source, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch( + rocshmem_ctx_t ctx, uint32_t *source, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch( + uint32_t *source, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch( + rocshmem_ctx_t ctx, uint64_t *source, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch( + uint64_t *source, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch( + rocshmem_ctx_t ctx, uint64_t *source, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch( + uint64_t *source, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch( + rocshmem_ctx_t ctx, size_t *source, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch( + size_t *source, int pe); +__host__ size_t rocshmem_ctx_size_atomic_fetch( + rocshmem_ctx_t ctx, size_t *source, int pe); +__host__ size_t rocshmem_size_atomic_fetch( + size_t *source, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch( + rocshmem_ctx_t ctx, ptrdiff_t *source, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch( + ptrdiff_t *source, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch( + rocshmem_ctx_t ctx, ptrdiff_t *source, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch( + ptrdiff_t *source, int pe); + + +/** + * @name SHMEM_ATOMIC_SET + * @brief Atomically set \p dest on \p pe to \p value. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] value The value to be atomically written to \p dest.
+ * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_atomic_set( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_atomic_set( + float *dest, float value, int pe); +__host__ void rocshmem_ctx_float_atomic_set( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__host__ void rocshmem_float_atomic_set( + float *dest, float value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_atomic_set( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_atomic_set( + double *dest, double value, int pe); +__host__ void rocshmem_ctx_double_atomic_set( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__host__ void rocshmem_double_atomic_set( + double *dest, double value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_set( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_atomic_set( + int *dest, int value, int pe); +__host__ void rocshmem_ctx_int_atomic_set( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ void rocshmem_int_atomic_set( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_set( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_atomic_set( + long *dest, long value, int pe); +__host__ void rocshmem_ctx_long_atomic_set( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ void rocshmem_long_atomic_set( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_set( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_set( + long long *dest, long long value, int pe); +__host__ void rocshmem_ctx_longlong_atomic_set( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); 
+__host__ void rocshmem_longlong_atomic_set( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_set( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_set( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_set( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_set( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_set( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_set( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_set( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_set( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_set( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_set( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_set( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_set( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_set( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_set( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_set( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_set( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void 
rocshmem_ctx_int64_atomic_set( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_set( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_set( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_set( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_set( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_set( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_set( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_set( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_set( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_set( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_set( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_set( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_set( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_size_atomic_set( + size_t *dest, size_t value, int pe); +__host__ void rocshmem_ctx_size_atomic_set( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ void rocshmem_size_atomic_set( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_set( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_set( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ void rocshmem_ctx_ptrdiff_atomic_set( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); 
+__host__ void rocshmem_ptrdiff_atomic_set( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_COMPARE_SWAP + * @brief Atomically compares the value in \p dest with \p cond and, if they + * are equal, puts \p val in \p dest. The operation returns the older value of + * \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] cond The value to compare against. + * @param[in] val The value to be atomically swapped. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest. + */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_compare_swap( + rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_compare_swap( + int *dest, int cond, int value, int pe); +__host__ int rocshmem_ctx_int_atomic_compare_swap( + rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe); +__host__ int rocshmem_int_atomic_compare_swap( + int *dest, int cond, int value, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_compare_swap( + rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_compare_swap( + long *dest, long cond, long value, int pe); +__host__ long rocshmem_ctx_long_atomic_compare_swap( + rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe); +__host__ long rocshmem_long_atomic_compare_swap( + long *dest, long cond, long value, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_compare_swap( + rocshmem_ctx_t ctx, long long *dest, long long cond, long long value, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_compare_swap( + long long *dest, long long cond, long long
value, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_compare_swap( + rocshmem_ctx_t ctx, long long *dest, long long cond, long long value, int pe); +__host__ long long rocshmem_longlong_atomic_compare_swap( + long long *dest, long long cond, long long value, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_compare_swap( + unsigned int *dest, unsigned int cond, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_compare_swap( + unsigned int *dest, unsigned int cond, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_compare_swap( + unsigned long *dest, unsigned long cond, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_compare_swap( + unsigned long *dest, unsigned long cond, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_compare_swap( + unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned 
long long cond, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_compare_swap( + unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_compare_swap( + rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_compare_swap( + int32_t *dest, int32_t cond, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_compare_swap( + rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_compare_swap( + int32_t *dest, int32_t cond, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_compare_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_compare_swap( + int64_t *dest, int64_t cond, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_compare_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_compare_swap( + int64_t *dest, int64_t cond, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_compare_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_compare_swap( + uint32_t *dest, uint32_t cond, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_compare_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_compare_swap( + uint32_t *dest, uint32_t cond, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_compare_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t 
rocshmem_uint64_atomic_compare_swap( + uint64_t *dest, uint64_t cond, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_compare_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_compare_swap( + uint64_t *dest, uint64_t cond, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_compare_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_compare_swap( + size_t *dest, size_t cond, size_t value, int pe); +__host__ size_t rocshmem_ctx_size_atomic_compare_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe); +__host__ size_t rocshmem_size_atomic_compare_swap( + size_t *dest, size_t cond, size_t value, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap( + ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap( + ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_SWAP + * @brief Atomically swap the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically swapped in. + * @param[in] pe PE of the remote process.
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_swap( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__device__ ATTR_NO_INLINE float rocshmem_float_atomic_swap( + float *dest, float value, int pe); +__host__ float rocshmem_ctx_float_atomic_swap( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__host__ float rocshmem_float_atomic_swap( + float *dest, float value, int pe); + +__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_swap( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__device__ ATTR_NO_INLINE double rocshmem_double_atomic_swap( + double *dest, double value, int pe); +__host__ double rocshmem_ctx_double_atomic_swap( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__host__ double rocshmem_double_atomic_swap( + double *dest, double value, int pe); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_swap( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_swap( + int *dest, int value, int pe); +__host__ int rocshmem_ctx_int_atomic_swap( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ int rocshmem_int_atomic_swap( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_swap( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_swap( + long *dest, long value, int pe); +__host__ long rocshmem_ctx_long_atomic_swap( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ long rocshmem_long_atomic_swap( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_swap( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_swap( + long long *dest, long long value, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_swap( + rocshmem_ctx_t ctx, long long *dest, long long value, 
int pe); +__host__ long long rocshmem_longlong_atomic_swap( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_swap( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_swap( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_swap( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_swap( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_swap( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_swap( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_swap( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_swap( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_swap( + rocshmem_ctx_t ctx, int32_t *dest, 
int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_swap( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_swap( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_swap( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_swap( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_swap( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_swap( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_swap( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_swap( + size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_ctx_size_atomic_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_size_atomic_swap( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); 
+__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_swap( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_swap( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_INC + * @brief Atomically add 1 to \p dest on \p pe. The operation + * returns the older value of \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest before it was incremented by 1. + */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_inc( + int *dest, int pe); +__host__ int rocshmem_ctx_int_atomic_fetch_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__host__ int rocshmem_int_atomic_fetch_inc( + int *dest, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_inc( + long *dest, int pe); +__host__ long rocshmem_ctx_long_atomic_fetch_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__host__ long rocshmem_long_atomic_fetch_inc( + long *dest, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_inc( + long long *dest, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_fetch_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__host__ long long 
rocshmem_longlong_atomic_fetch_inc( + long long *dest, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_inc( + unsigned int *dest, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_inc( + unsigned int *dest, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_inc( + unsigned long *dest, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_inc( + unsigned long *dest, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_inc( + unsigned long long *dest, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_inc( + unsigned long long *dest, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_inc( + int32_t *dest, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_inc( + int32_t *dest, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__device__ ATTR_NO_INLINE int64_t 
rocshmem_int64_atomic_fetch_inc( + int64_t *dest, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_inc( + int64_t *dest, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_inc( + uint32_t *dest, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_inc( + uint32_t *dest, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_inc( + uint64_t *dest, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_inc( + uint64_t *dest, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_inc( + size_t *dest, int pe); +__host__ size_t rocshmem_ctx_size_atomic_fetch_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__host__ size_t rocshmem_size_atomic_fetch_inc( + size_t *dest, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc( + ptrdiff_t *dest, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc( + ptrdiff_t *dest, int pe); + + +/** + * @name SHMEM_ATOMIC_INC + * @brief Atomically add 1 to \p dest on \p pe. + * + * The operation is blocking. 
+ * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_atomic_inc( + int *dest, int pe); +__host__ void rocshmem_ctx_int_atomic_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__host__ void rocshmem_int_atomic_inc( + int *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_atomic_inc( + long *dest, int pe); +__host__ void rocshmem_ctx_long_atomic_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__host__ void rocshmem_long_atomic_inc( + long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_inc( + long long *dest, int pe); +__host__ void rocshmem_ctx_longlong_atomic_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__host__ void rocshmem_longlong_atomic_inc( + long long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_inc( + unsigned int *dest, int pe); +__host__ void rocshmem_ctx_uint_atomic_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__host__ void rocshmem_uint_atomic_inc( + unsigned int *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_inc( + rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_inc( + unsigned long *dest, int pe); +__host__ void rocshmem_ctx_ulong_atomic_inc( + 
rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__host__ void rocshmem_ulong_atomic_inc( + unsigned long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_inc( + unsigned long long *dest, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__host__ void rocshmem_ulonglong_atomic_inc( + unsigned long long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_inc( + int32_t *dest, int pe); +__host__ void rocshmem_ctx_int32_atomic_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__host__ void rocshmem_int32_atomic_inc( + int32_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_inc( + int64_t *dest, int pe); +__host__ void rocshmem_ctx_int64_atomic_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__host__ void rocshmem_int64_atomic_inc( + int64_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_inc( + uint32_t *dest, int pe); +__host__ void rocshmem_ctx_uint32_atomic_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__host__ void rocshmem_uint32_atomic_inc( + uint32_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_inc( + uint64_t *dest, int pe); +__host__ void rocshmem_ctx_uint64_atomic_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__host__ void rocshmem_uint64_atomic_inc( + uint64_t *dest, int pe); + +__device__ 
ATTR_NO_INLINE void rocshmem_ctx_size_atomic_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_size_atomic_inc( + size_t *dest, int pe); +__host__ void rocshmem_ctx_size_atomic_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__host__ void rocshmem_size_atomic_inc( + size_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_inc( + ptrdiff_t *dest, int pe); +__host__ void rocshmem_ctx_ptrdiff_atomic_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__host__ void rocshmem_ptrdiff_atomic_inc( + ptrdiff_t *dest, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_ADD + * @brief Atomically add the value \p val to \p dest on \p pe. The operation + * returns the older value of \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest before the \p val was added. 
+ */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_add( + int *dest, int value, int pe); +__host__ int rocshmem_ctx_int_atomic_fetch_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ int rocshmem_int_atomic_fetch_add( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_add( + long *dest, long value, int pe); +__host__ long rocshmem_ctx_long_atomic_fetch_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ long rocshmem_long_atomic_fetch_add( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_add( + long long *dest, long long value, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_fetch_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__host__ long long rocshmem_longlong_atomic_fetch_add( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_add( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_add( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned 
long rocshmem_ulong_atomic_fetch_add( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_add( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_add( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_add( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_add( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_add( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_add( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_add( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ 
ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_add( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_add( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_add( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_add( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_add( + size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_ctx_size_atomic_fetch_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_size_atomic_fetch_add( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_ADD + * @brief Atomically add the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. 
+ * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_atomic_add( + int *dest, int value, int pe); +__host__ void rocshmem_ctx_int_atomic_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ void rocshmem_int_atomic_add( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_atomic_add( + long *dest, long value, int pe); +__host__ void rocshmem_ctx_long_atomic_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ void rocshmem_long_atomic_add( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_add( + long long *dest, long long value, int pe); +__host__ void rocshmem_ctx_longlong_atomic_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__host__ void rocshmem_longlong_atomic_add( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_add( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_add( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_add( 
+ rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_add( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_add( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_add( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_add( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_add( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_add( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_add( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_add( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_add( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_add( + uint32_t *dest, uint32_t value, int pe); +__host__ void 
rocshmem_ctx_uint32_atomic_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_add( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_add( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_add( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_size_atomic_add( + size_t *dest, size_t value, int pe); +__host__ void rocshmem_ctx_size_atomic_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ void rocshmem_size_atomic_add( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ void rocshmem_ctx_ptrdiff_atomic_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ void rocshmem_ptrdiff_atomic_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_AND + * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to atomically bitwise-and into \p dest. + * @param[in] pe PE of the remote process.
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_and( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_and( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_and( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_and( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_and( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_and( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_and( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_and( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_and( + rocshmem_ctx_t ctx, int32_t 
*dest, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_and( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_and( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_and( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_and( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_and( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_and( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_and( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_and( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_and( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_AND + * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to atomically bitwise-and into \p dest.
+ * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_and( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_and( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_and( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_and( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_and( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_and( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_and( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_and( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_and( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_and( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_and( + 
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_and( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_and( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_and( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_and( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_and( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_and( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_and( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_OR + * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to atomically bitwise-or into \p dest. + * @param[in] pe PE of the remote process.
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_or( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_or( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_or( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_or( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_or( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_or( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_or( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t 
value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_or( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_or( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_or( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_or( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_or( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_or( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_or( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_OR + * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to atomically bitwise-or into \p dest. + * @param[in] pe PE of the remote process.
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_or( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_or( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_or( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_or( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_or( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_or( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_or( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_or( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ 
ATTR_NO_INLINE void rocshmem_int64_atomic_or( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_or( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_or( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_or( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_or( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_or( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_XOR + * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to atomically bitwise-xor into \p dest. + * @param[in] pe PE of the remote process.
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_xor( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_xor( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_xor( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_xor( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_xor( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_xor( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_xor( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_xor( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_xor( + rocshmem_ctx_t ctx, int32_t 
*dest, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_xor( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_xor( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_xor( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_xor( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_xor( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_xor( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_xor( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_xor( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_xor( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_XOR + * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to atomically bitwise-xor into \p dest.
+ * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_xor( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_xor( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_xor( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_xor( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_xor( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_xor( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_xor( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_xor( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_xor( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_xor( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_xor( + 
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_xor( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_xor( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_xor( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_xor( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_xor( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_xor( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_xor( + uint64_t *dest, uint64_t value, int pe); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp new file mode 100644 index 0000000000..c15498705d --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp @@ -0,0 +1,603 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP + +namespace rocshmem { + +/** + * @name SHMEM_ALLTOALL + * @brief Exchanges a fixed amount of contiguous data blocks between all pairs + * of PEs participating in the collective routine. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelems Number of data blocks transferred per pair of PEs. 
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, 
int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems); + + +/** + * @name SHMEM_BROADCAST + * @brief Perform a broadcast between PEs in the active set. The caller + * is blocked until the broadcast completes. + * + * This function must be called as a work-group collective. + * + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelems Size of the buffer to participate in the broadcast. + * @param[in] pe_root Zero-based ordinal of the PE, with respect to the + active set, from which the data is copied. + * @param[in] pe_start PE to start the broadcast. + * @param[in] log_pe_stride Stride of PEs participating in the broadcast. + * @param[in] pe_size Number of PEs participating in the broadcast. + * @param[in] p_sync Temporary sync buffer provided to ROCSHMEM. Must + be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. 
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_float_broadcast( + rocshmem_ctx_t ctx, float *dest, const float *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_float_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_double_broadcast( + rocshmem_ctx_t ctx, double *dest, const double *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_double_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_char_broadcast( + rocshmem_ctx_t ctx, char *dest, const char *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_char_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_schar_broadcast( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void 
rocshmem_ctx_schar_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_short_broadcast( + rocshmem_ctx_t ctx, short *dest, const short *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_short_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_int_broadcast( + rocshmem_ctx_t ctx, int *dest, const int *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_int_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_long_broadcast( + rocshmem_ctx_t ctx, long *dest, const long *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_long_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_longlong_broadcast( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + int nelems, int pe_root, int 
pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_longlong_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_uchar_broadcast( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_uchar_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_ushort_broadcast( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_ushort_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_uint_broadcast( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_uint_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast( + rocshmem_ctx_t ctx, 
rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_ulong_broadcast( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_ulong_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_ulonglong_broadcast( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_ulonglong_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems, int pe_root); + + +/** + * @name SHMEM_FCOLLECT + * @brief Concatenates blocks of data from multiple PEs to an array in every + * PE participating in the collective routine. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelems Number of data blocks in source array. 
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, 
int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems); + + +/** + * @name SHMEM_REDUCTIONS + * @brief Perform an allreduce between PEs in the active set. The caller + * is blocked until the reduction completes. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nreduce Size of the buffer to participate in the reduction. + * + * @return int (Zero on successful local completion. Nonzero otherwise.) + */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, 
+ int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_prod_reduce( + rocshmem_ctx_t ctx, 
rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int 
rocshmem_ctx_long_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE 
int rocshmem_ctx_longlong_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); +__host__ int rocshmem_ctx_float_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); +__host__ int rocshmem_ctx_float_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float 
*source, + int nreduce); +__host__ int rocshmem_ctx_float_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); +__host__ int rocshmem_ctx_float_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_P2P_SYNC.hpp b/projects/rocshmem/include/rocshmem/rocshmem_P2P_SYNC.hpp new file mode 100644 index 0000000000..8ebe9b0390 --- /dev/null +++ 
b/projects/rocshmem/include/rocshmem/rocshmem_P2P_SYNC.hpp @@ -0,0 +1,662 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP + +namespace rocshmem { + +/** + * @name SHMEM_WAIT_UNTIL + * @brief Block the caller until the condition (* \p ivars \p cmp \p val) is + * true. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ivars Pointer to memory on the symmetric heap to wait for. 
+ * @param[in] cmp Operation for the comparison. + * @param[in] val Value to compare the memory at \p ivars to. + * + * @return void + */ +__device__ void rocshmem_float_wait_until( + float *ivars, int cmp, float val); +__device__ size_t rocshmem_float_wait_until_any( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ void rocshmem_float_wait_until_all( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ size_t rocshmem_float_wait_until_some( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); +__device__ size_t rocshmem_float_wait_until_any_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ void rocshmem_float_wait_until_all_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ size_t rocshmem_float_wait_until_some_vector( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); +__host__ void rocshmem_float_wait_until( + float *ivars, int cmp, float val); +__host__ size_t rocshmem_float_wait_until_any( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ void rocshmem_float_wait_until_all( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ size_t rocshmem_float_wait_until_some( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); +__host__ size_t rocshmem_float_wait_until_any_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ void rocshmem_float_wait_until_all_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ size_t rocshmem_float_wait_until_some_vector( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); + +__device__ void rocshmem_double_wait_until( + double *ivars, int cmp, double val); +__device__ size_t 
rocshmem_double_wait_until_any( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ void rocshmem_double_wait_until_all( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ size_t rocshmem_double_wait_until_some( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); +__device__ size_t rocshmem_double_wait_until_any_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ void rocshmem_double_wait_until_all_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ size_t rocshmem_double_wait_until_some_vector( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); +__host__ void rocshmem_double_wait_until( + double *ivars, int cmp, double val); +__host__ size_t rocshmem_double_wait_until_any( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ void rocshmem_double_wait_until_all( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ size_t rocshmem_double_wait_until_some( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); +__host__ size_t rocshmem_double_wait_until_any_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ void rocshmem_double_wait_until_all_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ size_t rocshmem_double_wait_until_some_vector( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); + +__device__ void rocshmem_char_wait_until( + char *ivars, int cmp, char val); +__device__ size_t rocshmem_char_wait_until_any( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ void rocshmem_char_wait_until_all( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ 
size_t rocshmem_char_wait_until_some( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); +__device__ size_t rocshmem_char_wait_until_any_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ void rocshmem_char_wait_until_all_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ size_t rocshmem_char_wait_until_some_vector( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); +__host__ void rocshmem_char_wait_until( + char *ivars, int cmp, char val); +__host__ size_t rocshmem_char_wait_until_any( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ void rocshmem_char_wait_until_all( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ size_t rocshmem_char_wait_until_some( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); +__host__ size_t rocshmem_char_wait_until_any_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ void rocshmem_char_wait_until_all_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ size_t rocshmem_char_wait_until_some_vector( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); + +__device__ void rocshmem_schar_wait_until( + signed char *ivars, int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_any( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__device__ void rocshmem_schar_wait_until_all( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_some( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_any_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, 
signed char val); +__device__ void rocshmem_schar_wait_until_all_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_some_vector( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); +__host__ void rocshmem_schar_wait_until( + signed char *ivars, int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_any( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ void rocshmem_schar_wait_until_all( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_some( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_any_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ void rocshmem_schar_wait_until_all_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_some_vector( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); + +__device__ void rocshmem_short_wait_until( + short *ivars, int cmp, short val); +__device__ size_t rocshmem_short_wait_until_any( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ void rocshmem_short_wait_until_all( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ size_t rocshmem_short_wait_until_some( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); +__device__ size_t rocshmem_short_wait_until_any_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ void rocshmem_short_wait_until_all_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ 
size_t rocshmem_short_wait_until_some_vector( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); +__host__ void rocshmem_short_wait_until( + short *ivars, int cmp, short val); +__host__ size_t rocshmem_short_wait_until_any( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ void rocshmem_short_wait_until_all( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ size_t rocshmem_short_wait_until_some( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); +__host__ size_t rocshmem_short_wait_until_any_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ void rocshmem_short_wait_until_all_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ size_t rocshmem_short_wait_until_some_vector( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); + +__device__ void rocshmem_int_wait_until( + int *ivars, int cmp, int val); +__device__ size_t rocshmem_int_wait_until_any( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ void rocshmem_int_wait_until_all( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ size_t rocshmem_int_wait_until_some( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); +__device__ size_t rocshmem_int_wait_until_any_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ void rocshmem_int_wait_until_all_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ size_t rocshmem_int_wait_until_some_vector( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); +__host__ void rocshmem_int_wait_until( + int *ivars, int cmp, int val); +__host__ size_t rocshmem_int_wait_until_any( + int *ivars, size_t nelems, const int* status, + int 
cmp, int val); +__host__ void rocshmem_int_wait_until_all( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__host__ size_t rocshmem_int_wait_until_some( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); +__host__ size_t rocshmem_int_wait_until_any_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__host__ void rocshmem_int_wait_until_all_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__host__ size_t rocshmem_int_wait_until_some_vector( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); + +__device__ void rocshmem_long_wait_until( + long *ivars, int cmp, long val); +__device__ size_t rocshmem_long_wait_until_any( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ void rocshmem_long_wait_until_all( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ size_t rocshmem_long_wait_until_some( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); +__device__ size_t rocshmem_long_wait_until_any_vector( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ void rocshmem_long_wait_until_all_vector( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ size_t rocshmem_long_wait_until_some_vector( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); +__host__ void rocshmem_long_wait_until( + long *ivars, int cmp, long val); +__host__ size_t rocshmem_long_wait_until_any( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ void rocshmem_long_wait_until_all( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ size_t rocshmem_long_wait_until_some( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); +__host__ size_t rocshmem_long_wait_until_any_vector( + 
long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ void rocshmem_long_wait_until_all_vector( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ size_t rocshmem_long_wait_until_some_vector( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); + +__device__ void rocshmem_longlong_wait_until( + long long *ivars, int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_any( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ void rocshmem_longlong_wait_until_all( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_some( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_any_vector( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ void rocshmem_longlong_wait_until_all_vector( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_some_vector( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); +__host__ void rocshmem_longlong_wait_until( + long long *ivars, int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_any( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ void rocshmem_longlong_wait_until_all( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_some( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_any_vector( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ void rocshmem_longlong_wait_until_all_vector( + long long 
*ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_some_vector( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); + +__device__ void rocshmem_uchar_wait_until( + unsigned char *ivars, int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_any( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ void rocshmem_uchar_wait_until_all( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_some( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_any_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ void rocshmem_uchar_wait_until_all_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_some_vector( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); +__host__ void rocshmem_uchar_wait_until( + unsigned char *ivars, int cmp, unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_any( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__host__ void rocshmem_uchar_wait_until_all( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_some( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_any_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__host__ void rocshmem_uchar_wait_until_all_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, 
unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_some_vector( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); + +__device__ void rocshmem_ushort_wait_until( + unsigned short *ivars, int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_any( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ void rocshmem_ushort_wait_until_all( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_some( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_any_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ void rocshmem_ushort_wait_until_all_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_some_vector( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); +__host__ void rocshmem_ushort_wait_until( + unsigned short *ivars, int cmp, unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_any( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__host__ void rocshmem_ushort_wait_until_all( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_some( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_any_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__host__ void rocshmem_ushort_wait_until_all_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, 
unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_some_vector( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); + +__device__ void rocshmem_uint_wait_until( + unsigned int *ivars, int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_any( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ void rocshmem_uint_wait_until_all( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_some( + unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_any_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ void rocshmem_uint_wait_until_all_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_some_vector( + unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); +__host__ void rocshmem_uint_wait_until( + unsigned int *ivars, int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_any( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ void rocshmem_uint_wait_until_all( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_some( + unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_any_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ void rocshmem_uint_wait_until_all_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_some_vector( + 
unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); + +__device__ void rocshmem_ulong_wait_until( + unsigned long *ivars, int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_any( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ void rocshmem_ulong_wait_until_all( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_some( + unsigned long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_any_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ void rocshmem_ulong_wait_until_all_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_some_vector( + unsigned long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long val); +__host__ void rocshmem_ulong_wait_until( + unsigned long *ivars, int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_any( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ void rocshmem_ulong_wait_until_all( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_some( + unsigned long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_any_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ void rocshmem_ulong_wait_until_all_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_some_vector( + unsigned long *ivars, size_t nelems, size_t* 
indices, const int* status, + int cmp, unsigned long val); + +__device__ void rocshmem_ulonglong_wait_until( + unsigned long long *ivars, int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_any( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ void rocshmem_ulonglong_wait_until_all( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_some( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_any_vector( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ void rocshmem_ulonglong_wait_until_all_vector( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_some_vector( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); +__host__ void rocshmem_ulonglong_wait_until( + unsigned long long *ivars, int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_any( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__host__ void rocshmem_ulonglong_wait_until_all( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_some( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_any_vector( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__host__ void rocshmem_ulonglong_wait_until_all_vector( + unsigned long long *ivars, size_t nelems, const int* status, 
+ int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_some_vector( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); + + +/** + * @name SHMEM_TEST + * @brief Test if the condition (* \p ivars \p cmp \p val) is + * true. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ivars Pointer to memory on the symmetric heap to test. + * @param[in] cmp Operation for the comparison. + * @param[in] val Value to compare the memory at \p ivars to. + * + * @return 1 if the evaluation is true else 0 + */ +__device__ int rocshmem_float_test( + float *ivars, int cmp, float val); +__host__ int rocshmem_float_test( + float *ivars, int cmp, float val); + +__device__ int rocshmem_double_test( + double *ivars, int cmp, double val); +__host__ int rocshmem_double_test( + double *ivars, int cmp, double val); + +__device__ int rocshmem_char_test( + char *ivars, int cmp, char val); +__host__ int rocshmem_char_test( + char *ivars, int cmp, char val); + +__device__ int rocshmem_schar_test( + signed char *ivars, int cmp, signed char val); +__host__ int rocshmem_schar_test( + signed char *ivars, int cmp, signed char val); + +__device__ int rocshmem_short_test( + short *ivars, int cmp, short val); +__host__ int rocshmem_short_test( + short *ivars, int cmp, short val); + +__device__ int rocshmem_int_test( + int *ivars, int cmp, int val); +__host__ int rocshmem_int_test( + int *ivars, int cmp, int val); + +__device__ int rocshmem_long_test( + long *ivars, int cmp, long val); +__host__ int rocshmem_long_test( + long *ivars, int cmp, long val); + +__device__ int rocshmem_longlong_test( + long long *ivars, int cmp, long long val); +__host__ int rocshmem_longlong_test( + long long *ivars, int 
cmp, long long val); + +__device__ int rocshmem_uchar_test( + unsigned char *ivars, int cmp, unsigned char val); +__host__ int rocshmem_uchar_test( + unsigned char *ivars, int cmp, unsigned char val); + +__device__ int rocshmem_ushort_test( + unsigned short *ivars, int cmp, unsigned short val); +__host__ int rocshmem_ushort_test( + unsigned short *ivars, int cmp, unsigned short val); + +__device__ int rocshmem_uint_test( + unsigned int *ivars, int cmp, unsigned int val); +__host__ int rocshmem_uint_test( + unsigned int *ivars, int cmp, unsigned int val); + +__device__ int rocshmem_ulong_test( + unsigned long *ivars, int cmp, unsigned long val); +__host__ int rocshmem_ulong_test( + unsigned long *ivars, int cmp, unsigned long val); + +__device__ int rocshmem_ulonglong_test( + unsigned long long *ivars, int cmp, unsigned long long val); +__host__ int rocshmem_ulonglong_test( + unsigned long long *ivars, int cmp, unsigned long long val); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_RMA.hpp b/projects/rocshmem/include/rocshmem/rocshmem_RMA.hpp new file mode 100644 index 0000000000..ccd71f061b --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_RMA.hpp @@ -0,0 +1,1208 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP + +namespace rocshmem { + +/** + * @name SHMEM_PUT + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. 
Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_put( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_put(float *dest, + const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_put( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_put(double *dest, + const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_put( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_put(char *dest, + const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_put( + rocshmem_ctx_t ctx, 
signed char *dest, const signed char *source, + size_t nelems, int pe); +__host__ void rocshmem_schar_put(signed char *dest, + const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_put( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_put(short *dest, + const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_put( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_put(int *dest, + const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_put( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_put(long *dest, + const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_put( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_put(long long 
*dest, + const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_put( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_put(unsigned char *dest, + const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_put( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_put(unsigned short *dest, + const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_put( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_put(unsigned int *dest, + const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulong_put( + rocshmem_ctx_t ctx, 
unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_put(unsigned long *dest, + const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_put( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_put(unsigned long long *dest, + const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem(void *dest, const void *source, + size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into __host__ rocshmem_quiet() if remote completion is required. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, int pe); + +__host__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, + int pe); + + +/** + * @name SHMEM_P + * @brief Writes a single \p value to \p dest on PE \p pe. + * The caller must call into rocshmem_quiet() if remote completion is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] value Value to write to dest at \p pe. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_p( + rocshmem_ctx_t ctx, float *dest, float value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_p( + float *dest, float value, int pe); +__host__ void rocshmem_ctx_float_p( + rocshmem_ctx_t ctx, float *dest, float value, + int pe); +__host__ void rocshmem_float_p( + float *dest, float value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_p( + rocshmem_ctx_t ctx, double *dest, double value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_p( + double *dest, double value, int pe); +__host__ void rocshmem_ctx_double_p( + rocshmem_ctx_t ctx, double *dest, double value, + int pe); +__host__ void rocshmem_double_p( + double *dest, double value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_p( + rocshmem_ctx_t ctx, char *dest, char value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_p( + char *dest, char value, int pe); +__host__ void rocshmem_ctx_char_p( + rocshmem_ctx_t ctx, char *dest, char value, + int pe); +__host__ void rocshmem_char_p( + char *dest, char value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_p( + rocshmem_ctx_t ctx, signed char *dest, signed char value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_p( + signed char *dest, signed char value, int pe); +__host__ void rocshmem_ctx_schar_p( + rocshmem_ctx_t ctx, signed char *dest, signed char value, + int pe); +__host__ void rocshmem_schar_p( + signed char *dest, signed char value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_p( + rocshmem_ctx_t ctx, short *dest, short value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_p( + short *dest, short value, int pe); +__host__ void rocshmem_ctx_short_p( + rocshmem_ctx_t ctx, short *dest, short value, + int pe); +__host__ void rocshmem_short_p( + short *dest, short value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_p( + rocshmem_ctx_t ctx, int *dest, int value, + int pe); 
+__device__ ATTR_NO_INLINE void rocshmem_int_p( + int *dest, int value, int pe); +__host__ void rocshmem_ctx_int_p( + rocshmem_ctx_t ctx, int *dest, int value, + int pe); +__host__ void rocshmem_int_p( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_p( + rocshmem_ctx_t ctx, long *dest, long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_p( + long *dest, long value, int pe); +__host__ void rocshmem_ctx_long_p( + rocshmem_ctx_t ctx, long *dest, long value, + int pe); +__host__ void rocshmem_long_p( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_p( + rocshmem_ctx_t ctx, long long *dest, long long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_p( + long long *dest, long long value, int pe); +__host__ void rocshmem_ctx_longlong_p( + rocshmem_ctx_t ctx, long long *dest, long long value, + int pe); +__host__ void rocshmem_longlong_p( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_p( + rocshmem_ctx_t ctx, unsigned char *dest, unsigned char value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_p( + unsigned char *dest, unsigned char value, int pe); +__host__ void rocshmem_ctx_uchar_p( + rocshmem_ctx_t ctx, unsigned char *dest, unsigned char value, + int pe); +__host__ void rocshmem_uchar_p( + unsigned char *dest, unsigned char value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_p( + rocshmem_ctx_t ctx, unsigned short *dest, unsigned short value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_p( + unsigned short *dest, unsigned short value, int pe); +__host__ void rocshmem_ctx_ushort_p( + rocshmem_ctx_t ctx, unsigned short *dest, unsigned short value, + int pe); +__host__ void rocshmem_ushort_p( + unsigned short *dest, unsigned short value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_p( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, + 
int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_p( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_p( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, + int pe); +__host__ void rocshmem_uint_p( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_p( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_p( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_p( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, + int pe); +__host__ void rocshmem_ulong_p( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_p( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_p( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_p( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, + int pe); +__host__ void rocshmem_ulonglong_p( + unsigned long long *dest, unsigned long long value, int pe); + + +/** + * @name SHMEM_GET + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. 
+ * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_get( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_get(float *dest, + const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_get( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_get(double *dest, + const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_get( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_get(char *dest, + const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_get( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__host__ void rocshmem_schar_get(signed char *dest, + const signed
char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_get( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_get(short *dest, + const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_get( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_get(int *dest, + const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_get( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_get(long *dest, + const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_get( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_get(long long *dest, + const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get( + rocshmem_ctx_t ctx, 
unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_get( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_get(unsigned char *dest, + const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_get( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_get(unsigned short *dest, + const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_get( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_get(unsigned int *dest, + const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulong_get( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_get(unsigned long *dest, + const 
unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_get( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_get(unsigned long long *dest, + const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem(void *dest, const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). 
+ * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, int pe); + +__host__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, + int pe); + + +/** + * @name SHMEM_G + * @brief reads and returns single value from \p source at \p pe. + * The calling work-group/thread will block until the operation completes. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] source Source address. Must be an address on the symmetric + * heap. + * @param[in] pe PE of the remote process. + * + * @return the value read from remote \p source at \p pe. 
+ */ +__device__ ATTR_NO_INLINE float rocshmem_ctx_float_g( + rocshmem_ctx_t ctx, const float *source, int pe); +__device__ ATTR_NO_INLINE float rocshmem_float_g( + const float *source, int pe); +__host__ float rocshmem_ctx_float_g( + rocshmem_ctx_t ctx, const float *source, int pe); +__host__ float rocshmem_float_g( + const float *source, int pe); + +__device__ ATTR_NO_INLINE double rocshmem_ctx_double_g( + rocshmem_ctx_t ctx, const double *source, int pe); +__device__ ATTR_NO_INLINE double rocshmem_double_g( + const double *source, int pe); +__host__ double rocshmem_ctx_double_g( + rocshmem_ctx_t ctx, const double *source, int pe); +__host__ double rocshmem_double_g( + const double *source, int pe); + +__device__ ATTR_NO_INLINE char rocshmem_ctx_char_g( + rocshmem_ctx_t ctx, const char *source, int pe); +__device__ ATTR_NO_INLINE char rocshmem_char_g( + const char *source, int pe); +__host__ char rocshmem_ctx_char_g( + rocshmem_ctx_t ctx, const char *source, int pe); +__host__ char rocshmem_char_g( + const char *source, int pe); + +__device__ ATTR_NO_INLINE signed char rocshmem_ctx_schar_g( + rocshmem_ctx_t ctx, const signed char *source, int pe); +__device__ ATTR_NO_INLINE signed char rocshmem_schar_g( + const signed char *source, int pe); +__host__ signed char rocshmem_ctx_schar_g( + rocshmem_ctx_t ctx, const signed char *source, int pe); +__host__ signed char rocshmem_schar_g( + const signed char *source, int pe); + +__device__ ATTR_NO_INLINE short rocshmem_ctx_short_g( + rocshmem_ctx_t ctx, const short *source, int pe); +__device__ ATTR_NO_INLINE short rocshmem_short_g( + const short *source, int pe); +__host__ short rocshmem_ctx_short_g( + rocshmem_ctx_t ctx, const short *source, int pe); +__host__ short rocshmem_short_g( + const short *source, int pe); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_g( + rocshmem_ctx_t ctx, const int *source, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_g( + const int *source, int pe); +__host__ int 
rocshmem_ctx_int_g( + rocshmem_ctx_t ctx, const int *source, int pe); +__host__ int rocshmem_int_g( + const int *source, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_g( + rocshmem_ctx_t ctx, const long *source, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_g( + const long *source, int pe); +__host__ long rocshmem_ctx_long_g( + rocshmem_ctx_t ctx, const long *source, int pe); +__host__ long rocshmem_long_g( + const long *source, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_g( + rocshmem_ctx_t ctx, const long long *source, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_g( + const long long *source, int pe); +__host__ long long rocshmem_ctx_longlong_g( + rocshmem_ctx_t ctx, const long long *source, int pe); +__host__ long long rocshmem_longlong_g( + const long long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned char rocshmem_ctx_uchar_g( + rocshmem_ctx_t ctx, const unsigned char *source, int pe); +__device__ ATTR_NO_INLINE unsigned char rocshmem_uchar_g( + const unsigned char *source, int pe); +__host__ unsigned char rocshmem_ctx_uchar_g( + rocshmem_ctx_t ctx, const unsigned char *source, int pe); +__host__ unsigned char rocshmem_uchar_g( + const unsigned char *source, int pe); + +__device__ ATTR_NO_INLINE unsigned short rocshmem_ctx_ushort_g( + rocshmem_ctx_t ctx, const unsigned short *source, int pe); +__device__ ATTR_NO_INLINE unsigned short rocshmem_ushort_g( + const unsigned short *source, int pe); +__host__ unsigned short rocshmem_ctx_ushort_g( + rocshmem_ctx_t ctx, const unsigned short *source, int pe); +__host__ unsigned short rocshmem_ushort_g( + const unsigned short *source, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_g( + rocshmem_ctx_t ctx, const unsigned int *source, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_g( + const unsigned int *source, int pe); +__host__ unsigned int rocshmem_ctx_uint_g( + rocshmem_ctx_t ctx, const unsigned int 
*source, int pe); +__host__ unsigned int rocshmem_uint_g( + const unsigned int *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_g( + rocshmem_ctx_t ctx, const unsigned long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_g( + const unsigned long *source, int pe); +__host__ unsigned long rocshmem_ctx_ulong_g( + rocshmem_ctx_t ctx, const unsigned long *source, int pe); +__host__ unsigned long rocshmem_ulong_g( + const unsigned long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_g( + rocshmem_ctx_t ctx, const unsigned long long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_g( + const unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_g( + rocshmem_ctx_t ctx, const unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ulonglong_g( + const unsigned long long *source, int pe); + + +/** + * @name SHMEM_PUT_NBI + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_put_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_put_nbi( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_put_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_put_nbi( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_put_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_put_nbi( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_put_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__host__ void rocshmem_schar_put_nbi( + signed char *dest, const signed char *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_put_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_put_nbi( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_put_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_put_nbi( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_put_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_put_nbi( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_put_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_put_nbi( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void 
rocshmem_ctx_uchar_put_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_put_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_put_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_put_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_put_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_put_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_put_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulong_put_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const 
unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_put_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_put_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_put_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * __host__ rocshmem_quiet() if completion notification is required. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe); + +__host__ void rocshmem_putmem_nbi(void *dest, const void *source, + size_t nelems, int pe); + + +/** + * @name SHMEM_GET_NBI + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller will + * return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_get_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_get_nbi(float *dest, + const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_get_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_get_nbi(double *dest, + const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_get_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_get_nbi(char *dest, + const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_get_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe);
+__host__ void rocshmem_schar_get_nbi(signed char *dest, + const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_get_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_get_nbi(short *dest, + const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_get_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_get_nbi(int *dest, + const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_get_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_get_nbi(long *dest, + const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_get_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_get_nbi(long long *dest, 
+ const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_get_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_get_nbi(unsigned char *dest, + const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_get_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_get_nbi(unsigned short *dest, + const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_get_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_get_nbi(unsigned int *dest, + const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void 
rocshmem_ctx_ulong_get_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_get_nbi(unsigned long *dest, + const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_get_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_get_nbi(unsigned long long *dest, + const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller will + * return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller will + * return as soon as the request is posted. The caller must call + * __host__ rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe); + +__host__ void rocshmem_getmem_nbi(void *dest, const void *source, + size_t nelems, int pe); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_RMA_X.hpp b/projects/rocshmem/include/rocshmem/rocshmem_RMA_X.hpp new file mode 100644 index 0000000000..34b9185e9d --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_RMA_X.hpp @@ -0,0 +1,1036 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP + +namespace rocshmem { + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in a wave must collectively participate + * in the call using the same arguments + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. 
Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_wave( + long *dest, const long *source, size_t 
nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest at \p pe. 
The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-workgroup + * (WG) granularity. However, all threads in a WG must collectively participate + * in the call using the same arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void
rocshmem_short_put_wg( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_wg( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_wg( + unsigned long *dest, const unsigned long *source, 
size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in a wave must participate in the + * call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_wave(void *dest, + const void *source, + size_t nelems, int pe); + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-workgroup + * (WG) granularity. However, all threads in the workgroup must participate in + * the call using the same parameters.
+ * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must participate in the + * call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_wave( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); 
+__device__ ATTR_NO_INLINE void rocshmem_longlong_get_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-workgroup + * granularity. 
However, all threads in the workgroup must participate in + * the call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_wg( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_wg( + int *dest, const int *source, size_t
nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_wg( + unsigned long long *dest, const unsigned long long 
*source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling wave will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must participate in the + * call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_wave(void *dest, + const void *source, + size_t nelems, int pe); + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-workgroup + * (WG) granularity. However, all threads in the workgroup must participate + * in the call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi_wave( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi_wave( + rocshmem_ctx_t ctx, long 
long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. 
The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in the WG must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi_wg( + short *dest, const short *source, size_t nelems, int pe); +
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi_wg( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in a wave must call in with the same + * parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wave(void *dest, + const void *source, + size_t nelems, + int pe); + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in a WG must call in with the same + * parameters. + * + * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi_wave( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi_wave( + rocshmem_ctx_t ctx, long 
long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. 
The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in the WG must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi_wg( + short *dest, const short *source, size_t nelems, int pe); +
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi_wg( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wave(void *dest, + const void *source, + size_t nelems, + int pe); + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in the WG must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. 
+ * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_SIG_OP.hpp b/projects/rocshmem/include/rocshmem/rocshmem_SIG_OP.hpp new file mode 100644 index 0000000000..00ad57a10b --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_SIG_OP.hpp @@ -0,0 +1,623 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP + +namespace rocshmem { +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal( + 
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int 
pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ 
ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg( + short *dest, const short *source, size_t 
nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int 
pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ 
ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave( + rocshmem_ctx_t ctx, long *dest, const long 
*source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const 
unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t 
*sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi( 
+ long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE 
void rocshmem_ulonglong_put_signal_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg( + signed char *dest, const 
signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + 
uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const 
void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + 
uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + 
uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + + +__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr); +__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr); +__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_common.hpp b/projects/rocshmem/include/rocshmem/rocshmem_common.hpp new file mode 100644 index 0000000000..baea438244 --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_common.hpp @@ -0,0 +1,130 @@ 
+/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP +/* NOTE(review): this header has no #include of its own, yet it uses size_t, uint64_t and __constant__; whoever includes it must make those available first. */ +namespace rocshmem { +/* ATTR_NO_INLINE becomes __attribute__((noinline)) when USE_FUNC_CALL is defined before this point and expands to nothing otherwise (USE_FUNC_CALL presumably comes from rocshmem_config.h - TODO confirm). Being a macro, it is not scoped by the namespace. */ +#ifdef USE_FUNC_CALL +#define ATTR_NO_INLINE __attribute__((noinline)) +#else +#define ATTR_NO_INLINE +#endif + +/** @brief Result codes: ROCSHMEM_SUCCESS is 0, ROCSHMEM_ERROR is 1. */ +enum ROCSHMEM_STATUS { + ROCSHMEM_SUCCESS = 0, + ROCSHMEM_ERROR = 1, +}; +/** @brief Operation selectors with implicit values 0 (ROCSHMEM_SUM) to 7 (ROCSHMEM_REPLACE); presumably reduction/atomic operators - TODO confirm against callers. */ +enum ROCSHMEM_OP { + ROCSHMEM_SUM, + ROCSHMEM_MAX, + ROCSHMEM_MIN, + ROCSHMEM_PROD, + ROCSHMEM_AND, + ROCSHMEM_OR, + ROCSHMEM_XOR, + ROCSHMEM_REPLACE +}; +/** @brief Signal update modes with implicit values SET = 0, ADD = 1; presumably the values carried by the int sig_op argument of the put_signal calls - TODO confirm. */ +enum ROCSHMEM_SIGNAL_OPS { + ROCSHMEM_SIGNAL_SET, + ROCSHMEM_SIGNAL_ADD, +}; + +/** + * @brief Types defined for rocshmem_wait() operations.
+ */ +enum rocshmem_cmps { + ROCSHMEM_CMP_EQ, + ROCSHMEM_CMP_NE, + ROCSHMEM_CMP_GT, + ROCSHMEM_CMP_GE, + ROCSHMEM_CMP_LT, + ROCSHMEM_CMP_LE, +}; +/** @brief Thread-level selectors with implicit values 0 (ROCSHMEM_THREAD_SINGLE) to 4 (ROCSHMEM_THREAD_MULTIPLE). */ +enum rocshmem_thread_ops { + ROCSHMEM_THREAD_SINGLE, + ROCSHMEM_THREAD_FUNNELED, + ROCSHMEM_THREAD_WG_FUNNELED, + ROCSHMEM_THREAD_SERIALIZED, + ROCSHMEM_THREAD_MULTIPLE +}; + +/** + * @brief Bitwise flags to mask configuration parameters (implicit values: ROCSHMEM_TEAM_DEFAULT_CONFIGS = 0, ROCSHMEM_TEAM_NUM_CONTEXTS = 1). + */ +enum rocshmem_team_configs { + ROCSHMEM_TEAM_DEFAULT_CONFIGS, + ROCSHMEM_TEAM_NUM_CONTEXTS +}; +/** @brief Team configuration record; num_contexts is its only field. */ +typedef struct { + int num_contexts; +} rocshmem_team_config_t; +/* Size constants for work-data and sync arrays. Units are not stated in this header - presumably element counts, TODO confirm. */ +constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024; +constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024); +constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256; +constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256; +// Broadcast internally calls the sync function, which matches the barrier implementation, so it reuses the barrier size. +constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE; +constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1; +constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE; +constexpr size_t ROCSHMEM_SYNC_VALUE = 0; +/* Context option values: ROCSHMEM_CTX_ZERO is 0 and the others are distinct single bits (1, 2, 4, 8), so they can presumably be OR-ed together - TODO confirm. */ +const int ROCSHMEM_CTX_ZERO = 0; +const int ROCSHMEM_CTX_NOSTORE = 1; +const int ROCSHMEM_CTX_SERIALIZED = 2; +const int ROCSHMEM_CTX_WG_PRIVATE = 4; +const int ROCSHMEM_CTX_SHARED = 8; + +/** + * @brief GPU side OpenSHMEM context created from each work-group's + * rocshmem_wg_handle_t. Holds two untyped pointers: ctx_opaque and team_opaque. + */ +typedef struct { + void *ctx_opaque; + void *team_opaque; +} rocshmem_ctx_t; + +/** + * Shmem default context. Only declared here (extern __constant__); the definition lives elsewhere. + */ +extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT; + +/** + * Used internally to set default context.
+ */ +void set_internal_ctx(rocshmem_ctx_t *ctx); +/* A team handle is a plain pointer to uint64_t; ROCSHMEM_TEAM_WORLD is only declared here, and ROCSHMEM_TEAM_INVALID is the null handle. */ +typedef uint64_t *rocshmem_team_t; +extern rocshmem_team_t ROCSHMEM_TEAM_WORLD; + +const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr; + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP diff --git a/projects/rocshmem/src/backend_bc.hpp b/projects/rocshmem/src/backend_bc.hpp index d635f5bc5c..ccfd81c6f3 100644 --- a/projects/rocshmem/src/backend_bc.hpp +++ b/projects/rocshmem/src/backend_bc.hpp @@ -35,7 +35,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "backend_type.hpp" #include "ipc_policy.hpp" diff --git a/projects/rocshmem/src/backend_type.hpp b/projects/rocshmem/src/backend_type.hpp index 50818eb178..020b956f57 100644 --- a/projects/rocshmem/src/backend_type.hpp +++ b/projects/rocshmem/src/backend_type.hpp @@ -34,7 +34,7 @@ * functions are not supported at this time. */ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) namespace rocshmem { diff --git a/projects/rocshmem/src/context_device.cpp b/projects/rocshmem/src/context_device.cpp index 1f5fbaae36..850274ef9f 100644 --- a/projects/rocshmem/src/context_device.cpp +++ b/projects/rocshmem/src/context_device.cpp @@ -20,7 +20,7 @@ * IN THE SOFTWARE. *****************************************************************************/ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_bc.hpp" #include "context_incl.hpp" #include "util.hpp" diff --git a/projects/rocshmem/src/context_host.cpp b/projects/rocshmem/src/context_host.cpp index ee97bd4b24..4dce6de3f6 100644 --- a/projects/rocshmem/src/context_host.cpp +++ b/projects/rocshmem/src/context_host.cpp @@ -20,7 +20,7 @@ * IN THE SOFTWARE.
*****************************************************************************/ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_bc.hpp" #include "context_incl.hpp" diff --git a/projects/rocshmem/src/context_tmpl_device.hpp b/projects/rocshmem/src/context_tmpl_device.hpp index 015a9f9ed9..4a7862cb64 100644 --- a/projects/rocshmem/src/context_tmpl_device.hpp +++ b/projects/rocshmem/src/context_tmpl_device.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_ #define LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_type.hpp" #ifdef USE_GPU_IB #include "gpu_ib/context_ib_device.hpp" diff --git a/projects/rocshmem/src/context_tmpl_host.hpp b/projects/rocshmem/src/context_tmpl_host.hpp index ac226e1c83..8bc913f2fc 100644 --- a/projects/rocshmem/src/context_tmpl_host.hpp +++ b/projects/rocshmem/src/context_tmpl_host.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_CONTEXT_TMPL_HOST_HPP_ #define LIBRARY_SRC_CONTEXT_TMPL_HOST_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_type.hpp" #ifdef USE_GPU_IB #include "gpu_ib/context_ib_host.hpp" diff --git a/projects/rocshmem/src/gpu_ib/connection_policy.cpp b/projects/rocshmem/src/gpu_ib/connection_policy.cpp index f672a0bbb2..5ccf91a0ea 100644 --- a/projects/rocshmem/src/gpu_ib/connection_policy.cpp +++ b/projects/rocshmem/src/gpu_ib/connection_policy.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "dynamic_connection.hpp" #include "queue_pair.hpp" diff --git a/projects/rocshmem/src/gpu_ib/connection_policy.hpp b/projects/rocshmem/src/gpu_ib/connection_policy.hpp index 840fe1f2ca..76bb5db1cf 100644 --- 
a/projects/rocshmem/src/gpu_ib/connection_policy.hpp +++ b/projects/rocshmem/src/gpu_ib/connection_policy.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_ #define LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "infiniband_structs.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/gpu_ib/context_ib_device.cpp b/projects/rocshmem/src/gpu_ib/context_ib_device.cpp index fed54fc6e8..c3e8146736 100644 --- a/projects/rocshmem/src/gpu_ib/context_ib_device.cpp +++ b/projects/rocshmem/src/gpu_ib/context_ib_device.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "../backend_type.hpp" #include "../context_incl.hpp" diff --git a/projects/rocshmem/src/gpu_ib/context_ib_host.cpp b/projects/rocshmem/src/gpu_ib/context_ib_host.cpp index 530cfe392d..e569d7e6ef 100644 --- a/projects/rocshmem/src/gpu_ib/context_ib_host.cpp +++ b/projects/rocshmem/src/gpu_ib/context_ib_host.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../backend_type.hpp" #include "../context_incl.hpp" #include "backend_ib.hpp" diff --git a/projects/rocshmem/src/gpu_ib/context_ib_tmpl_device.hpp b/projects/rocshmem/src/gpu_ib/context_ib_tmpl_device.hpp index 44b1dd2291..b29523c804 100644 --- a/projects/rocshmem/src/gpu_ib/context_ib_tmpl_device.hpp +++ b/projects/rocshmem/src/gpu_ib/context_ib_tmpl_device.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_ #define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "context_ib_device.hpp" #include 
"gpu_ib_team.hpp" diff --git a/projects/rocshmem/src/gpu_ib/context_ib_tmpl_host.hpp b/projects/rocshmem/src/gpu_ib/context_ib_tmpl_host.hpp index 552a8c25a2..259f158162 100644 --- a/projects/rocshmem/src/gpu_ib/context_ib_tmpl_host.hpp +++ b/projects/rocshmem/src/gpu_ib/context_ib_tmpl_host.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_ #define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../host/host_templates.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/gpu_ib/network_policy.cpp b/projects/rocshmem/src/gpu_ib/network_policy.cpp index 60ba379f49..b251dd2ad2 100644 --- a/projects/rocshmem/src/gpu_ib/network_policy.cpp +++ b/projects/rocshmem/src/gpu_ib/network_policy.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../atomic_return.hpp" #include "../context_incl.hpp" #include "backend_ib.hpp" diff --git a/projects/rocshmem/src/gpu_ib/network_policy.hpp b/projects/rocshmem/src/gpu_ib/network_policy.hpp index 6e1ceb9107..ab3c75a589 100644 --- a/projects/rocshmem/src/gpu_ib/network_policy.hpp +++ b/projects/rocshmem/src/gpu_ib/network_policy.hpp @@ -26,7 +26,7 @@ #include #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "connection_policy.hpp" #include "queue_pair.hpp" diff --git a/projects/rocshmem/src/gpu_ib/queue_pair.cpp b/projects/rocshmem/src/gpu_ib/queue_pair.cpp index 4a015e1f86..49357ff66c 100644 --- a/projects/rocshmem/src/gpu_ib/queue_pair.cpp +++ b/projects/rocshmem/src/gpu_ib/queue_pair.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_ib.hpp" #include 
"endian.hpp" #include "segment_builder.hpp" diff --git a/projects/rocshmem/src/gpu_ib/queue_pair.hpp b/projects/rocshmem/src/gpu_ib/queue_pair.hpp index 649950c1dc..f17c51b3fe 100644 --- a/projects/rocshmem/src/gpu_ib/queue_pair.hpp +++ b/projects/rocshmem/src/gpu_ib/queue_pair.hpp @@ -34,7 +34,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../atomic_return.hpp" #include "connection_policy.hpp" #include "thread_policy.hpp" diff --git a/projects/rocshmem/src/gpu_ib/thread_policy.cpp b/projects/rocshmem/src/gpu_ib/thread_policy.cpp index e202352b51..80dbcf5482 100644 --- a/projects/rocshmem/src/gpu_ib/thread_policy.cpp +++ b/projects/rocshmem/src/gpu_ib/thread_policy.cpp @@ -22,7 +22,7 @@ #include "thread_policy.hpp" -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "queue_pair.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/gpu_ib/thread_policy.hpp b/projects/rocshmem/src/gpu_ib/thread_policy.hpp index 6a7d5b6a1f..be79cb350e 100644 --- a/projects/rocshmem/src/gpu_ib/thread_policy.hpp +++ b/projects/rocshmem/src/gpu_ib/thread_policy.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_ #define LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../util.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/hdp_policy.hpp b/projects/rocshmem/src/hdp_policy.hpp index 3477b13b78..c6517e7d32 100644 --- a/projects/rocshmem/src/hdp_policy.hpp +++ b/projects/rocshmem/src/hdp_policy.hpp @@ -26,7 +26,7 @@ #include #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "memory/hip_allocator.hpp" #include "util.hpp" diff --git a/projects/rocshmem/src/host/host.cpp 
b/projects/rocshmem/src/host/host.cpp index 3826fd92f9..12608a589f 100644 --- a/projects/rocshmem/src/host/host.cpp +++ b/projects/rocshmem/src/host/host.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "host_helpers.hpp" #include "../memory/window_info.hpp" #include "../util.hpp" diff --git a/projects/rocshmem/src/host/host_templates.hpp b/projects/rocshmem/src/host/host_templates.hpp index 522d1398b1..b02d4903c0 100644 --- a/projects/rocshmem/src/host/host_templates.hpp +++ b/projects/rocshmem/src/host/host_templates.hpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "host_helpers.hpp" #include "../memory/window_info.hpp" #include "../team.hpp" diff --git a/projects/rocshmem/src/ipc/context_ipc_device.cpp b/projects/rocshmem/src/ipc/context_ipc_device.cpp index 9e8df917bd..d214c663b8 100644 --- a/projects/rocshmem/src/ipc/context_ipc_device.cpp +++ b/projects/rocshmem/src/ipc/context_ipc_device.cpp @@ -30,7 +30,7 @@ #include #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "backend_ipc.hpp" diff --git a/projects/rocshmem/src/ipc/context_ipc_host.cpp b/projects/rocshmem/src/ipc/context_ipc_host.cpp index 0d3464f33d..051789b99a 100644 --- a/projects/rocshmem/src/ipc/context_ipc_host.cpp +++ b/projects/rocshmem/src/ipc/context_ipc_host.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../backend_type.hpp" #include "../context_incl.hpp" #include "backend_ipc.hpp" diff --git a/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp b/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp index b83d763c1a..1be94d9ce4 100644 --- 
a/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp +++ b/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_ #define LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "context_ipc_device.hpp" #include "../util.hpp" diff --git a/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp b/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp index 9e79849631..e1ff45f228 100644 --- a/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp +++ b/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_ #define LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../host/host_templates.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/ipc_policy.cpp b/projects/rocshmem/src/ipc_policy.cpp index 2c3a96fa7c..79f13d91b8 100644 --- a/projects/rocshmem/src/ipc_policy.cpp +++ b/projects/rocshmem/src/ipc_policy.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_bc.hpp" #include "context_incl.hpp" #include "util.hpp" diff --git a/projects/rocshmem/src/ipc_policy.hpp b/projects/rocshmem/src/ipc_policy.hpp index 74052bbe54..0cc7437e75 100644 --- a/projects/rocshmem/src/ipc_policy.hpp +++ b/projects/rocshmem/src/ipc_policy.hpp @@ -29,7 +29,7 @@ #include #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "memory/hip_allocator.hpp" #include "util.hpp" diff --git a/projects/rocshmem/src/memory/heap_type.hpp b/projects/rocshmem/src/memory/heap_type.hpp index 3f48edd11e..058f781b86 100644 --- 
a/projects/rocshmem/src/memory/heap_type.hpp +++ b/projects/rocshmem/src/memory/heap_type.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_MEMORY_HEAP_TYPE_HPP_ #define LIBRARY_SRC_MEMORY_HEAP_TYPE_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "hip_allocator.hpp" /** diff --git a/projects/rocshmem/src/reverse_offload/context_ro_device.cpp b/projects/rocshmem/src/reverse_offload/context_ro_device.cpp index 43ce155cac..4cc5951de0 100644 --- a/projects/rocshmem/src/reverse_offload/context_ro_device.cpp +++ b/projects/rocshmem/src/reverse_offload/context_ro_device.cpp @@ -29,7 +29,7 @@ #include #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "../backend_type.hpp" #include "../hdp_policy.hpp" diff --git a/projects/rocshmem/src/reverse_offload/context_ro_host.cpp b/projects/rocshmem/src/reverse_offload/context_ro_host.cpp index 5c3f5d1a70..62360a5224 100644 --- a/projects/rocshmem/src/reverse_offload/context_ro_host.cpp +++ b/projects/rocshmem/src/reverse_offload/context_ro_host.cpp @@ -24,7 +24,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../backend_type.hpp" #include "../context_incl.hpp" #include "../host/host.hpp" diff --git a/projects/rocshmem/src/reverse_offload/context_ro_tmpl_device.hpp b/projects/rocshmem/src/reverse_offload/context_ro_tmpl_device.hpp index 003ddae928..d76a9eb222 100644 --- a/projects/rocshmem/src/reverse_offload/context_ro_tmpl_device.hpp +++ b/projects/rocshmem/src/reverse_offload/context_ro_tmpl_device.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_GPU_TEMPLATES_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_GPU_TEMPLATES_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) 
#include "commands_types.hpp" #include "context_ro_device.hpp" #include "queue_proxy.hpp" diff --git a/projects/rocshmem/src/reverse_offload/context_ro_tmpl_host.hpp b/projects/rocshmem/src/reverse_offload/context_ro_tmpl_host.hpp index 3bffda1cd0..28abfecc2e 100644 --- a/projects/rocshmem/src/reverse_offload/context_ro_tmpl_host.hpp +++ b/projects/rocshmem/src/reverse_offload/context_ro_tmpl_host.hpp @@ -22,7 +22,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_ -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../host/host_templates.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/reverse_offload/profiler.hpp b/projects/rocshmem/src/reverse_offload/profiler.hpp index 132538941a..a180f560c8 100644 --- a/projects/rocshmem/src/reverse_offload/profiler.hpp +++ b/projects/rocshmem/src/reverse_offload/profiler.hpp @@ -26,7 +26,7 @@ #include #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "../device_proxy.hpp" #include "../memory/../memory/hip_allocator.hpp" #include "../stats.hpp" diff --git a/projects/rocshmem/src/rocshmem_gpu.cpp b/projects/rocshmem/src/rocshmem_gpu.cpp index a9d3667ba9..0e4ce22508 100644 --- a/projects/rocshmem/src/rocshmem_gpu.cpp +++ b/projects/rocshmem/src/rocshmem_gpu.cpp @@ -41,7 +41,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "backend_bc.hpp" #include "context_incl.hpp" diff --git a/projects/rocshmem/src/util.cpp b/projects/rocshmem/src/util.cpp index 7318b14208..e2f05faf4b 100644 --- a/projects/rocshmem/src/util.cpp +++ b/projects/rocshmem/src/util.cpp @@ -26,7 +26,7 @@ #include #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" 
// NOLINT(build/include_subdir) namespace rocshmem { diff --git a/projects/rocshmem/src/util.hpp b/projects/rocshmem/src/util.hpp index c02f891dc4..32ac87cc58 100644 --- a/projects/rocshmem/src/util.hpp +++ b/projects/rocshmem/src/util.hpp @@ -30,7 +30,7 @@ #include #include "assembly.hpp" -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "constants.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/wf_coal_policy.hpp b/projects/rocshmem/src/wf_coal_policy.hpp index 08a06323ca..2d70f8fc61 100644 --- a/projects/rocshmem/src/wf_coal_policy.hpp +++ b/projects/rocshmem/src/wf_coal_policy.hpp @@ -25,7 +25,7 @@ #include -#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "util.hpp" namespace rocshmem { diff --git a/projects/rocshmem/utlis/header_files_gen/AMO.py b/projects/rocshmem/utlis/header_files_gen/AMO.py new file mode 100644 index 0000000000..afee8d0f43 --- /dev/null +++ b/projects/rocshmem/utlis/header_files_gen/AMO.py @@ -0,0 +1,639 @@ +""" +****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************** +""" + +import os + +types = [ + ("int", "int"), + ("long", "long"), + ("long long", "longlong"), + ("unsigned int", "uint"), + ("unsigned long", "ulong"), + ("unsigned long long", "ulonglong"), + ("int32_t", "int32"), + ("int64_t", "int64"), + ("uint32_t", "uint32"), + ("uint64_t", "uint64"), + ("size_t", "size"), + ("ptrdiff_t", "ptrdiff"), +] + + +float_types = [ + ("float", "float"), + ("double", "double"), +] + +bitwise_types = types[3:10] + + +def atomic_fetch_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch(\n" + f" rocshmem_ctx_t ctx, {T} *source, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch(\n" + f" {T} *source, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch(\n" + f" rocshmem_ctx_t ctx, {T} *source, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_fetch(\n" + f" {T} *source, int pe);\n\n" + ) + + +def generate_atomic_fetch_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_FETCH + * @brief Atomically return the value of \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. 
+ * @param[in] pe PE of the remote process. + * + * @return The value of \p dest. + */\n""" + + for type_, tname_ in float_types: + expanded_code += atomic_fetch_api(type_, tname_) + + for type_, tname_ in types: + expanded_code += atomic_fetch_api(type_, tname_) + + return expanded_code + + +def atomic_set_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_set(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_set(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_atomic_set(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_{TNAME}_atomic_set(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_set_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_SET + * @brief Atomically set the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return void + */\n""" + + for type_, tname_ in float_types: + expanded_code += atomic_set_api(type_, tname_) + + for type_, tname_ in types: + expanded_code += atomic_set_api(type_, tname_) + + return expanded_code + + +def atomic_compare_swap_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_compare_swap(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} cond, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_compare_swap(\n" + f" {T} *dest, {T} cond, {T} value, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_compare_swap(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} cond, {T} value, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_compare_swap(\n" + f" {T} *dest, {T} cond, {T} value, int pe);\n\n" + ) + + +def generate_atomic_compare_swap_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_COMPARE_SWAP + * @brief Atomically compares if the value in \p dest with \p cond is equal + * then put \p val in \p dest. The operation returns the older value of \p dest + * to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] cond The value to be compare with. + * @param[in] val The value to be atomically swapped. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest. 
+ */\n""" + for type_, tname_ in types: + expanded_code += atomic_compare_swap_api(type_, tname_) + + return expanded_code + + +def atomic_swap_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_swap(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_swap(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_swap(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_swap(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_swap_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_SWAP + * @brief Atomically swap the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return original value + */\n""" + + for type_, tname_ in float_types: + expanded_code += atomic_swap_api(type_, tname_) + + for type_, tname_ in types: + expanded_code += atomic_swap_api(type_, tname_) + + return expanded_code + + +def atomic_fetch_inc_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_inc(\n" + f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_inc(\n" + f" {T} *dest, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_inc(\n" + f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_inc(\n" + f" {T} *dest, int pe);\n\n" + ) + + +def generate_atomic_fetch_inc_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_FETCH_INC + * @brief Atomically add 1 to \p dest on \p pe. The operation + * returns the older value of \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest before it was incremented by 1. 
+ */\n""" + for type_, tname_ in types: + expanded_code += atomic_fetch_inc_api(type_, tname_) + + return expanded_code + + +def atomic_inc_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_inc(\n" + f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_inc(\n" + f" {T} *dest, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_atomic_inc(\n" + f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n" + f"__host__ void rocshmem_{TNAME}_atomic_inc(\n" + f" {T} *dest, int pe);\n\n" + ) + + +def generate_atomic_inc_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_INC + * @brief Atomically add 1 to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] pe PE of the remote process. + * + * @return void + */\n""" + for type_, tname_ in types: + expanded_code += atomic_inc_api(type_, tname_) + + return expanded_code + + +def atomic_fetch_add_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_add(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_add(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_add(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_add(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_fetch_add_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_FETCH_ADD + * @brief Atomically add the value \p val to \p dest on \p pe. The operation + * returns the older value of \p dest to the calling PE. + * + * The operation is blocking. 
+ * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest before the \p val was added. + */\n""" + for type_, tname_ in types: + expanded_code += atomic_fetch_add_api(type_, tname_) + + return expanded_code + + +def atomic_add_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_add(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_add(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_atomic_add(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_{TNAME}_atomic_add(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_add_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_ADD + * @brief Atomically add the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return void + */\n""" + for type_, tname_ in types: + expanded_code += atomic_add_api(type_, tname_) + + return expanded_code + + +def atomic_fetch_and_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_and(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_and(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_and(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_and(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_fetch_and_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_FETCH_AND + * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return original value + */\n""" + for type_, tname_ in bitwise_types: + expanded_code += atomic_fetch_and_api(type_, tname_) + + return expanded_code + + +def atomic_and_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_and(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_and(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_atomic_and(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_{TNAME}_atomic_and(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_and_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_AND + * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return void + */\n""" + for type_, tname_ in bitwise_types: + expanded_code += atomic_and_api(type_, tname_) + + return expanded_code + + +def atomic_fetch_or_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_or(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_or(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_or(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_or(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_fetch_or_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_FETCH_OR + * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return original value + */\n""" + for type_, tname_ in bitwise_types: + expanded_code += atomic_fetch_or_api(type_, tname_) + + return expanded_code + + +def atomic_or_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_or(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_or(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_atomic_or(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_{TNAME}_atomic_or(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_or_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_OR + * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return void + */\n""" + for type_, tname_ in bitwise_types: + expanded_code += atomic_or_api(type_, tname_) + + return expanded_code + + +def atomic_fetch_xor_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_xor(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_xor(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_xor(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_xor(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_fetch_xor_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_FETCH_XOR + * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return original value + */\n""" + for type_, tname_ in bitwise_types: + expanded_code += atomic_fetch_xor_api(type_, tname_) + + return expanded_code + + +def atomic_xor_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_xor(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_xor(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_atomic_xor(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_{TNAME}_atomic_xor(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_atomic_xor_api(): + expanded_code = """ +/** + * @name SHMEM_ATOMIC_XOR + * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. 
+ * + * @return void + */\n""" + for type_, tname_ in bitwise_types: + expanded_code += atomic_xor_api(type_, tname_) + + return expanded_code + +def write_to_file(filename, content): + with open(filename, 'w') as file: + file.write(content) + + +def generate_AMO_header(output_dir, copyright): + expanded_code = copyright + + expanded_code += """ +#ifndef LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP + +namespace rocshmem { +""" + + expanded_code += ( + generate_atomic_fetch_api() + + generate_atomic_set_api() + + generate_atomic_compare_swap_api() + + generate_atomic_swap_api() + + generate_atomic_fetch_inc_api() + + generate_atomic_inc_api() + + generate_atomic_fetch_add_api() + + generate_atomic_add_api() + + generate_atomic_fetch_and_api() + + generate_atomic_and_api() + + generate_atomic_fetch_or_api() + + generate_atomic_or_api() + + generate_atomic_fetch_xor_api() + + generate_atomic_xor_api() + ) + + expanded_code += """ +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP +""" + + output_file = os.path.join( + output_dir, 'rocshmem_AMO.hpp' + ) + + write_to_file(output_file, expanded_code) diff --git a/projects/rocshmem/utlis/header_files_gen/COLL.py b/projects/rocshmem/utlis/header_files_gen/COLL.py new file mode 100644 index 0000000000..2755f8b26f --- /dev/null +++ b/projects/rocshmem/utlis/header_files_gen/COLL.py @@ -0,0 +1,246 @@ +""" +****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************** +""" + +import os + +types = [ + ("float", "float"), + ("double", "double"), + ("char", "char"), + ("signed char", "schar"), + ("short", "short"), + ("int", "int"), + ("long", "long"), + ("long long", "longlong"), + ("unsigned char", "uchar"), + ("unsigned short", "ushort"), + ("unsigned int", "uint"), + ("unsigned long", "ulong"), + ("unsigned long long", "ulonglong"), +] + + +def alltoall_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n" + f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n" + f" const {T} *source, int nelems);\n\n" + ) + + +def generate_alltoall_api(): + expanded_code = """ +/** + * @name SHMEM_ALLTOALL + * @brief Exchanges a fixed amount of contiguous data blocks between all pairs + * of PEs participating in the collective routine. 
+ * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelems Number of data blocks transferred per pair of PEs. + * + * @return void + */\n""" + for type_, tname_ in types: + expanded_code += alltoall_api(type_, tname_) + + return expanded_code + + +def broadcast_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n" + f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n" + f" const {T} *source, int nelems, int pe_root);\n" + f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" int nelems, int pe_root, int pe_start, int log_pe_stride,\n" + f" int pe_size, long *p_sync);\n" + f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n" + f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n" + f" const {T} *source, int nelems, int pe_root);\n\n" + ) + + +def generate_broadcast_api(): + expanded_code = """ +/** + * @name SHMEM_BROADCAST + * @brief Perform a broadcast between PEs in the active set. The caller + * is blocked until the broadcase completes. + * + * This function must be called as a work-group collective. + * + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelement Size of the buffer to participate in the broadcast. + * @param[in] PE_root Zero-based ordinal of the PE, with respect to the + active set, from which the data is copied + * @param[in] PE_start PE to start the reduction. + * @param[in] logPE_stride Stride of PEs participating in the reduction. + * @param[in] PE_size Number PEs participating in the reduction. 
+ * @param[in] pSync Temporary sync buffer provided to ROCSHMEM. Must + be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. + * + * @return void + */\n""" + for type_, tname_ in types: + expanded_code += broadcast_api(type_, tname_) + + return expanded_code + + +def fcollect_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n" + f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n" + f" const {T} *source, int nelems);\n\n" + ) + + +def generate_fcollect_api(): + expanded_code = """ +/** + * @name SHMEM_FCOLLECT + * @brief Concatenates blocks of data from multiple PEs to an array in every + * PE participating in the collective routine. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelems Number of data blocks in source array. 
+ * + * @return void + */\n""" + for type_, tname_ in types: + expanded_code += fcollect_api(type_, tname_) + + return expanded_code + + +def reduction_api(T, TNAME, Op_API): + return ( + f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n" + f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n" + f" int nreduce);\n" + f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n" + f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n" + f" int nreduce);\n\n" + ) + + +def arith_reduction_api(T, TNAME): + operations = ["sum", "min", "max", "prod"] + return "".join([reduction_api(T, TNAME, op) for op in operations]) + +def bitwise_reduction_api(T, TNAME): + operations = ["or", "and", "xor"] + return "".join([reduction_api(T, TNAME, op) for op in operations]) + + +def generate_reduction_api(): + expanded_code = """ +/** + * @name SHMEM_REDUCTIONS + * @brief Perform an allreduce between PEs in the active set. The caller + * is blocked until the reduction completes. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nreduce Size of the buffer to participate in the reduction. + * + * @return int (Zero on successful local completion. Nonzero otherwise.) 
+ */\n""" + + int_types = [ + ("short", "short"), + ("int", "int"), + ("long", "long"), + ("long long", "longlong") + ] + + float_types = [ + ("float", "float"), + ("double", "double") + ] + + for type_, tname_ in int_types: + expanded_code += arith_reduction_api(type_, tname_) + expanded_code += bitwise_reduction_api(type_, tname_) + + for type_, tname_ in float_types: + expanded_code += arith_reduction_api(type_, tname_) + + return expanded_code + + +def write_to_file(filename, content): + with open(filename, 'w') as file: + file.write(content) + + +def generate_COLL_header(output_dir, copyright): + expanded_code = copyright + + expanded_code += """ +#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP + +namespace rocshmem { +""" + + expanded_code += ( + generate_alltoall_api() + + generate_broadcast_api() + + generate_fcollect_api() + + generate_reduction_api() + ) + + expanded_code += """ +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP +""" + + output_file = os.path.join( + output_dir, 'rocshmem_COLL.hpp' + ) + + write_to_file(output_file, expanded_code) diff --git a/projects/rocshmem/utlis/header_files_gen/P2P_SYNC.py b/projects/rocshmem/utlis/header_files_gen/P2P_SYNC.py new file mode 100644 index 0000000000..430e0b8aed --- /dev/null +++ b/projects/rocshmem/utlis/header_files_gen/P2P_SYNC.py @@ -0,0 +1,176 @@ +""" +****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************** +""" + +import os + +types = [ + ("float", "float"), + ("double", "double"), + ("char", "char"), + ("signed char", "schar"), + ("short", "short"), + ("int", "int"), + ("long", "long"), + ("long long", "longlong"), + ("unsigned char", "uchar"), + ("unsigned short", "ushort"), + ("unsigned int", "uint"), + ("unsigned long", "ulong"), + ("unsigned long long", "ulonglong"), +] + + +def wait_until_api(T, TNAME): + return ( + f"__device__ void rocshmem_{TNAME}_wait_until(\n" + f" {T} *ivars, int cmp, {T} val);\n" + f"__device__ size_t rocshmem_{TNAME}_wait_until_any(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__device__ void rocshmem_{TNAME}_wait_until_all(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__device__ size_t rocshmem_{TNAME}_wait_until_some(\n" + f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n" + f" int cmp, {T} val);\n" + f"__device__ size_t rocshmem_{TNAME}_wait_until_any_vector(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__device__ void rocshmem_{TNAME}_wait_until_all_vector(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__device__ size_t rocshmem_{TNAME}_wait_until_some_vector(\n" + f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n" + f" int cmp, {T} val);\n" + f"__host__ void rocshmem_{TNAME}_wait_until(\n" + f" {T} *ivars, int cmp, {T} val);\n" + f"__host__ size_t rocshmem_{TNAME}_wait_until_any(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__host__ void rocshmem_{TNAME}_wait_until_all(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__host__ size_t rocshmem_{TNAME}_wait_until_some(\n" + f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n" + f" int cmp, {T} val);\n" + 
f"__host__ size_t rocshmem_{TNAME}_wait_until_any_vector(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__host__ void rocshmem_{TNAME}_wait_until_all_vector(\n" + f" {T} *ivars, size_t nelems, const int* status,\n" + f" int cmp, {T} val);\n" + f"__host__ size_t rocshmem_{TNAME}_wait_until_some_vector(\n" + f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n" + f" int cmp, {T} val);\n\n" + ) + + +def generate_wait_until_api(): + expanded_code = """ +/** + * @name SHMEM_WAIT_UNTIL + * @brief Block the caller until the condition (* \p ptr \p cmps \p val) is + * true. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ivars Pointer to memory on the symmetric heap to wait for. + * @param[in] cmp Operation for the comparison. + * @param[in] val Value to compare the memory at \p ptr to. + * + * @return void + */\n""" + for type_, tname_ in types: + expanded_code += wait_until_api(type_, tname_) + + return expanded_code + + +def test_api(T, TNAME): + return ( + f"__device__ int rocshmem_{TNAME}_test(\n" + f" {T} *ivars, int cmp, {T} val);\n" + f"__host__ int rocshmem_{TNAME}_test(\n" + f" {T} *ivars, int cmp, {T} val);\n\n" + ) + + +def generate_test_api(): + expanded_code = """ +/** + * @name SHMEM_TEST + * @brief test if the condition (* \p ptr \p cmps \p val) is + * true. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ivars Pointer to memory on the symmetric heap to wait for. + * @param[in] cmp Operation for the comparison. 
+ * @param[in] val Value to compare the memory at \p ptr to. + * + * @return 1 if the evaluation is true else 0 + */\n""" + for type_, tname_ in types: + expanded_code += test_api(type_, tname_) + + return expanded_code + + +def write_to_file(filename, content): + with open(filename, 'w') as file: + file.write(content) + + +def generate_P2P_SYNC_header(output_dir, copyright): + expanded_code = copyright + + expanded_code += """ +#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP + +namespace rocshmem { +""" + + expanded_code += ( + generate_wait_until_api() + + generate_test_api() + ) + + expanded_code += """ +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP +""" + + output_file = os.path.join( + output_dir, 'rocshmem_P2P_SYNC.hpp' + ) + + write_to_file(output_file, expanded_code) diff --git a/projects/rocshmem/utlis/header_files_gen/RMA.py b/projects/rocshmem/utlis/header_files_gen/RMA.py new file mode 100644 index 0000000000..cf12a3c3fa --- /dev/null +++ b/projects/rocshmem/utlis/header_files_gen/RMA.py @@ -0,0 +1,335 @@ +""" +****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************** +""" + +import os + +types = [ + ("float", "float"), + ("double", "double"), + ("char", "char"), + ("signed char", "schar"), + ("short", "short"), + ("int", "int"), + ("long", "long"), + ("long long", "longlong"), + ("unsigned char", "uchar"), + ("unsigned short", "ushort"), + ("unsigned int", "uint"), + ("unsigned long", "ulong"), + ("unsigned long long", "ulonglong"), +] + + +def put_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put(\n" + f" {T} *dest, const {T} *source, size_t nelems, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_put(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__host__ void rocshmem_{TNAME}_put({T} *dest,\n" + f" const {T} *source, size_t nelems, int pe);\n\n" + ) + + +def generate_put_api(): + expanded_code = """ +/** + * @name SHMEM_PUT + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-thread + * granularity. 
However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */\n""" + for type_, tname_ in types: + expanded_code += put_api(type_, tname_) + + return expanded_code + + +def get_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get(\n" + f" {T} *dest, const {T} *source, size_t nelems, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_get(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__host__ void rocshmem_{TNAME}_get({T} *dest,\n" + f" const {T} *source, size_t nelems, int pe);\n\n" + ) + + +def generate_get_api(): + expanded_code = """ +/** + * @name SHMEM_GET + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. 
+ * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */\n""" + for type_, tname_ in types: + expanded_code += get_api(type_, tname_) + + return expanded_code + + +def p_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_p(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value,\n" + f" int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_p(\n" + f" {T} *dest, {T} value, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_p(\n" + f" rocshmem_ctx_t ctx, {T} *dest, {T} value,\n" + f" int pe);\n" + f"__host__ void rocshmem_{TNAME}_p(\n" + f" {T} *dest, {T} value, int pe);\n\n" + ) + + +def generate_p_api(): + expanded_code = """ +/** + * @name SHMEM_P + * @brief Writes a single value to \p dest at \p pe PE to \p dst at \p pe. + * The caller must call into rocshmem_quiet() if remote completion is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] value Value to write to dest at \p pe. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */\n""" + for type_, tname_ in types: + expanded_code += p_api(type_, tname_) + + return expanded_code + + +def g_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_g(\n" + f" rocshmem_ctx_t ctx, const {T} *source, int pe);\n" + f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_g(\n" + f" const {T} *source, int pe);\n" + f"__host__ {T} rocshmem_ctx_{TNAME}_g(\n" + f" rocshmem_ctx_t ctx, const {T} *source, int pe);\n" + f"__host__ {T} rocshmem_{TNAME}_g(\n" + f" const {T} *source, int pe);\n\n" + ) + + +def generate_g_api(): + expanded_code = """ +/** + * @name SHMEM_G + * @brief reads and returns single value from \p source at \p pe. + * The calling work-group/thread will block until the operation completes. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] source Source address. Must be an address on the symmetric + * heap. + * @param[in] pe PE of the remote process. + * + * @return the value read from remote \p source at \p pe. 
+ */\n""" + for type_, tname_ in types: + expanded_code += g_api(type_, tname_) + + return expanded_code + + +def put_nbi_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_nbi(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_nbi(\n" + f" {T} *dest, const {T} *source, size_t nelems, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_put_nbi(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__host__ void rocshmem_{TNAME}_put_nbi(\n" + f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n" + ) + + +def generate_put_nbi_api(): + expanded_code = """ +/** + * @name SHMEM_PUT_NBI + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */\n""" + for type_, tname_ in types: + expanded_code += put_nbi_api(type_, tname_) + + return expanded_code + + +def get_nbi_api(T, TNAME): + return ( + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_nbi(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_nbi(\n" + f" {T} *dest, const {T} *source, size_t nelems, int pe);\n" + f"__host__ void rocshmem_ctx_{TNAME}_get_nbi(\n" + f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n" + f" size_t nelems, int pe);\n" + f"__host__ void rocshmem_{TNAME}_get_nbi({T} *dest,\n" + f" const {T} *source, size_t nelems, int pe);\n\n" + ) + + +def generate_get_nbi_api(): + expanded_code = """ +/** + * @name SHMEM_GET_NBI + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller will + * return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */\n""" + for type_, tname_ in types: + expanded_code += get_nbi_api(type_, tname_) + + return expanded_code + + +def write_to_file(filename, content): + with open(filename, 'w') as file: + file.write(content) + + +def generate_RMA_header(output_dir, copyright): + expanded_code = copyright + + expanded_code += """ +#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP + +namespace rocshmem { +""" + + expanded_code += ( + generate_put_api() + + generate_p_api() + + generate_get_api() + + generate_g_api() + + generate_put_nbi_api() + + generate_get_nbi_api() + ) + + expanded_code += """ +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP +""" + + output_file = os.path.join( + output_dir, 'rocshmem_RMA.hpp' + ) + + write_to_file(output_file, expanded_code) diff --git a/projects/rocshmem/utlis/header_files_gen/RMA_X.py b/projects/rocshmem/utlis/header_files_gen/RMA_X.py new file mode 100644 index 0000000000..705731fa53 --- /dev/null +++ b/projects/rocshmem/utlis/header_files_gen/RMA_X.py @@ -0,0 +1,318 @@ +""" +****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************
+"""
+
+import os
+
+types = [  # (C type spelling, fragment used in the generated function names)
+    ("float", "float"),
+    ("double", "double"),
+    ("char", "char"),
+    ("signed char", "schar"),
+    ("short", "short"),
+    ("int", "int"),
+    ("long", "long"),
+    ("long long", "longlong"),
+    ("unsigned char", "uchar"),
+    ("unsigned short", "ushort"),
+    ("unsigned int", "uint"),
+    ("unsigned long", "ulong"),
+    ("unsigned long long", "ulonglong"),
+]
+
+
+def put_api_x(GRAN, T, TNAME):  # device put declarations; GRAN is the name suffix ("wave"/"wg")
+    return (
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_{GRAN}(\n"
+        f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
+        f" size_t nelems, int pe);\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_{GRAN}(\n"
+        f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
+    )
+
+
+def generate_put_api_x():  # Doxygen + declarations: all types, "wave" first, then "wg"
+    expanded_code = """
+/**
+ * @brief Writes contiguous data of \\p nelems elements from \\p source on the
+ * calling PE to \\p dest at \\p pe. The caller will block until the operation
+ * completes locally (it is safe to reuse \\p source). The caller must
+ * call into rocshmem_quiet() if remote completion is required.
+ *
+ * This function can be called from divergent control paths at per-wave
+ * granularity. However, all threads in a wave must collectively participate
+ * in the call using the same arguments.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += put_api_x("wave", type_, tname_)
+
+    expanded_code += """
+/**
+ * @brief Writes contiguous data of \\p nelems elements from \\p source on the
+ * calling PE to \\p dest at \\p pe. The caller will block until the operation
+ * completes locally (it is safe to reuse \\p source). The caller must
+ * call into rocshmem_quiet() if remote completion is required.
+ *
+ * This function can be called from divergent control paths at per-workgroup
+ * (WG) granularity. However, all threads in a WG must collectively participate
+ * in the call using the same arguments.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += put_api_x("wg", type_, tname_)
+
+    return expanded_code
+
+
+def get_api_x(GRAN, T, TNAME):  # device get declarations; GRAN is the name suffix ("wave"/"wg")
+    return (
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_{GRAN}(\n"
+        f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
+        f" size_t nelems, int pe);\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_{GRAN}(\n"
+        f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
+    )
+
+
+def generate_get_api_x():  # Doxygen + declarations: all types, "wave" first, then "wg"
+    expanded_code = """
+/**
+ * @brief Reads contiguous data of \\p nelems elements from \\p source on \\p pe
+ * to \\p dest on the calling PE. The calling work-group will block until the
+ * operation completes (data has been placed in \\p dest).
+ *
+ * This function can be called from divergent control paths at per-wave
+ * granularity. However, all threads in the wave must participate in the
+ * call using the same parameters.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += get_api_x("wave", type_, tname_)
+
+    expanded_code += """
+/**
+ * @brief Reads contiguous data of \\p nelems elements from \\p source on \\p pe
+ * to \\p dest on the calling PE. The calling work-group will block until the
+ * operation completes (data has been placed in \\p dest).
+ *
+ * This function can be called from divergent control paths at per-workgroup
+ * granularity. However, all threads in the workgroup must participate in
+ * the call using the same parameters.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += get_api_x("wg", type_, tname_)
+
+    return expanded_code
+
+
+def put_nbi_api_x(GRAN, T, TNAME):  # device put_nbi declarations; GRAN is the name suffix
+    return (
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_nbi_{GRAN}(\n"
+        f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
+        f" size_t nelems, int pe);\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_nbi_{GRAN}(\n"
+        f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
+    )
+
+
+def generate_put_nbi_api_x():  # Doxygen + declarations: all types, "wave" first, then "wg"
+    expanded_code = """
+/**
+ * @brief Writes contiguous data of \\p nelems elements from \\p source on the
+ * calling PE to \\p dest on \\p pe. The operation is not blocking. The caller
+ * will return as soon as the request is posted. The caller must call
+ * rocshmem_quiet() on the same context if completion notification is
+ * required.
+ *
+ * This function can be called from divergent control paths at per-wave
+ * granularity. However, all threads in the wave must call in with the same
+ * arguments.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += put_nbi_api_x("wave", type_, tname_)
+
+    expanded_code += """
+/**
+ * @brief Writes contiguous data of \\p nelems elements from \\p source on the
+ * calling PE to \\p dest on \\p pe. The operation is not blocking. The caller
+ * will return as soon as the request is posted. The caller must call
+ * rocshmem_quiet() on the same context if completion notification is
+ * required.
+ *
+ * This function can be called from divergent control paths at per-workgroup
+ * granularity. However, all threads in the WG must call in with the same
+ * arguments.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += put_nbi_api_x("wg", type_, tname_)
+
+    return expanded_code
+
+
+def get_nbi_api_x(GRAN, T, TNAME):  # device get_nbi declarations; GRAN is the name suffix
+    return (
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_nbi_{GRAN}(\n"
+        f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
+        f" size_t nelems, int pe);\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_nbi_{GRAN}(\n"
+        f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
+    )
+
+
+def generate_get_nbi_api_x():  # Doxygen + declarations: all types, "wave" first, then "wg"
+    expanded_code = """
+/**
+ * @brief Reads contiguous data of \\p nelems elements from \\p source on \\p pe
+ * to \\p dest on the calling PE. The operation is not blocking. The caller
+ * will return as soon as the request is posted. The caller must call
+ * rocshmem_quiet() on the same context if completion notification is
+ * required.
+ *
+ * This function can be called from divergent control paths at per-wave
+ * granularity. However, all threads in the wave must call in with the same
+ * arguments.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += get_nbi_api_x("wave", type_, tname_)
+
+    expanded_code += """
+/**
+ * @brief Reads contiguous data of \\p nelems elements from \\p source on \\p pe
+ * to \\p dest on the calling PE. The operation is not blocking. The caller
+ * will return as soon as the request is posted. The caller must call
+ * rocshmem_quiet() on the same context if completion notification is
+ * required.
+ *
+ * This function can be called from divergent control paths at per-workgroup
+ * granularity. However, all threads in the WG must call in with the same
+ * arguments.
+ *
+ * @param[in] ctx Context with which to perform this operation.
+ * @param[in] dest Destination address. Must be an address on the symmetric
+ * heap.
+ * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements.
+ * @param[in] pe PE of the remote process.
+ *
+ * @return void.
+ */\n"""
+    for type_, tname_ in types:
+        expanded_code += get_nbi_api_x("wg", type_, tname_)
+
+    return expanded_code
+
+
+def write_to_file(filename, content):  # overwrite `filename` with `content` (text mode)
+    with open(filename, 'w') as file:
+        file.write(content)
+
+
+def generate_RMA_X_header(output_dir, copyright):  # assemble and write <output_dir>/rocshmem_RMA_X.hpp
+    expanded_code = copyright
+
+    expanded_code += """
+#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
+#define LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
+
+namespace rocshmem {
+"""
+
+    expanded_code += (
+        generate_put_api_x() +
+        generate_get_api_x() +
+        generate_put_nbi_api_x() +
+        generate_get_nbi_api_x()
+    )
+
+    expanded_code += """
+} // namespace rocshmem
+
+#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
+"""
+
+    output_file = os.path.join(
+        output_dir, 'rocshmem_RMA_X.hpp'
+    )
+
+    write_to_file(output_file, expanded_code)
diff --git a/projects/rocshmem/utlis/header_files_gen/SIG_OP.py b/projects/rocshmem/utlis/header_files_gen/SIG_OP.py
new file mode 100644
index 0000000000..b9880c0d0a
--- /dev/null
+++ b/projects/rocshmem/utlis/header_files_gen/SIG_OP.py
@@ -0,0 +1,108 @@
+"""
+******************************************************************************
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************
+"""
+
+import os
+
+types = [
+    ("float", "float"),
+    ("double", "double"),
+    ("char", "char"),
+    ("signed char", "schar"),
+    ("short", "short"),
+    ("int", "int"),
+    ("long", "long"),
+    ("long long", "longlong"),
+    ("unsigned char", "uchar"),
+    ("unsigned short", "ushort"),
+    ("unsigned int", "uint"),
+    ("unsigned long", "ulong"),
+    ("unsigned long long", "ulonglong"),
+]
+
+
+def putmem_signal_dec(SUFFIX):  # untyped (void*) put-with-signal declarations, plain and ctx forms
+    return (
+        f"__device__ ATTR_NO_INLINE void rocshmem_putmem_signal{SUFFIX}(\n"
+        f" void *dest, const void *source, size_t nelems, uint64_t *sig_addr,\n"
+        f" uint64_t signal, int sig_op, int pe);\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal{SUFFIX}(\n"
+        f" rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,\n"
+        f" uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);\n\n"
+    )
+
+
+def put_signal_typed_dec(T, TNAME, SUFFIX):  # typed put-with-signal declarations, ctx and plain forms
+    return (
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_signal{SUFFIX}(\n"
+        f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source, size_t nelems,\n"
+        f" uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_signal{SUFFIX}(\n"
+        f" {T} *dest, const {T} *source, size_t nelems, uint64_t *sig_addr,\n"
+        f" uint64_t signal, int sig_op, int pe);\n\n"
+    )
+
+
+def put_signal_dec(SUFFIX):  # typed declarations for every entry of `types`, concatenated in order
+    return "".join(put_signal_typed_dec(c_type, short, SUFFIX) for c_type, short in types)
+
+
+def signaling_api_dec(SUFFIX):  # untyped declarations first, then the typed ones
+    return putmem_signal_dec(SUFFIX) + put_signal_dec(SUFFIX)
+
+
+def generate_signal_api():  # one declaration group per name suffix, in the listed order
+
+    variants = ("", "_wg", "_wave", "_nbi", "_nbi_wg", "_nbi_wave")
+
+    return "".join(signaling_api_dec(variant) for variant in variants)
+
+
+def write_to_file(filename, content):  # overwrite `filename` with `content` (text mode)
+    with open(filename, 'w') as out:
+        out.write(content)
+
+
+def generate_SIG_OP_header(output_dir, copyright):  # assemble and write <output_dir>/rocshmem_SIG_OP.hpp
+    header_text = copyright
+
+    header_text += """
+#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
+#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
+
+namespace rocshmem {
+"""
+
+    header_text += generate_signal_api()
+
+    header_text += """
+} // namespace rocshmem
+
+#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
+"""
+
+    target_path = os.path.join(
+        output_dir, 'rocshmem_SIG_OP.hpp'
+    )
+
+    write_to_file(target_path, header_text)
diff --git a/projects/rocshmem/utlis/header_files_gen/header_files_gen.py b/projects/rocshmem/utlis/header_files_gen/header_files_gen.py
new file mode 100644
index 0000000000..ca7268298a
--- /dev/null
+++ b/projects/rocshmem/utlis/header_files_gen/header_files_gen.py
@@ -0,0 +1,77 @@
+"""
+******************************************************************************
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************
+ """
+
+import argparse
+from RMA import generate_RMA_header
+from AMO import generate_AMO_header
+from SIG_OP import generate_SIG_OP_header
+from COLL import generate_COLL_header
+from P2P_SYNC import generate_P2P_SYNC_header
+from RMA_X import generate_RMA_X_header
+
+
+copyright = """
+/******************************************************************************
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+"""
+
+
+def main():  # CLI entry point: write every generated header into the chosen directory
+    cli = argparse.ArgumentParser(
+        description='Generate an expanded header files.'
+    )
+    cli.add_argument(
+        'directory', type=str, nargs='?', default='.',
+        help='Directory to write the header files to (default: current directory)'
+    )
+    out_dir = cli.parse_args().directory
+
+    for emit_header in (  # same order as the original call sequence
+        generate_RMA_header, generate_AMO_header,
+        generate_SIG_OP_header, generate_COLL_header,
+        generate_P2P_SYNC_header, generate_RMA_X_header,
+    ):
+        emit_header(out_dir, copyright)
+
+if __name__ == "__main__":
+    main()