Add header files based on sections in OpenSHMEM specifications

* rocshmem_RMA.hpp
* rocshmem_AMO.hpp
* rocshmem_SIG_OP.hpp
* rocshmem_COLL.hpp
* rocshmem_P2P_SYNC.hpp
* rocshmem_RMA_X.hpp


[ROCm/rocshmem commit: 3117a47b8d]
Этот коммит содержится в:
avinashkethineedi
2024-12-05 23:24:10 +00:00
родитель 0c3714843a
Коммит 52088167ae
14 изменённых файлов: 5856 добавлений и 2169 удалений
+6 -1
Просмотреть файл
@@ -190,7 +190,7 @@ set_target_properties(
${PROJECT_NAME}
PROPERTIES
PUBLIC_HEADER
"${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp"
"${CMAKE_BINARY_DIR}/config.h"
)
###############################################################################
@@ -386,6 +386,11 @@ install(
COMPONENT bin
)
install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/
DESTINATION ${INSTALL_INCLUDEDIR}
COMPONENT dev
)
install(
EXPORT
${PROJECT_NAME}Targets
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_broadcast_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_getmem_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_put_signal_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
Разница между файлами не показана из-за своего большого размера Загрузить разницу
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+603
Просмотреть файл
@@ -0,0 +1,603 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
namespace rocshmem {
/**
* @name SHMEM_ALLTOALL
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
* of PEs participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx Context with which to perform this collective.
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks transferred per pair of PEs.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
/**
* @name SHMEM_BROADCAST
* @brief Perform a broadcast between PEs in the active set. The caller
* is blocked until the broadcast completes.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx Context with which to perform this collective.
* @param[in] team The team participating in the collective (team-based
* overloads only).
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Size of the buffer to participate in the broadcast.
* @param[in] pe_root Zero-based ordinal of the PE, with respect to the
* active set, from which the data is copied.
* @param[in] pe_start PE to start the broadcast (active-set overloads
* only).
* @param[in] log_pe_stride Stride of PEs participating in the broadcast
* (active-set overloads only).
* @param[in] pe_size Number of PEs participating in the broadcast
* (active-set overloads only).
* @param[in] p_sync Temporary sync buffer provided to ROCSHMEM. Must
* be of size at least ROCSHMEM_REDUCE_SYNC_SIZE
* (active-set overloads only).
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, float *dest, const float *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, double *dest, const double *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, char *dest, const char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, short *dest, const short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, int *dest, const int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, long *dest, const long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
/**
* @name SHMEM_FCOLLECT
* @brief Concatenates blocks of data from multiple PEs to an array in every
* PE participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx Context with which to perform this collective.
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks in source array.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
/**
* @name SHMEM_REDUCTIONS
* @brief Perform an allreduce between PEs in the active set. The caller
* is blocked until the reduction completes.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx Context with which to perform this collective.
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nreduce Size of the buffer to participate in the reduction.
*
* @return int (Zero on successful local completion. Nonzero otherwise.)
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
+662
Просмотреть файл
@@ -0,0 +1,662 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
namespace rocshmem {
/**
* @name SHMEM_WAIT_UNTIL
* @brief Block the caller until the condition (* \p ivars \p cmp \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ivars to.
*
* @return void
*/
__device__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
/**
 * Point-to-point wait routines for `double`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
/**
 * Point-to-point wait routines for `char`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
/**
 * Point-to-point wait routines for `signed char`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
/**
 * Point-to-point wait routines for `short`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
/**
 * Point-to-point wait routines for `int`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
/**
 * Point-to-point wait routines for `long`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
/**
 * Point-to-point wait routines for `long long`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
/**
 * Point-to-point wait routines for `unsigned char`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
/**
 * Point-to-point wait routines for `unsigned short`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
/**
 * Point-to-point wait routines for `unsigned int`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
/**
 * Point-to-point wait routines for `unsigned long`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
/**
 * Point-to-point wait routines for `unsigned long long`.
 *
 * Seven __device__ declarations (wait_until, _any, _all, _some and the three
 * *_vector forms) followed by seven __host__ declarations with identical
 * parameter lists. The plain wait_until and the _all forms return void; the
 * _any and _some forms return size_t.
 *
 * NOTE(review): the *_vector forms take a scalar \p val, whereas OpenSHMEM 1.5
 * specifies an array of comparison values -- confirm this is intentional.
 */
__device__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
/**
 * @name SHMEM_TEST
 * @brief Test whether the condition (* \p ivars \p cmp \p val) is
 * true.
 *
 * This function can be called from divergent control paths at per-thread
 * granularity. However, performance may be improved if the caller can
 * coalesce contiguous messages and elect a leader thread to call into the
 * ROCSHMEM function.
 *
 * Every supported type is declared twice with the same parameter list: once
 * as a __device__ function and once as a __host__ function.
 *
 * @param[in] ivars Pointer to memory on the symmetric heap to test.
 * @param[in] cmp   Operation for the comparison.
 * @param[in] val   Value to compare the memory at \p ivars to.
 *
 * @return 1 if the evaluation is true else 0
 */
__device__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__host__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__device__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__host__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__device__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__host__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__device__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__host__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__device__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__host__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__device__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__host__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__device__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__host__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__device__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__host__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__device__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__host__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__device__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__host__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__device__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__host__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__device__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__host__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__device__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
Разница между файлами не показана из-за своего большого размера Загрузить разницу
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+623
Просмотреть файл
@@ -0,0 +1,623 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
namespace rocshmem {
/**
 * Device-side put-with-signal on untyped (void *) buffers, declared in a form
 * without a context argument and in a form taking an explicit rocshmem_ctx_t.
 *
 * NOTE(review): the parameter descriptions below are inferred from the
 * parameter names and the OpenSHMEM shmem_put_signal interface; verify them
 * against the implementation.
 *
 * @param[in] ctx      Context handle (ctx form only).
 * @param[in] dest     Destination buffer.
 * @param[in] source   Source buffer.
 * @param[in] nelems   Amount of data to transfer -- presumably a byte count
 *                     for this untyped form; TODO confirm.
 * @param[in] sig_addr Address of the signal word.
 * @param[in] signal   Value used to update the signal word.
 * @param[in] sig_op   Selector for how \p signal is applied.
 * @param[in] pe       Target processing element.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
/**
 * Typed device-side put-with-signal declarations.
 *
 * For each element type (float, double, char, signed char, short, int, long,
 * long long and their unsigned counterparts) there is a form taking an
 * explicit rocshmem_ctx_t followed by a form without a context argument.
 * Apart from the element type of \p dest and \p source, all parameter lists
 * are the same.
 *
 * NOTE(review): the parameter descriptions below are inferred from the
 * parameter names and the OpenSHMEM shmem_put_signal interface; verify them
 * against the implementation.
 *
 * @param[in] ctx      Context handle (ctx forms only).
 * @param[in] dest     Destination buffer.
 * @param[in] source   Source buffer.
 * @param[in] nelems   Number of elements to transfer -- TODO confirm units.
 * @param[in] sig_addr Address of the signal word.
 * @param[in] signal   Value used to update the signal word.
 * @param[in] sig_op   Selector for how \p signal is applied.
 * @param[in] pe       Target processing element.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * `_wg` forms of the untyped device-side put-with-signal, declared without a
 * context argument and with an explicit rocshmem_ctx_t. The parameter lists
 * match the non-`_wg` putmem_signal declarations.
 *
 * NOTE(review): the `_wg` suffix presumably marks a variant that is called
 * collectively by a whole work-group -- confirm against the implementation
 * before relying on it.
 * NOTE(review): \p nelems is presumably a byte count for this untyped form;
 * TODO confirm.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
/**
 * `_wg` forms of the typed device-side put-with-signal declarations.
 *
 * This run covers float, double, char, signed char, short, int, long,
 * long long and unsigned char. For each type there is a form taking an
 * explicit rocshmem_ctx_t followed by a form without a context argument;
 * apart from the element type of \p dest and \p source the parameter lists
 * are the same.
 *
 * NOTE(review): the `_wg` suffix presumably marks a variant that is called
 * collectively by a whole work-group -- confirm against the implementation
 * before relying on it.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @brief `_wave` put-with-signal device entry points.
 *
 * For raw memory (`putmem`) and for each listed C element type, two entry
 * points are declared: one taking an explicit `rocshmem_ctx_t ctx` as its
 * first argument and one taking no context argument. All of them share the
 * remaining parameters:
 *
 * @param dest     Destination pointer.
 * @param source   Source pointer (read-only).
 * @param nelems   Transfer count. NOTE(review): presumably elements for the
 *                 typed variants and bytes for `putmem` - confirm.
 * @param sig_addr Pointer to the 64-bit signal word.
 * @param signal   64-bit signal operand.
 * @param sig_op   Signal operation selector. NOTE(review): presumably one of
 *                 the ROCSHMEM_SIGNAL_OPS values - confirm.
 * @param pe       PE index. NOTE(review): presumably the target PE - confirm.
 *
 * NOTE(review): the `_wave` suffix presumably means the call is made
 * collectively by all threads of a wavefront; verify against the
 * implementation.
 */
// Raw-memory (void *) variants.
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
// Typed variants: explicit-context declaration first, then the one without a context argument.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @brief `_nbi` put-with-signal device entry points.
 *
 * For raw memory (`putmem`) and for each listed C element type, two entry
 * points are declared: one taking an explicit `rocshmem_ctx_t ctx` as its
 * first argument and one taking no context argument. All of them share the
 * remaining parameters:
 *
 * @param dest     Destination pointer.
 * @param source   Source pointer (read-only).
 * @param nelems   Transfer count. NOTE(review): presumably elements for the
 *                 typed variants and bytes for `putmem` - confirm.
 * @param sig_addr Pointer to the 64-bit signal word.
 * @param signal   64-bit signal operand.
 * @param sig_op   Signal operation selector. NOTE(review): presumably one of
 *                 the ROCSHMEM_SIGNAL_OPS values - confirm.
 * @param pe       PE index. NOTE(review): presumably the target PE - confirm.
 *
 * NOTE(review): the `_nbi` suffix presumably marks the non-blocking variants
 * (as in OpenSHMEM `shmem_put_signal_nbi`); verify against the
 * implementation.
 */
// Raw-memory (void *) variants.
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
// Typed variants: explicit-context declaration first, then the one without a context argument.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @brief `_nbi_wg` put-with-signal device entry points.
 *
 * For raw memory (`putmem`) and for each listed C element type, two entry
 * points are declared: one taking an explicit `rocshmem_ctx_t ctx` as its
 * first argument and one taking no context argument. All of them share the
 * remaining parameters:
 *
 * @param dest     Destination pointer.
 * @param source   Source pointer (read-only).
 * @param nelems   Transfer count. NOTE(review): presumably elements for the
 *                 typed variants and bytes for `putmem` - confirm.
 * @param sig_addr Pointer to the 64-bit signal word.
 * @param signal   64-bit signal operand.
 * @param sig_op   Signal operation selector. NOTE(review): presumably one of
 *                 the ROCSHMEM_SIGNAL_OPS values - confirm.
 * @param pe       PE index. NOTE(review): presumably the target PE - confirm.
 *
 * NOTE(review): the `_nbi_wg` suffix presumably combines the non-blocking
 * form with a work-group-collective call; verify against the implementation.
 */
// Raw-memory (void *) variants.
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
// Typed variants: explicit-context declaration first, then the one without a context argument.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @brief `_nbi_wave` put-with-signal device entry points.
 *
 * For raw memory (`putmem`) and for each listed C element type, two entry
 * points are declared: one taking an explicit `rocshmem_ctx_t ctx` as its
 * first argument and one taking no context argument. All of them share the
 * remaining parameters:
 *
 * @param dest     Destination pointer.
 * @param source   Source pointer (read-only).
 * @param nelems   Transfer count. NOTE(review): presumably elements for the
 *                 typed variants and bytes for `putmem` - confirm.
 * @param sig_addr Pointer to the 64-bit signal word.
 * @param signal   64-bit signal operand.
 * @param sig_op   Signal operation selector. NOTE(review): presumably one of
 *                 the ROCSHMEM_SIGNAL_OPS values - confirm.
 * @param pe       PE index. NOTE(review): presumably the target PE - confirm.
 *
 * NOTE(review): the `_nbi_wave` suffix presumably combines the non-blocking
 * form with a wavefront-collective call; verify against the implementation.
 */
// Raw-memory (void *) variants.
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
// Typed variants: explicit-context declaration first, then the one without a context argument.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @brief Device-side signal-fetch entry points.
 *
 * Each takes a read-only pointer to a 64-bit signal word and returns a
 * 64-bit value.
 *
 * @param sig_addr Pointer to the signal word (not modified through this
 *                 pointer).
 * @return 64-bit value. NOTE(review): presumably the current contents of
 *         `*sig_addr`, as in OpenSHMEM `shmem_signal_fetch` - confirm.
 *
 * NOTE(review): the `_wg` and `_wave` variants are presumably the
 * work-group- and wavefront-collective forms; verify against the
 * implementation.
 */
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
+130
Просмотреть файл
@@ -0,0 +1,130 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP

#include <cstddef>
#include <cstdint>

namespace rocshmem {

/*
 * ATTR_NO_INLINE expands to a noinline attribute only when the build defines
 * USE_FUNC_CALL; otherwise it expands to nothing. Being a macro, it is not
 * scoped by the enclosing namespace.
 */
#ifdef USE_FUNC_CALL
#define ATTR_NO_INLINE __attribute__((noinline))
#else
#define ATTR_NO_INLINE
#endif

/**
 * @brief Generic success/failure status codes.
 */
enum ROCSHMEM_STATUS {
  ROCSHMEM_SUCCESS = 0,
  ROCSHMEM_ERROR = 1,
};

/**
 * @brief Operation selectors (sum, max, min, product, bitwise and/or/xor,
 * replace). Values are implicit and sequential starting at 0.
 *
 * NOTE(review): presumably consumed by the reduction/atomic entry points -
 * confirm against the callers.
 */
enum ROCSHMEM_OP {
  ROCSHMEM_SUM,
  ROCSHMEM_MAX,
  ROCSHMEM_MIN,
  ROCSHMEM_PROD,
  ROCSHMEM_AND,
  ROCSHMEM_OR,
  ROCSHMEM_XOR,
  ROCSHMEM_REPLACE
};

/**
 * @brief Signal operation selectors (set / add).
 *
 * NOTE(review): presumably the valid values of the `sig_op` argument of the
 * put-with-signal API - confirm.
 */
enum ROCSHMEM_SIGNAL_OPS {
  ROCSHMEM_SIGNAL_SET,
  ROCSHMEM_SIGNAL_ADD,
};

/**
 * @brief Types defined for rocshmem_wait() operations.
 */
enum rocshmem_cmps {
  ROCSHMEM_CMP_EQ,
  ROCSHMEM_CMP_NE,
  ROCSHMEM_CMP_GT,
  ROCSHMEM_CMP_GE,
  ROCSHMEM_CMP_LT,
  ROCSHMEM_CMP_LE,
};

/**
 * @brief Threading-mode selectors.
 *
 * NOTE(review): presumably the thread support levels accepted at library
 * initialization - confirm.
 */
enum rocshmem_thread_ops {
  ROCSHMEM_THREAD_SINGLE,
  ROCSHMEM_THREAD_FUNNELED,
  ROCSHMEM_THREAD_WG_FUNNELED,
  ROCSHMEM_THREAD_SERIALIZED,
  ROCSHMEM_THREAD_MULTIPLE
};

/**
 * @brief Bitwise flags to mask configuration parameters.
 *
 * The values are written out explicitly (and are numerically unchanged) so
 * that every flag keeps its own bit if further flags are added; any new flag
 * must use the next free bit (1 << 1, 1 << 2, ...).
 */
enum rocshmem_team_configs {
  ROCSHMEM_TEAM_DEFAULT_CONFIGS = 0,
  ROCSHMEM_TEAM_NUM_CONTEXTS = 1 << 0
};

/**
 * @brief Team configuration parameters.
 */
typedef struct {
  int num_contexts;
} rocshmem_team_config_t;

/*
 * Sizes of work-data and synchronization arrays.
 * NOTE(review): units (bytes vs. elements) are not visible here - confirm
 * against the collectives that consume them.
 */
constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024;
constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024);
constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256;
constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256;
// Internally calls sync function, which matches barrier implementation
constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE;
constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1;
constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE;
constexpr size_t ROCSHMEM_SYNC_VALUE = 0;

/*
 * Context option values. The non-zero options are distinct single bits;
 * ROCSHMEM_CTX_ZERO is the empty option set.
 * NOTE(review): presumably OR-ed together when a context is created -
 * confirm.
 */
constexpr int ROCSHMEM_CTX_ZERO = 0;
constexpr int ROCSHMEM_CTX_NOSTORE = 1;
constexpr int ROCSHMEM_CTX_SERIALIZED = 2;
constexpr int ROCSHMEM_CTX_WG_PRIVATE = 4;
constexpr int ROCSHMEM_CTX_SHARED = 8;

/**
 * @brief GPU side OpenSHMEM context created from each work-group's
 * rocshmem_wg_handle_t
 */
typedef struct {
  void *ctx_opaque;
  void *team_opaque;
} rocshmem_ctx_t;

/**
 * Shmem default context.
 */
extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT;

/**
 * Used internally to set default context.
 */
void set_internal_ctx(rocshmem_ctx_t *ctx);

/**
 * @brief Team handle, represented as a pointer to uint64_t.
 */
typedef uint64_t *rocshmem_team_t;

/**
 * Team handle declared here and defined elsewhere.
 * NOTE(review): presumably the team containing all PEs - confirm.
 */
extern rocshmem_team_t ROCSHMEM_TEAM_WORLD;

/**
 * Null team handle used to denote an invalid team.
 */
const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr;

}  // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP