Merge pull request #60 from avinashkethineedi/rocshmem_header_files

rocshmem header files

[ROCm/rocshmem commit: 1ae7b9812a]
Этот коммит содержится в:
Avinash Kethineedi
2024-12-06 11:19:08 -06:00
коммит произвёл GitHub
родитель 0c3714843a 29b0518afc
Коммит dde3fb5bf7
59 изменённых файлов: 7795 добавлений и 2209 удалений
+8 -3
Просмотреть файл
@@ -74,7 +74,7 @@ option(BUILD_UNIT_TESTS "Build the unit tests" ON)
set(ROCM_PATH "" CACHE PATH "ROCm path to use")
configure_file(cmake/config.h.in config.h)
configure_file(cmake/rocshmem_config.h.in rocshmem_config.h)
###############################################################################
# Validate user passed options
@@ -179,7 +179,7 @@ target_include_directories(
${PROJECT_NAME}
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}> # CONFIG.H
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}> # rocshmem_config.h
$<INSTALL_INTERFACE:include>
)
@@ -190,7 +190,7 @@ set_target_properties(
${PROJECT_NAME}
PROPERTIES
PUBLIC_HEADER
"${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp"
"${CMAKE_BINARY_DIR}/rocshmem_config.h"
)
###############################################################################
@@ -386,6 +386,11 @@ install(
COMPONENT bin
)
install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/
DESTINATION ${INSTALL_INCLUDEDIR}
COMPONENT dev
)
install(
EXPORT
${PROJECT_NAME}Targets
Просмотреть файл
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_broadcast_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_getmem_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
-1
Просмотреть файл
@@ -1,7 +1,6 @@
/*
hipcc -c -fgpu-rdc -x hip rocshmem_put_signal_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_SRC_DIR/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
Разница между файлами не показана из-за своего большого размера Загрузить разницу
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+603
Просмотреть файл
@@ -0,0 +1,603 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
namespace rocshmem {
/**
* @name SHMEM_ALLTOALL
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
* of PEs participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nelems Number of data blocks transferred per pair of PEs.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
/**
* @name SHMEM_BROADCAST
* @brief Perform a broadcast between PEs in the active set. The caller
* is blocked until the broadcase completes.
*
* This function must be called as a work-group collective.
*
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nelement Size of the buffer to participate in the broadcast.
* @param[in] PE_root Zero-based ordinal of the PE, with respect to the
active set, from which the data is copied
* @param[in] PE_start PE to start the reduction.
* @param[in] logPE_stride Stride of PEs participating in the reduction.
* @param[in] PE_size Number PEs participating in the reduction.
* @param[in] pSync Temporary sync buffer provided to ROCSHMEM. Must
be of size at least ROCSHMEM_REDUCE_SYNC_SIZE.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, float *dest, const float *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, double *dest, const double *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, char *dest, const char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, short *dest, const short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, int *dest, const int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, long *dest, const long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
/**
* @name SHMEM_FCOLLECT
* @brief Concatenates blocks of data from multiple PEs to an array in every
* PE participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nelems Number of data blocks in source array.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
/**
* @name SHMEM_REDUCTIONS
* @brief Perform an allreduce between PEs in the active set. The caller
* is blocked until the reduction completes.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nreduce Size of the buffer to participate in the reduction.
*
* @return int (Zero on successful local completion. Nonzero otherwise.)
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
+662
Просмотреть файл
@@ -0,0 +1,662 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
namespace rocshmem {
/**
* @name SHMEM_WAIT_UNTIL
* @brief Block the caller until the condition (* \p ptr \p cmps \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ptr to.
*
* @return void
*/
__device__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__device__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__device__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__device__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__device__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__device__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__device__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__device__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__device__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
/**
* @name SHMEM_TEST
* @brief test if the condition (* \p ptr \p cmps \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ptr to.
*
* @return 1 if the evaluation is true else 0
*/
__device__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__host__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__device__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__host__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__device__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__host__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__device__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__host__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__device__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__host__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__device__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__host__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__device__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__host__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__device__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__host__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__device__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__host__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__device__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__host__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__device__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__host__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__device__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__host__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__device__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
Разница между файлами не показана из-за своего большого размера Загрузить разницу
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+623
Просмотреть файл
@@ -0,0 +1,623 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
namespace rocshmem {
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
+130
Просмотреть файл
@@ -0,0 +1,130 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
namespace rocshmem {
#ifdef USE_FUNC_CALL
#define ATTR_NO_INLINE __attribute__((noinline))
#else
#define ATTR_NO_INLINE
#endif
enum ROCSHMEM_STATUS {
ROCSHMEM_SUCCESS = 0,
ROCSHMEM_ERROR = 1,
};
enum ROCSHMEM_OP {
ROCSHMEM_SUM,
ROCSHMEM_MAX,
ROCSHMEM_MIN,
ROCSHMEM_PROD,
ROCSHMEM_AND,
ROCSHMEM_OR,
ROCSHMEM_XOR,
ROCSHMEM_REPLACE
};
enum ROCSHMEM_SIGNAL_OPS {
ROCSHMEM_SIGNAL_SET,
ROCSHMEM_SIGNAL_ADD,
};
/**
* @brief Types defined for rocshmem_wait() operations.
*/
enum rocshmem_cmps {
ROCSHMEM_CMP_EQ,
ROCSHMEM_CMP_NE,
ROCSHMEM_CMP_GT,
ROCSHMEM_CMP_GE,
ROCSHMEM_CMP_LT,
ROCSHMEM_CMP_LE,
};
enum rocshmem_thread_ops {
ROCSHMEM_THREAD_SINGLE,
ROCSHMEM_THREAD_FUNNELED,
ROCSHMEM_THREAD_WG_FUNNELED,
ROCSHMEM_THREAD_SERIALIZED,
ROCSHMEM_THREAD_MULTIPLE
};
/**
* @brief Bitwise flags to mask configuration parameters.
*/
enum rocshmem_team_configs {
ROCSHMEM_TEAM_DEFAULT_CONFIGS,
ROCSHMEM_TEAM_NUM_CONTEXTS
};
typedef struct {
int num_contexts;
} rocshmem_team_config_t;
constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024;
constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024);
constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256;
constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256;
// Internally calls sync function, which matches barrier implementation
constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE;
constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1;
constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE;
constexpr size_t ROCSHMEM_SYNC_VALUE = 0;
const int ROCSHMEM_CTX_ZERO = 0;
const int ROCSHMEM_CTX_NOSTORE = 1;
const int ROCSHMEM_CTX_SERIALIZED = 2;
const int ROCSHMEM_CTX_WG_PRIVATE = 4;
const int ROCSHMEM_CTX_SHARED = 8;
/**
* @brief GPU side OpenSHMEM context created from each work-groups'
* rocshmem_wg_handle_t
*/
typedef struct {
void *ctx_opaque;
void *team_opaque;
} rocshmem_ctx_t;
/**
* Shmem default context.
*/
extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT;
/**
* Used internally to set default context.
*/
void set_internal_ctx(rocshmem_ctx_t *ctx);
typedef uint64_t *rocshmem_team_t;
extern rocshmem_team_t ROCSHMEM_TEAM_WORLD;
const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr;
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
+1 -1
Просмотреть файл
@@ -35,7 +35,7 @@
#include <vector>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "backend_type.hpp"
#include "ipc_policy.hpp"
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@
* functions are not supported at this time.
*/
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -20,7 +20,7 @@
* IN THE SOFTWARE.
*****************************************************************************/
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_bc.hpp"
#include "context_incl.hpp"
#include "util.hpp"
+1 -1
Просмотреть файл
@@ -20,7 +20,7 @@
* IN THE SOFTWARE.
*****************************************************************************/
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_bc.hpp"
#include "context_incl.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_
#define LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_type.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/context_ib_device.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_CONTEXT_TMPL_HOST_HPP_
#define LIBRARY_SRC_CONTEXT_TMPL_HOST_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_type.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/context_ib_host.hpp"
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <infiniband/mlx5dv.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "dynamic_connection.hpp"
#include "queue_pair.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
#define LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "infiniband_structs.hpp"
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <hip/hip_runtime.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "../backend_type.hpp"
#include "../context_incl.hpp"
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "backend_ib.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "context_ib_device.hpp"
#include "gpu_ib_team.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../host/host_templates.hpp"
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../atomic_return.hpp"
#include "../context_incl.hpp"
#include "backend_ib.hpp"
+1 -1
Просмотреть файл
@@ -26,7 +26,7 @@
#include <hip/hip_runtime.h>
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "connection_policy.hpp"
#include "queue_pair.hpp"
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <hip/hip_runtime.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_ib.hpp"
#include "endian.hpp"
#include "segment_builder.hpp"
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@
#include <infiniband/mlx5dv.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../atomic_return.hpp"
#include "connection_policy.hpp"
#include "thread_policy.hpp"
+1 -1
Просмотреть файл
@@ -22,7 +22,7 @@
#include "thread_policy.hpp"
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "queue_pair.hpp"
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
#define LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../util.hpp"
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -26,7 +26,7 @@
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "memory/hip_allocator.hpp"
#include "util.hpp"
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "host_helpers.hpp"
#include "../memory/window_info.hpp"
#include "../util.hpp"
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <utility>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "host_helpers.hpp"
#include "../memory/window_info.hpp"
#include "../team.hpp"
+1 -1
Просмотреть файл
@@ -30,7 +30,7 @@
#include <cstdio>
#include <cstdlib>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "backend_ipc.hpp"
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "backend_ipc.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_
#define LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "context_ipc_device.hpp"
#include "../util.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_
#define LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../host/host_templates.hpp"
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_bc.hpp"
#include "context_incl.hpp"
#include "util.hpp"
+1 -1
Просмотреть файл
@@ -29,7 +29,7 @@
#include <atomic>
#include <vector>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "memory/hip_allocator.hpp"
#include "util.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_MEMORY_HEAP_TYPE_HPP_
#define LIBRARY_SRC_MEMORY_HEAP_TYPE_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "hip_allocator.hpp"
/**
+1 -1
Просмотреть файл
@@ -29,7 +29,7 @@
#include <cstdio>
#include <cstdlib>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "../backend_type.hpp"
#include "../hdp_policy.hpp"
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "../host/host.hpp"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_GPU_TEMPLATES_HPP_
#define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_GPU_TEMPLATES_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "commands_types.hpp"
#include "context_ro_device.hpp"
#include "queue_proxy.hpp"
+1 -1
Просмотреть файл
@@ -22,7 +22,7 @@
#ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_
#define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../host/host_templates.hpp"
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -26,7 +26,7 @@
#include <array>
#include <cassert>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../device_proxy.hpp"
#include "../memory/../memory/hip_allocator.hpp"
#include "../stats.hpp"
+1 -1
Просмотреть файл
@@ -41,7 +41,7 @@
#include <cstdlib>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "backend_bc.hpp"
#include "context_incl.hpp"
+1 -1
Просмотреть файл
@@ -26,7 +26,7 @@
#include <cstdlib>
#include <vector>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -30,7 +30,7 @@
#include <cstdio>
#include "assembly.hpp"
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "constants.hpp"
namespace rocshmem {
+1 -1
Просмотреть файл
@@ -25,7 +25,7 @@
#include <hip/hip_runtime.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "util.hpp"
namespace rocshmem {
+639
Просмотреть файл
@@ -0,0 +1,639 @@
"""
******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************
"""
import os
types = [
("int", "int"),
("long", "long"),
("long long", "longlong"),
("unsigned int", "uint"),
("unsigned long", "ulong"),
("unsigned long long", "ulonglong"),
("int32_t", "int32"),
("int64_t", "int64"),
("uint32_t", "uint32"),
("uint64_t", "uint64"),
("size_t", "size"),
("ptrdiff_t", "ptrdiff"),
]
float_types = [
("float", "float"),
("double", "double"),
]
bitwise_types = types[3:10]
def atomic_fetch_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch(\n"
f" rocshmem_ctx_t ctx, {T} *source, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch(\n"
f" {T} *source, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch(\n"
f" rocshmem_ctx_t ctx, {T} *source, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch(\n"
f" {T} *source, int pe);\n\n"
)
def generate_atomic_fetch_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_FETCH
* @brief Atomically return the value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return The value of \p dest.
*/\n"""
for type_, tname_ in float_types:
expanded_code += atomic_fetch_api(type_, tname_)
for type_, tname_ in types:
expanded_code += atomic_fetch_api(type_, tname_)
return expanded_code
def atomic_set_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_set(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_set(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_atomic_set(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_{TNAME}_atomic_set(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_set_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_SET
* @brief Atomically set the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return void
*/\n"""
for type_, tname_ in float_types:
expanded_code += atomic_set_api(type_, tname_)
for type_, tname_ in types:
expanded_code += atomic_set_api(type_, tname_)
return expanded_code
def atomic_compare_swap_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_compare_swap(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} cond, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_compare_swap(\n"
f" {T} *dest, {T} cond, {T} value, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_compare_swap(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} cond, {T} value, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_compare_swap(\n"
f" {T} *dest, {T} cond, {T} value, int pe);\n\n"
)
def generate_atomic_compare_swap_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_COMPARE_SWAP
* @brief Atomically compares if the value in \p dest with \p cond is equal
* then put \p val in \p dest. The operation returns the older value of \p dest
* to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] cond The value to be compare with.
* @param[in] val The value to be atomically swapped.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest.
*/\n"""
for type_, tname_ in types:
expanded_code += atomic_compare_swap_api(type_, tname_)
return expanded_code
def atomic_swap_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_swap(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_swap(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_swap(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_swap(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_swap_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_SWAP
* @brief Atomically swap the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return original value
*/\n"""
for type_, tname_ in float_types:
expanded_code += atomic_swap_api(type_, tname_)
for type_, tname_ in types:
expanded_code += atomic_swap_api(type_, tname_)
return expanded_code
def atomic_fetch_inc_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_inc(\n"
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_inc(\n"
f" {T} *dest, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_inc(\n"
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_inc(\n"
f" {T} *dest, int pe);\n\n"
)
def generate_atomic_fetch_inc_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_FETCH_INC
* @brief Atomically add 1 to \p dest on \p pe. The operation
* returns the older value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest before it was incremented by 1.
*/\n"""
for type_, tname_ in types:
expanded_code += atomic_fetch_inc_api(type_, tname_)
return expanded_code
def atomic_inc_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_inc(\n"
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_inc(\n"
f" {T} *dest, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_atomic_inc(\n"
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
f"__host__ void rocshmem_{TNAME}_atomic_inc(\n"
f" {T} *dest, int pe);\n\n"
)
def generate_atomic_inc_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_INC
* @brief Atomically add 1 to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] pe PE of the remote process.
*
* @return void
*/\n"""
for type_, tname_ in types:
expanded_code += atomic_inc_api(type_, tname_)
return expanded_code
def atomic_fetch_add_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_add(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_add(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_add(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_add(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_fetch_add_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_FETCH_ADD
* @brief Atomically add the value \p val to \p dest on \p pe. The operation
* returns the older value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest before the \p val was added.
*/\n"""
for type_, tname_ in types:
expanded_code += atomic_fetch_add_api(type_, tname_)
return expanded_code
def atomic_add_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_add(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_add(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_atomic_add(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_{TNAME}_atomic_add(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_add_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_ADD
* @brief Atomically add the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return void
*/\n"""
for type_, tname_ in types:
expanded_code += atomic_add_api(type_, tname_)
return expanded_code
def atomic_fetch_and_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_and(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_and(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_and(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_and(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_fetch_and_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_FETCH_AND
* @brief Atomically bitwise-and the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return original value
*/\n"""
for type_, tname_ in bitwise_types:
expanded_code += atomic_fetch_and_api(type_, tname_)
return expanded_code
def atomic_and_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_and(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_and(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_atomic_and(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_{TNAME}_atomic_and(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_and_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_AND
* @brief Atomically bitwise-and the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return void
*/\n"""
for type_, tname_ in bitwise_types:
expanded_code += atomic_and_api(type_, tname_)
return expanded_code
def atomic_fetch_or_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_or(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_or(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_or(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_or(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_fetch_or_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_FETCH_OR
* @brief Atomically bitwise-or the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return original value
*/\n"""
for type_, tname_ in bitwise_types:
expanded_code += atomic_fetch_or_api(type_, tname_)
return expanded_code
def atomic_or_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_or(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_or(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_atomic_or(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_{TNAME}_atomic_or(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_or_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_OR
* @brief Atomically bitwise-or the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return void
*/\n"""
for type_, tname_ in bitwise_types:
expanded_code += atomic_or_api(type_, tname_)
return expanded_code
def atomic_fetch_xor_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_xor(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_xor(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_xor(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_xor(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_fetch_xor_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_FETCH_XOR
* @brief Atomically bitwise-xor the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return original value
*/\n"""
for type_, tname_ in bitwise_types:
expanded_code += atomic_fetch_xor_api(type_, tname_)
return expanded_code
def atomic_xor_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_xor(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_xor(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_atomic_xor(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_{TNAME}_atomic_xor(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_atomic_xor_api():
expanded_code = """
/**
* @name SHMEM_ATOMIC_XOR
* @brief Atomically bitwise-xor the value \p val to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] val The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return void
*/\n"""
for type_, tname_ in bitwise_types:
expanded_code += atomic_xor_api(type_, tname_)
return expanded_code
def write_to_file(filename, content):
with open(filename, 'w') as file:
file.write(content)
def generate_AMO_header(output_dir, copyright):
expanded_code = copyright
expanded_code += """
#ifndef LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
namespace rocshmem {
"""
expanded_code += (
generate_atomic_fetch_api() +
generate_atomic_set_api() +
generate_atomic_compare_swap_api() +
generate_atomic_swap_api() +
generate_atomic_fetch_inc_api() +
generate_atomic_inc_api() +
generate_atomic_fetch_add_api() +
generate_atomic_add_api() +
generate_atomic_fetch_and_api() +
generate_atomic_and_api() +
generate_atomic_fetch_or_api() +
generate_atomic_or_api() +
generate_atomic_fetch_xor_api() +
generate_atomic_xor_api()
)
expanded_code += """
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
"""
output_file = os.path.join(
output_dir, 'rocshmem_AMO.hpp'
)
write_to_file(output_file, expanded_code)
+246
Просмотреть файл
@@ -0,0 +1,246 @@
"""
******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************
"""
import os
types = [
("float", "float"),
("double", "double"),
("char", "char"),
("signed char", "schar"),
("short", "short"),
("int", "int"),
("long", "long"),
("long long", "longlong"),
("unsigned char", "uchar"),
("unsigned short", "ushort"),
("unsigned int", "uint"),
("unsigned long", "ulong"),
("unsigned long long", "ulonglong"),
]
def alltoall_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
f" const {T} *source, int nelems);\n\n"
)
def generate_alltoall_api():
expanded_code = """
/**
* @name SHMEM_ALLTOALL
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
* of PEs participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nelems Number of data blocks transferred per pair of PEs.
*
* @return void
*/\n"""
for type_, tname_ in types:
expanded_code += alltoall_api(type_, tname_)
return expanded_code
def broadcast_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
f" const {T} *source, int nelems, int pe_root);\n"
f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" int nelems, int pe_root, int pe_start, int log_pe_stride,\n"
f" int pe_size, long *p_sync);\n"
f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
f" const {T} *source, int nelems, int pe_root);\n\n"
)
def generate_broadcast_api():
expanded_code = """
/**
* @name SHMEM_BROADCAST
* @brief Perform a broadcast between PEs in the active set. The caller
* is blocked until the broadcase completes.
*
* This function must be called as a work-group collective.
*
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nelement Size of the buffer to participate in the broadcast.
* @param[in] PE_root Zero-based ordinal of the PE, with respect to the
active set, from which the data is copied
* @param[in] PE_start PE to start the reduction.
* @param[in] logPE_stride Stride of PEs participating in the reduction.
* @param[in] PE_size Number PEs participating in the reduction.
* @param[in] pSync Temporary sync buffer provided to ROCSHMEM. Must
be of size at least ROCSHMEM_REDUCE_SYNC_SIZE.
*
* @return void
*/\n"""
for type_, tname_ in types:
expanded_code += broadcast_api(type_, tname_)
return expanded_code
def fcollect_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
f" const {T} *source, int nelems);\n\n"
)
def generate_fcollect_api():
expanded_code = """
/**
* @name SHMEM_FCOLLECT
* @brief Concatenates blocks of data from multiple PEs to an array in every
* PE participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nelems Number of data blocks in source array.
*
* @return void
*/\n"""
for type_, tname_ in types:
expanded_code += fcollect_api(type_, tname_)
return expanded_code
def reduction_api(T, TNAME, Op_API):
return (
f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
f" int nreduce);\n"
f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
f" int nreduce);\n\n"
)
def arith_reduction_api(T, TNAME):
operations = ["sum", "min", "max", "prod"]
return "".join([reduction_api(T, TNAME, op) for op in operations])
def bitwise_reduction_api(T, TNAME):
operations = ["or", "and", "xor"]
return "".join([reduction_api(T, TNAME, op) for op in operations])
def generate_reduction_api():
expanded_code = """
/**
* @name SHMEM_REDUCTIONS
* @brief Perform an allreduce between PEs in the active set. The caller
* is blocked until the reduction completes.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nreduce Size of the buffer to participate in the reduction.
*
* @return int (Zero on successful local completion. Nonzero otherwise.)
*/\n"""
int_types = [
("short", "short"),
("int", "int"),
("long", "long"),
("long long", "longlong")
]
float_types = [
("float", "float"),
("double", "double")
]
for type_, tname_ in int_types:
expanded_code += arith_reduction_api(type_, tname_)
expanded_code += bitwise_reduction_api(type_, tname_)
for type_, tname_ in float_types:
expanded_code += arith_reduction_api(type_, tname_)
return expanded_code
def write_to_file(filename, content):
with open(filename, 'w') as file:
file.write(content)
def generate_COLL_header(output_dir, copyright):
expanded_code = copyright
expanded_code += """
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
namespace rocshmem {
"""
expanded_code += (
generate_alltoall_api() +
generate_broadcast_api() +
generate_fcollect_api() +
generate_reduction_api()
)
expanded_code += """
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
"""
output_file = os.path.join(
output_dir, 'rocshmem_COLL.hpp'
)
write_to_file(output_file, expanded_code)
+176
Просмотреть файл
@@ -0,0 +1,176 @@
"""
******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************
"""
import os
types = [
("float", "float"),
("double", "double"),
("char", "char"),
("signed char", "schar"),
("short", "short"),
("int", "int"),
("long", "long"),
("long long", "longlong"),
("unsigned char", "uchar"),
("unsigned short", "ushort"),
("unsigned int", "uint"),
("unsigned long", "ulong"),
("unsigned long long", "ulonglong"),
]
def wait_until_api(T, TNAME):
return (
f"__device__ void rocshmem_{TNAME}_wait_until(\n"
f" {T} *ivars, int cmp, {T} val);\n"
f"__device__ size_t rocshmem_{TNAME}_wait_until_any(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__device__ void rocshmem_{TNAME}_wait_until_all(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__device__ size_t rocshmem_{TNAME}_wait_until_some(\n"
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
f" int cmp, {T} val);\n"
f"__device__ size_t rocshmem_{TNAME}_wait_until_any_vector(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__device__ void rocshmem_{TNAME}_wait_until_all_vector(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__device__ size_t rocshmem_{TNAME}_wait_until_some_vector(\n"
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
f" int cmp, {T} val);\n"
f"__host__ void rocshmem_{TNAME}_wait_until(\n"
f" {T} *ivars, int cmp, {T} val);\n"
f"__host__ size_t rocshmem_{TNAME}_wait_until_any(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__host__ void rocshmem_{TNAME}_wait_until_all(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__host__ size_t rocshmem_{TNAME}_wait_until_some(\n"
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
f" int cmp, {T} val);\n"
f"__host__ size_t rocshmem_{TNAME}_wait_until_any_vector(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__host__ void rocshmem_{TNAME}_wait_until_all_vector(\n"
f" {T} *ivars, size_t nelems, const int* status,\n"
f" int cmp, {T} val);\n"
f"__host__ size_t rocshmem_{TNAME}_wait_until_some_vector(\n"
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
f" int cmp, {T} val);\n\n"
)
def generate_wait_until_api():
expanded_code = """
/**
* @name SHMEM_WAIT_UNTIL
* @brief Block the caller until the condition (* \p ptr \p cmps \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ptr to.
*
* @return void
*/\n"""
for type_, tname_ in types:
expanded_code += wait_until_api(type_, tname_)
return expanded_code
def test_api(T, TNAME):
return (
f"__device__ int rocshmem_{TNAME}_test(\n"
f" {T} *ivars, int cmp, {T} val);\n"
f"__host__ int rocshmem_{TNAME}_test(\n"
f" {T} *ivars, int cmp, {T} val);\n\n"
)
def generate_test_api():
expanded_code = """
/**
* @name SHMEM_TEST
* @brief test if the condition (* \p ptr \p cmps \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ptr to.
*
* @return 1 if the evaluation is true else 0
*/\n"""
for type_, tname_ in types:
expanded_code += test_api(type_, tname_)
return expanded_code
def write_to_file(filename, content):
with open(filename, 'w') as file:
file.write(content)
def generate_P2P_SYNC_header(output_dir, copyright):
expanded_code = copyright
expanded_code += """
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
namespace rocshmem {
"""
expanded_code += (
generate_wait_until_api() +
generate_test_api()
)
expanded_code += """
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
"""
output_file = os.path.join(
output_dir, 'rocshmem_P2P_SYNC.hpp'
)
write_to_file(output_file, expanded_code)
+335
Просмотреть файл
@@ -0,0 +1,335 @@
"""
******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************
"""
import os
types = [
("float", "float"),
("double", "double"),
("char", "char"),
("signed char", "schar"),
("short", "short"),
("int", "int"),
("long", "long"),
("long long", "longlong"),
("unsigned char", "uchar"),
("unsigned short", "ushort"),
("unsigned int", "uint"),
("unsigned long", "ulong"),
("unsigned long long", "ulonglong"),
]
def put_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_put(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__host__ void rocshmem_{TNAME}_put({T} *dest,\n"
f" const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_put_api():
expanded_code = """
/**
* @name SHMEM_PUT
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += put_api(type_, tname_)
return expanded_code
def get_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_get(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__host__ void rocshmem_{TNAME}_get({T} *dest,\n"
f" const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_get_api():
expanded_code = """
/**
* @name SHMEM_GET
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += get_api(type_, tname_)
return expanded_code
def p_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_p(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value,\n"
f" int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_p(\n"
f" {T} *dest, {T} value, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_p(\n"
f" rocshmem_ctx_t ctx, {T} *dest, {T} value,\n"
f" int pe);\n"
f"__host__ void rocshmem_{TNAME}_p(\n"
f" {T} *dest, {T} value, int pe);\n\n"
)
def generate_p_api():
expanded_code = """
/**
* @name SHMEM_P
* @brief Writes a single value to \p dest at \p pe PE to \p dst at \p pe.
* The caller must call into rocshmem_quiet() if remote completion is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value Value to write to dest at \p pe.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += p_api(type_, tname_)
return expanded_code
def g_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_g(\n"
f" rocshmem_ctx_t ctx, const {T} *source, int pe);\n"
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_g(\n"
f" const {T} *source, int pe);\n"
f"__host__ {T} rocshmem_ctx_{TNAME}_g(\n"
f" rocshmem_ctx_t ctx, const {T} *source, int pe);\n"
f"__host__ {T} rocshmem_{TNAME}_g(\n"
f" const {T} *source, int pe);\n\n"
)
def generate_g_api():
expanded_code = """
/**
* @name SHMEM_G
* @brief reads and returns single value from \p source at \p pe.
* The calling work-group/thread will block until the operation completes.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return the value read from remote \p source at \p pe.
*/\n"""
for type_, tname_ in types:
expanded_code += g_api(type_, tname_)
return expanded_code
def put_nbi_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_nbi(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_nbi(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_put_nbi(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__host__ void rocshmem_{TNAME}_put_nbi(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_put_nbi_api():
expanded_code = """
/**
* @name SHMEM_PUT_NBI
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += put_nbi_api(type_, tname_)
return expanded_code
def get_nbi_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_nbi(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_nbi(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
f"__host__ void rocshmem_ctx_{TNAME}_get_nbi(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__host__ void rocshmem_{TNAME}_get_nbi({T} *dest,\n"
f" const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_get_nbi_api():
expanded_code = """
/**
* @name SHMEM_GET_NBI
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller will
* return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += get_nbi_api(type_, tname_)
return expanded_code
def write_to_file(filename, content):
with open(filename, 'w') as file:
file.write(content)
def generate_RMA_header(output_dir, copyright):
expanded_code = copyright
expanded_code += """
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
namespace rocshmem {
"""
expanded_code += (
generate_put_api() +
generate_p_api() +
generate_get_api() +
generate_g_api() +
generate_put_nbi_api() +
generate_get_nbi_api()
)
expanded_code += """
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
"""
output_file = os.path.join(
output_dir, 'rocshmem_RMA.hpp'
)
write_to_file(output_file, expanded_code)
+318
Просмотреть файл
@@ -0,0 +1,318 @@
"""
******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************
"""
import os
types = [
("float", "float"),
("double", "double"),
("char", "char"),
("signed char", "schar"),
("short", "short"),
("int", "int"),
("long", "long"),
("long long", "longlong"),
("unsigned char", "uchar"),
("unsigned short", "ushort"),
("unsigned int", "uint"),
("unsigned long", "ulong"),
("unsigned long long", "ulonglong"),
]
def put_api_x(GRAN, T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_{GRAN}(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_{GRAN}(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_put_api_x():
expanded_code = """
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in a wave must collectively participate
* in the call using the same arguments
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += put_api_x("wave", type_, tname_)
expanded_code += """
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-workgroup
* (WG) granularity. However, All threads in a WG must collectively participate
* in the call using the same arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += put_api_x("wg", type_, tname_)
return expanded_code
def get_api_x(GRAN, T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_{GRAN}(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_{GRAN}(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_get_api_x():
expanded_code = """
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must participate in the
* call using the same parameters
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += get_api_x("wave", type_, tname_)
expanded_code += """
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the workgroup must participate in
* the call using the same parameters
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += get_api_x("wg", type_, tname_)
return expanded_code
def put_nbi_api_x(GRAN, T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_nbi_{GRAN}(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_nbi_{GRAN}(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_put_nbi_api_x():
expanded_code = """
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += put_nbi_api_x("wave", type_, tname_)
expanded_code += """
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the sameo
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += put_nbi_api_x("wg", type_, tname_)
return expanded_code
def get_nbi_api_x(GRAN, T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_nbi_{GRAN}(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
f" size_t nelems, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_nbi_{GRAN}(\n"
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
)
def generate_get_nbi_api_x():
expanded_code = """
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += get_nbi_api_x("wave", type_, tname_)
expanded_code += """
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/\n"""
for type_, tname_ in types:
expanded_code += get_nbi_api_x("wg", type_, tname_)
return expanded_code
def write_to_file(filename, content):
with open(filename, 'w') as file:
file.write(content)
def generate_RMA_X_header(output_dir, copyright):
expanded_code = copyright
expanded_code += """
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
namespace rocshmem {
"""
expanded_code += (
generate_put_api_x() +
generate_get_api_x() +
generate_put_nbi_api_x() +
generate_get_nbi_api_x()
)
expanded_code += """
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
"""
output_file = os.path.join(
output_dir, 'rocshmem_RMA_X.hpp'
)
write_to_file(output_file, expanded_code)
+108
Просмотреть файл
@@ -0,0 +1,108 @@
"""
******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************
"""
import os
types = [
("float", "float"),
("double", "double"),
("char", "char"),
("signed char", "schar"),
("short", "short"),
("int", "int"),
("long", "long"),
("long long", "longlong"),
("unsigned char", "uchar"),
("unsigned short", "ushort"),
("unsigned int", "uint"),
("unsigned long", "ulong"),
("unsigned long long", "ulonglong"),
]
def putmem_signal_dec(SUFFIX):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_putmem_signal{SUFFIX}(\n"
f" void *dest, const void *source, size_t nelems, uint64_t *sig_addr,\n"
f" uint64_t signal, int sig_op, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal{SUFFIX}(\n"
f" rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,\n"
f" uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);\n\n"
)
def put_signal_typed_dec(T, TNAME, SUFFIX):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_signal{SUFFIX}(\n"
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source, size_t nelems,\n"
f" uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);\n"
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_signal{SUFFIX}(\n"
f" {T} *dest, const {T} *source, size_t nelems, uint64_t *sig_addr,\n"
f" uint64_t signal, int sig_op, int pe);\n\n"
)
def put_signal_dec(SUFFIX):
return "".join([put_signal_typed_dec(T, TNAME, SUFFIX) for T, TNAME in types])
def signaling_api_dec(SUFFIX):
return (putmem_signal_dec(SUFFIX) + put_signal_dec(SUFFIX))
def generate_signal_api():
suffixes = ["", "_wg", "_wave", "_nbi", "_nbi_wg", "_nbi_wave"]
return "".join([signaling_api_dec(suffix) for suffix in suffixes])
def write_to_file(filename, content):
with open(filename, 'w') as file:
file.write(content)
def generate_SIG_OP_header(output_dir, copyright):
expanded_code = copyright
expanded_code += """
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
namespace rocshmem {
"""
expanded_code += generate_signal_api()
expanded_code += """
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
"""
output_file = os.path.join(
output_dir, 'rocshmem_SIG_OP.hpp'
)
write_to_file(output_file, expanded_code)
+77
Просмотреть файл
@@ -0,0 +1,77 @@
"""
******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************
"""
import argparse
from RMA import generate_RMA_header
from AMO import generate_AMO_header
from SIG_OP import generate_SIG_OP_header
from COLL import generate_COLL_header
from P2P_SYNC import generate_P2P_SYNC_header
from RMA_X import generate_RMA_X_header
copyright = """
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
"""
def main():
parser = argparse.ArgumentParser(
description='Generate an expanded header files.'
)
parser.add_argument(
'directory', type=str, nargs='?', default='.',
help='Directory to write the header files to (default: current directory)'
)
args = parser.parse_args()
generate_RMA_header(args.directory, copyright)
generate_AMO_header(args.directory, copyright)
generate_SIG_OP_header(args.directory, copyright)
generate_COLL_header(args.directory, copyright)
generate_P2P_SYNC_header(args.directory, copyright)
generate_RMA_X_header(args.directory, copyright)
if __name__ == "__main__":
main()