Merge pull request #60 from avinashkethineedi/rocshmem_header_files
rocshmem header files
[ROCm/rocshmem commit: 1ae7b9812a]
Этот коммит содержится в:
@@ -74,7 +74,7 @@ option(BUILD_UNIT_TESTS "Build the unit tests" ON)
|
||||
|
||||
set(ROCM_PATH "" CACHE PATH "ROCm path to use")
|
||||
|
||||
configure_file(cmake/config.h.in config.h)
|
||||
configure_file(cmake/rocshmem_config.h.in rocshmem_config.h)
|
||||
|
||||
###############################################################################
|
||||
# Validate user passed options
|
||||
@@ -179,7 +179,7 @@ target_include_directories(
|
||||
${PROJECT_NAME}
|
||||
PUBLIC
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
||||
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}> # CONFIG.H
|
||||
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}> # rocshmem_config.h
|
||||
$<INSTALL_INTERFACE:include>
|
||||
)
|
||||
|
||||
@@ -190,7 +190,7 @@ set_target_properties(
|
||||
${PROJECT_NAME}
|
||||
PROPERTIES
|
||||
PUBLIC_HEADER
|
||||
"${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp"
|
||||
"${CMAKE_BINARY_DIR}/rocshmem_config.h"
|
||||
)
|
||||
|
||||
###############################################################################
|
||||
@@ -386,6 +386,11 @@ install(
|
||||
COMPONENT bin
|
||||
)
|
||||
|
||||
install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/
|
||||
DESTINATION ${INSTALL_INCLUDEDIR}
|
||||
COMPONENT dev
|
||||
)
|
||||
|
||||
install(
|
||||
EXPORT
|
||||
${PROJECT_NAME}Targets
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
/*
|
||||
hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc \
|
||||
-I/opt/rocm/include \
|
||||
-I$ROCSHMEM_SRC_DIR/include \
|
||||
-I$ROCSHMEM_INSTALL_DIR/include \
|
||||
-I$OPENMPI_UCX_INSTALL_DIR/include/
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
/*
|
||||
hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc \
|
||||
-I/opt/rocm/include \
|
||||
-I$ROCSHMEM_SRC_DIR/include \
|
||||
-I$ROCSHMEM_INSTALL_DIR/include \
|
||||
-I$OPENMPI_UCX_INSTALL_DIR/include/
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
/*
|
||||
hipcc -c -fgpu-rdc -x hip rocshmem_broadcast_test.cc \
|
||||
-I/opt/rocm/include \
|
||||
-I$ROCSHMEM_SRC_DIR/include \
|
||||
-I$ROCSHMEM_INSTALL_DIR/include \
|
||||
-I$OPENMPI_UCX_INSTALL_DIR/include/
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
/*
|
||||
hipcc -c -fgpu-rdc -x hip rocshmem_getmem_test.cc \
|
||||
-I/opt/rocm/include \
|
||||
-I$ROCSHMEM_SRC_DIR/include \
|
||||
-I$ROCSHMEM_INSTALL_DIR/include \
|
||||
-I$OPENMPI_UCX_INSTALL_DIR/include/
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
/*
|
||||
hipcc -c -fgpu-rdc -x hip rocshmem_put_signal_test.cc \
|
||||
-I/opt/rocm/include \
|
||||
-I$ROCSHMEM_SRC_DIR/include \
|
||||
-I$ROCSHMEM_INSTALL_DIR/include \
|
||||
-I$OPENMPI_UCX_INSTALL_DIR/include/
|
||||
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,603 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
/**
|
||||
* @name SHMEM_ALLTOALL
|
||||
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
|
||||
* of PEs participating in the collective routine.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] team The team participating in the collective.
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nelems Number of data blocks transferred per pair of PEs.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
|
||||
const unsigned long long *source, int nelems);
|
||||
|
||||
|
||||
/**
|
||||
* @name SHMEM_BROADCAST
|
||||
* @brief Perform a broadcast between PEs in the active set. The caller
|
||||
* is blocked until the broadcase completes.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nelement Size of the buffer to participate in the broadcast.
|
||||
* @param[in] PE_root Zero-based ordinal of the PE, with respect to the
|
||||
active set, from which the data is copied
|
||||
* @param[in] PE_start PE to start the reduction.
|
||||
* @param[in] logPE_stride Stride of PEs participating in the reduction.
|
||||
* @param[in] PE_size Number PEs participating in the reduction.
|
||||
* @param[in] pSync Temporary sync buffer provided to ROCSHMEM. Must
|
||||
be of size at least ROCSHMEM_REDUCE_SYNC_SIZE.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_float_broadcast(
|
||||
rocshmem_ctx_t ctx, float *dest, const float *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_float_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_double_broadcast(
|
||||
rocshmem_ctx_t ctx, double *dest, const double *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_double_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_char_broadcast(
|
||||
rocshmem_ctx_t ctx, char *dest, const char *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_char_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_schar_broadcast(
|
||||
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_schar_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_short_broadcast(
|
||||
rocshmem_ctx_t ctx, short *dest, const short *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_short_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_int_broadcast(
|
||||
rocshmem_ctx_t ctx, int *dest, const int *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_int_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_long_broadcast(
|
||||
rocshmem_ctx_t ctx, long *dest, const long *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_long_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_longlong_broadcast(
|
||||
rocshmem_ctx_t ctx, long long *dest, const long long *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_longlong_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_uchar_broadcast(
|
||||
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_uchar_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_ushort_broadcast(
|
||||
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_ushort_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_uint_broadcast(
|
||||
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_uint_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_ulong_broadcast(
|
||||
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_ulong_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
|
||||
const unsigned long long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_ulonglong_broadcast(
|
||||
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
|
||||
int nelems, int pe_root, int pe_start, int log_pe_stride,
|
||||
int pe_size, long *p_sync);
|
||||
__host__ void rocshmem_ctx_ulonglong_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
|
||||
const unsigned long long *source, int nelems, int pe_root);
|
||||
|
||||
|
||||
/**
|
||||
* @name SHMEM_FCOLLECT
|
||||
* @brief Concatenates blocks of data from multiple PEs to an array in every
|
||||
* PE participating in the collective routine.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] team The team participating in the collective.
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nelems Number of data blocks in source array.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
|
||||
const unsigned long long *source, int nelems);
|
||||
|
||||
|
||||
/**
|
||||
* @name SHMEM_REDUCTIONS
|
||||
* @brief Perform an allreduce between PEs in the active set. The caller
|
||||
* is blocked until the reduction completes.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] team The team participating in the collective.
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nreduce Size of the buffer to participate in the reduction.
|
||||
*
|
||||
* @return int (Zero on successful local completion. Nonzero otherwise.)
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
|
||||
@@ -0,0 +1,662 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
/**
|
||||
* @name SHMEM_WAIT_UNTIL
|
||||
* @brief Block the caller until the condition (* \p ptr \p cmps \p val) is
|
||||
* true.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
|
||||
* @param[in] cmp Operation for the comparison.
|
||||
* @param[in] val Value to compare the memory at \p ptr to.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ void rocshmem_float_wait_until(
|
||||
float *ivars, int cmp, float val);
|
||||
__device__ size_t rocshmem_float_wait_until_any(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__device__ void rocshmem_float_wait_until_all(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__device__ size_t rocshmem_float_wait_until_some(
|
||||
float *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, float val);
|
||||
__device__ size_t rocshmem_float_wait_until_any_vector(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__device__ void rocshmem_float_wait_until_all_vector(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__device__ size_t rocshmem_float_wait_until_some_vector(
|
||||
float *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, float val);
|
||||
__host__ void rocshmem_float_wait_until(
|
||||
float *ivars, int cmp, float val);
|
||||
__host__ size_t rocshmem_float_wait_until_any(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__host__ void rocshmem_float_wait_until_all(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__host__ size_t rocshmem_float_wait_until_some(
|
||||
float *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, float val);
|
||||
__host__ size_t rocshmem_float_wait_until_any_vector(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__host__ void rocshmem_float_wait_until_all_vector(
|
||||
float *ivars, size_t nelems, const int* status,
|
||||
int cmp, float val);
|
||||
__host__ size_t rocshmem_float_wait_until_some_vector(
|
||||
float *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, float val);
|
||||
|
||||
__device__ void rocshmem_double_wait_until(
|
||||
double *ivars, int cmp, double val);
|
||||
__device__ size_t rocshmem_double_wait_until_any(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__device__ void rocshmem_double_wait_until_all(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__device__ size_t rocshmem_double_wait_until_some(
|
||||
double *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, double val);
|
||||
__device__ size_t rocshmem_double_wait_until_any_vector(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__device__ void rocshmem_double_wait_until_all_vector(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__device__ size_t rocshmem_double_wait_until_some_vector(
|
||||
double *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, double val);
|
||||
__host__ void rocshmem_double_wait_until(
|
||||
double *ivars, int cmp, double val);
|
||||
__host__ size_t rocshmem_double_wait_until_any(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__host__ void rocshmem_double_wait_until_all(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__host__ size_t rocshmem_double_wait_until_some(
|
||||
double *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, double val);
|
||||
__host__ size_t rocshmem_double_wait_until_any_vector(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__host__ void rocshmem_double_wait_until_all_vector(
|
||||
double *ivars, size_t nelems, const int* status,
|
||||
int cmp, double val);
|
||||
__host__ size_t rocshmem_double_wait_until_some_vector(
|
||||
double *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, double val);
|
||||
|
||||
__device__ void rocshmem_char_wait_until(
|
||||
char *ivars, int cmp, char val);
|
||||
__device__ size_t rocshmem_char_wait_until_any(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__device__ void rocshmem_char_wait_until_all(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__device__ size_t rocshmem_char_wait_until_some(
|
||||
char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, char val);
|
||||
__device__ size_t rocshmem_char_wait_until_any_vector(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__device__ void rocshmem_char_wait_until_all_vector(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__device__ size_t rocshmem_char_wait_until_some_vector(
|
||||
char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, char val);
|
||||
__host__ void rocshmem_char_wait_until(
|
||||
char *ivars, int cmp, char val);
|
||||
__host__ size_t rocshmem_char_wait_until_any(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__host__ void rocshmem_char_wait_until_all(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__host__ size_t rocshmem_char_wait_until_some(
|
||||
char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, char val);
|
||||
__host__ size_t rocshmem_char_wait_until_any_vector(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__host__ void rocshmem_char_wait_until_all_vector(
|
||||
char *ivars, size_t nelems, const int* status,
|
||||
int cmp, char val);
|
||||
__host__ size_t rocshmem_char_wait_until_some_vector(
|
||||
char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, char val);
|
||||
|
||||
__device__ void rocshmem_schar_wait_until(
|
||||
signed char *ivars, int cmp, signed char val);
|
||||
__device__ size_t rocshmem_schar_wait_until_any(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__device__ void rocshmem_schar_wait_until_all(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__device__ size_t rocshmem_schar_wait_until_some(
|
||||
signed char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, signed char val);
|
||||
__device__ size_t rocshmem_schar_wait_until_any_vector(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__device__ void rocshmem_schar_wait_until_all_vector(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__device__ size_t rocshmem_schar_wait_until_some_vector(
|
||||
signed char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, signed char val);
|
||||
__host__ void rocshmem_schar_wait_until(
|
||||
signed char *ivars, int cmp, signed char val);
|
||||
__host__ size_t rocshmem_schar_wait_until_any(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__host__ void rocshmem_schar_wait_until_all(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__host__ size_t rocshmem_schar_wait_until_some(
|
||||
signed char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, signed char val);
|
||||
__host__ size_t rocshmem_schar_wait_until_any_vector(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__host__ void rocshmem_schar_wait_until_all_vector(
|
||||
signed char *ivars, size_t nelems, const int* status,
|
||||
int cmp, signed char val);
|
||||
__host__ size_t rocshmem_schar_wait_until_some_vector(
|
||||
signed char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, signed char val);
|
||||
|
||||
__device__ void rocshmem_short_wait_until(
|
||||
short *ivars, int cmp, short val);
|
||||
__device__ size_t rocshmem_short_wait_until_any(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__device__ void rocshmem_short_wait_until_all(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__device__ size_t rocshmem_short_wait_until_some(
|
||||
short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, short val);
|
||||
__device__ size_t rocshmem_short_wait_until_any_vector(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__device__ void rocshmem_short_wait_until_all_vector(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__device__ size_t rocshmem_short_wait_until_some_vector(
|
||||
short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, short val);
|
||||
__host__ void rocshmem_short_wait_until(
|
||||
short *ivars, int cmp, short val);
|
||||
__host__ size_t rocshmem_short_wait_until_any(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__host__ void rocshmem_short_wait_until_all(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__host__ size_t rocshmem_short_wait_until_some(
|
||||
short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, short val);
|
||||
__host__ size_t rocshmem_short_wait_until_any_vector(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__host__ void rocshmem_short_wait_until_all_vector(
|
||||
short *ivars, size_t nelems, const int* status,
|
||||
int cmp, short val);
|
||||
__host__ size_t rocshmem_short_wait_until_some_vector(
|
||||
short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, short val);
|
||||
|
||||
__device__ void rocshmem_int_wait_until(
|
||||
int *ivars, int cmp, int val);
|
||||
__device__ size_t rocshmem_int_wait_until_any(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__device__ void rocshmem_int_wait_until_all(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__device__ size_t rocshmem_int_wait_until_some(
|
||||
int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, int val);
|
||||
__device__ size_t rocshmem_int_wait_until_any_vector(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__device__ void rocshmem_int_wait_until_all_vector(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__device__ size_t rocshmem_int_wait_until_some_vector(
|
||||
int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, int val);
|
||||
__host__ void rocshmem_int_wait_until(
|
||||
int *ivars, int cmp, int val);
|
||||
__host__ size_t rocshmem_int_wait_until_any(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__host__ void rocshmem_int_wait_until_all(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__host__ size_t rocshmem_int_wait_until_some(
|
||||
int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, int val);
|
||||
__host__ size_t rocshmem_int_wait_until_any_vector(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__host__ void rocshmem_int_wait_until_all_vector(
|
||||
int *ivars, size_t nelems, const int* status,
|
||||
int cmp, int val);
|
||||
__host__ size_t rocshmem_int_wait_until_some_vector(
|
||||
int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, int val);
|
||||
|
||||
__device__ void rocshmem_long_wait_until(
|
||||
long *ivars, int cmp, long val);
|
||||
__device__ size_t rocshmem_long_wait_until_any(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__device__ void rocshmem_long_wait_until_all(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__device__ size_t rocshmem_long_wait_until_some(
|
||||
long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long val);
|
||||
__device__ size_t rocshmem_long_wait_until_any_vector(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__device__ void rocshmem_long_wait_until_all_vector(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__device__ size_t rocshmem_long_wait_until_some_vector(
|
||||
long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long val);
|
||||
__host__ void rocshmem_long_wait_until(
|
||||
long *ivars, int cmp, long val);
|
||||
__host__ size_t rocshmem_long_wait_until_any(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__host__ void rocshmem_long_wait_until_all(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__host__ size_t rocshmem_long_wait_until_some(
|
||||
long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long val);
|
||||
__host__ size_t rocshmem_long_wait_until_any_vector(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__host__ void rocshmem_long_wait_until_all_vector(
|
||||
long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long val);
|
||||
__host__ size_t rocshmem_long_wait_until_some_vector(
|
||||
long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long val);
|
||||
|
||||
__device__ void rocshmem_longlong_wait_until(
|
||||
long long *ivars, int cmp, long long val);
|
||||
__device__ size_t rocshmem_longlong_wait_until_any(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__device__ void rocshmem_longlong_wait_until_all(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__device__ size_t rocshmem_longlong_wait_until_some(
|
||||
long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long long val);
|
||||
__device__ size_t rocshmem_longlong_wait_until_any_vector(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__device__ void rocshmem_longlong_wait_until_all_vector(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__device__ size_t rocshmem_longlong_wait_until_some_vector(
|
||||
long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long long val);
|
||||
__host__ void rocshmem_longlong_wait_until(
|
||||
long long *ivars, int cmp, long long val);
|
||||
__host__ size_t rocshmem_longlong_wait_until_any(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__host__ void rocshmem_longlong_wait_until_all(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__host__ size_t rocshmem_longlong_wait_until_some(
|
||||
long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long long val);
|
||||
__host__ size_t rocshmem_longlong_wait_until_any_vector(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__host__ void rocshmem_longlong_wait_until_all_vector(
|
||||
long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, long long val);
|
||||
__host__ size_t rocshmem_longlong_wait_until_some_vector(
|
||||
long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, long long val);
|
||||
|
||||
__device__ void rocshmem_uchar_wait_until(
|
||||
unsigned char *ivars, int cmp, unsigned char val);
|
||||
__device__ size_t rocshmem_uchar_wait_until_any(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__device__ void rocshmem_uchar_wait_until_all(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__device__ size_t rocshmem_uchar_wait_until_some(
|
||||
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__device__ size_t rocshmem_uchar_wait_until_any_vector(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__device__ void rocshmem_uchar_wait_until_all_vector(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__device__ size_t rocshmem_uchar_wait_until_some_vector(
|
||||
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__host__ void rocshmem_uchar_wait_until(
|
||||
unsigned char *ivars, int cmp, unsigned char val);
|
||||
__host__ size_t rocshmem_uchar_wait_until_any(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__host__ void rocshmem_uchar_wait_until_all(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__host__ size_t rocshmem_uchar_wait_until_some(
|
||||
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__host__ size_t rocshmem_uchar_wait_until_any_vector(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__host__ void rocshmem_uchar_wait_until_all_vector(
|
||||
unsigned char *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
__host__ size_t rocshmem_uchar_wait_until_some_vector(
|
||||
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned char val);
|
||||
|
||||
__device__ void rocshmem_ushort_wait_until(
|
||||
unsigned short *ivars, int cmp, unsigned short val);
|
||||
__device__ size_t rocshmem_ushort_wait_until_any(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__device__ void rocshmem_ushort_wait_until_all(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__device__ size_t rocshmem_ushort_wait_until_some(
|
||||
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__device__ size_t rocshmem_ushort_wait_until_any_vector(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__device__ void rocshmem_ushort_wait_until_all_vector(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__device__ size_t rocshmem_ushort_wait_until_some_vector(
|
||||
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__host__ void rocshmem_ushort_wait_until(
|
||||
unsigned short *ivars, int cmp, unsigned short val);
|
||||
__host__ size_t rocshmem_ushort_wait_until_any(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__host__ void rocshmem_ushort_wait_until_all(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__host__ size_t rocshmem_ushort_wait_until_some(
|
||||
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__host__ size_t rocshmem_ushort_wait_until_any_vector(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__host__ void rocshmem_ushort_wait_until_all_vector(
|
||||
unsigned short *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
__host__ size_t rocshmem_ushort_wait_until_some_vector(
|
||||
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned short val);
|
||||
|
||||
__device__ void rocshmem_uint_wait_until(
|
||||
unsigned int *ivars, int cmp, unsigned int val);
|
||||
__device__ size_t rocshmem_uint_wait_until_any(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__device__ void rocshmem_uint_wait_until_all(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__device__ size_t rocshmem_uint_wait_until_some(
|
||||
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__device__ size_t rocshmem_uint_wait_until_any_vector(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__device__ void rocshmem_uint_wait_until_all_vector(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__device__ size_t rocshmem_uint_wait_until_some_vector(
|
||||
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__host__ void rocshmem_uint_wait_until(
|
||||
unsigned int *ivars, int cmp, unsigned int val);
|
||||
__host__ size_t rocshmem_uint_wait_until_any(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__host__ void rocshmem_uint_wait_until_all(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__host__ size_t rocshmem_uint_wait_until_some(
|
||||
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__host__ size_t rocshmem_uint_wait_until_any_vector(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__host__ void rocshmem_uint_wait_until_all_vector(
|
||||
unsigned int *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
__host__ size_t rocshmem_uint_wait_until_some_vector(
|
||||
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned int val);
|
||||
|
||||
__device__ void rocshmem_ulong_wait_until(
|
||||
unsigned long *ivars, int cmp, unsigned long val);
|
||||
__device__ size_t rocshmem_ulong_wait_until_any(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__device__ void rocshmem_ulong_wait_until_all(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__device__ size_t rocshmem_ulong_wait_until_some(
|
||||
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__device__ size_t rocshmem_ulong_wait_until_any_vector(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__device__ void rocshmem_ulong_wait_until_all_vector(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__device__ size_t rocshmem_ulong_wait_until_some_vector(
|
||||
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__host__ void rocshmem_ulong_wait_until(
|
||||
unsigned long *ivars, int cmp, unsigned long val);
|
||||
__host__ size_t rocshmem_ulong_wait_until_any(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__host__ void rocshmem_ulong_wait_until_all(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__host__ size_t rocshmem_ulong_wait_until_some(
|
||||
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__host__ size_t rocshmem_ulong_wait_until_any_vector(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__host__ void rocshmem_ulong_wait_until_all_vector(
|
||||
unsigned long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
__host__ size_t rocshmem_ulong_wait_until_some_vector(
|
||||
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long val);
|
||||
|
||||
__device__ void rocshmem_ulonglong_wait_until(
|
||||
unsigned long long *ivars, int cmp, unsigned long long val);
|
||||
__device__ size_t rocshmem_ulonglong_wait_until_any(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__device__ void rocshmem_ulonglong_wait_until_all(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__device__ size_t rocshmem_ulonglong_wait_until_some(
|
||||
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__device__ size_t rocshmem_ulonglong_wait_until_any_vector(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__device__ void rocshmem_ulonglong_wait_until_all_vector(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__device__ size_t rocshmem_ulonglong_wait_until_some_vector(
|
||||
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__host__ void rocshmem_ulonglong_wait_until(
|
||||
unsigned long long *ivars, int cmp, unsigned long long val);
|
||||
__host__ size_t rocshmem_ulonglong_wait_until_any(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__host__ void rocshmem_ulonglong_wait_until_all(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__host__ size_t rocshmem_ulonglong_wait_until_some(
|
||||
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__host__ size_t rocshmem_ulonglong_wait_until_any_vector(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__host__ void rocshmem_ulonglong_wait_until_all_vector(
|
||||
unsigned long long *ivars, size_t nelems, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
__host__ size_t rocshmem_ulonglong_wait_until_some_vector(
|
||||
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
|
||||
int cmp, unsigned long long val);
|
||||
|
||||
|
||||
/**
|
||||
* @name SHMEM_TEST
|
||||
* @brief test if the condition (* \p ptr \p cmps \p val) is
|
||||
* true.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
|
||||
* @param[in] cmp Operation for the comparison.
|
||||
* @param[in] val Value to compare the memory at \p ptr to.
|
||||
*
|
||||
* @return 1 if the evaluation is true else 0
|
||||
*/
|
||||
__device__ int rocshmem_float_test(
|
||||
float *ivars, int cmp, float val);
|
||||
__host__ int rocshmem_float_test(
|
||||
float *ivars, int cmp, float val);
|
||||
|
||||
__device__ int rocshmem_double_test(
|
||||
double *ivars, int cmp, double val);
|
||||
__host__ int rocshmem_double_test(
|
||||
double *ivars, int cmp, double val);
|
||||
|
||||
__device__ int rocshmem_char_test(
|
||||
char *ivars, int cmp, char val);
|
||||
__host__ int rocshmem_char_test(
|
||||
char *ivars, int cmp, char val);
|
||||
|
||||
__device__ int rocshmem_schar_test(
|
||||
signed char *ivars, int cmp, signed char val);
|
||||
__host__ int rocshmem_schar_test(
|
||||
signed char *ivars, int cmp, signed char val);
|
||||
|
||||
__device__ int rocshmem_short_test(
|
||||
short *ivars, int cmp, short val);
|
||||
__host__ int rocshmem_short_test(
|
||||
short *ivars, int cmp, short val);
|
||||
|
||||
__device__ int rocshmem_int_test(
|
||||
int *ivars, int cmp, int val);
|
||||
__host__ int rocshmem_int_test(
|
||||
int *ivars, int cmp, int val);
|
||||
|
||||
__device__ int rocshmem_long_test(
|
||||
long *ivars, int cmp, long val);
|
||||
__host__ int rocshmem_long_test(
|
||||
long *ivars, int cmp, long val);
|
||||
|
||||
__device__ int rocshmem_longlong_test(
|
||||
long long *ivars, int cmp, long long val);
|
||||
__host__ int rocshmem_longlong_test(
|
||||
long long *ivars, int cmp, long long val);
|
||||
|
||||
__device__ int rocshmem_uchar_test(
|
||||
unsigned char *ivars, int cmp, unsigned char val);
|
||||
__host__ int rocshmem_uchar_test(
|
||||
unsigned char *ivars, int cmp, unsigned char val);
|
||||
|
||||
__device__ int rocshmem_ushort_test(
|
||||
unsigned short *ivars, int cmp, unsigned short val);
|
||||
__host__ int rocshmem_ushort_test(
|
||||
unsigned short *ivars, int cmp, unsigned short val);
|
||||
|
||||
__device__ int rocshmem_uint_test(
|
||||
unsigned int *ivars, int cmp, unsigned int val);
|
||||
__host__ int rocshmem_uint_test(
|
||||
unsigned int *ivars, int cmp, unsigned int val);
|
||||
|
||||
__device__ int rocshmem_ulong_test(
|
||||
unsigned long *ivars, int cmp, unsigned long val);
|
||||
__host__ int rocshmem_ulong_test(
|
||||
unsigned long *ivars, int cmp, unsigned long val);
|
||||
|
||||
__device__ int rocshmem_ulonglong_test(
|
||||
unsigned long long *ivars, int cmp, unsigned long long val);
|
||||
__host__ int rocshmem_ulonglong_test(
|
||||
unsigned long long *ivars, int cmp, unsigned long long val);
|
||||
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,623 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal(
|
||||
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal(
|
||||
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal(
|
||||
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal(
|
||||
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal(
|
||||
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal(
|
||||
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal(
|
||||
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal(
|
||||
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal(
|
||||
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal(
|
||||
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal(
|
||||
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal(
|
||||
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal(
|
||||
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal(
|
||||
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal(
|
||||
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal(
|
||||
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal(
|
||||
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal(
|
||||
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal(
|
||||
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal(
|
||||
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal(
|
||||
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal(
|
||||
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal(
|
||||
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal(
|
||||
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal(
|
||||
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal(
|
||||
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal(
|
||||
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal(
|
||||
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg(
|
||||
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg(
|
||||
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg(
|
||||
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg(
|
||||
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg(
|
||||
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg(
|
||||
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg(
|
||||
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg(
|
||||
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg(
|
||||
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg(
|
||||
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg(
|
||||
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg(
|
||||
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg(
|
||||
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg(
|
||||
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg(
|
||||
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg(
|
||||
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave(
|
||||
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave(
|
||||
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave(
|
||||
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave(
|
||||
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave(
|
||||
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave(
|
||||
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave(
|
||||
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave(
|
||||
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave(
|
||||
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave(
|
||||
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave(
|
||||
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave(
|
||||
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave(
|
||||
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave(
|
||||
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave(
|
||||
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave(
|
||||
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi(
|
||||
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi(
|
||||
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi(
|
||||
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi(
|
||||
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi(
|
||||
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi(
|
||||
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi(
|
||||
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi(
|
||||
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi(
|
||||
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi(
|
||||
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi(
|
||||
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi(
|
||||
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi(
|
||||
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi(
|
||||
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi(
|
||||
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi(
|
||||
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg(
|
||||
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg(
|
||||
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg(
|
||||
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg(
|
||||
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg(
|
||||
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg(
|
||||
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg(
|
||||
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg(
|
||||
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg(
|
||||
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg(
|
||||
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg(
|
||||
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg(
|
||||
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg(
|
||||
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg(
|
||||
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg(
|
||||
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave(
|
||||
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave(
|
||||
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave(
|
||||
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave(
|
||||
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave(
|
||||
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave(
|
||||
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave(
|
||||
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave(
|
||||
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave(
|
||||
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave(
|
||||
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave(
|
||||
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave(
|
||||
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave(
|
||||
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave(
|
||||
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
|
||||
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave(
|
||||
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
|
||||
uint64_t signal, int sig_op, int pe);
|
||||
|
||||
|
||||
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr);
|
||||
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr);
|
||||
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr);
|
||||
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
|
||||
@@ -0,0 +1,130 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
#ifdef USE_FUNC_CALL
|
||||
#define ATTR_NO_INLINE __attribute__((noinline))
|
||||
#else
|
||||
#define ATTR_NO_INLINE
|
||||
#endif
|
||||
|
||||
|
||||
enum ROCSHMEM_STATUS {
|
||||
ROCSHMEM_SUCCESS = 0,
|
||||
ROCSHMEM_ERROR = 1,
|
||||
};
|
||||
|
||||
enum ROCSHMEM_OP {
|
||||
ROCSHMEM_SUM,
|
||||
ROCSHMEM_MAX,
|
||||
ROCSHMEM_MIN,
|
||||
ROCSHMEM_PROD,
|
||||
ROCSHMEM_AND,
|
||||
ROCSHMEM_OR,
|
||||
ROCSHMEM_XOR,
|
||||
ROCSHMEM_REPLACE
|
||||
};
|
||||
|
||||
enum ROCSHMEM_SIGNAL_OPS {
|
||||
ROCSHMEM_SIGNAL_SET,
|
||||
ROCSHMEM_SIGNAL_ADD,
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Types defined for rocshmem_wait() operations.
|
||||
*/
|
||||
enum rocshmem_cmps {
|
||||
ROCSHMEM_CMP_EQ,
|
||||
ROCSHMEM_CMP_NE,
|
||||
ROCSHMEM_CMP_GT,
|
||||
ROCSHMEM_CMP_GE,
|
||||
ROCSHMEM_CMP_LT,
|
||||
ROCSHMEM_CMP_LE,
|
||||
};
|
||||
|
||||
enum rocshmem_thread_ops {
|
||||
ROCSHMEM_THREAD_SINGLE,
|
||||
ROCSHMEM_THREAD_FUNNELED,
|
||||
ROCSHMEM_THREAD_WG_FUNNELED,
|
||||
ROCSHMEM_THREAD_SERIALIZED,
|
||||
ROCSHMEM_THREAD_MULTIPLE
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Bitwise flags to mask configuration parameters.
|
||||
*/
|
||||
enum rocshmem_team_configs {
|
||||
ROCSHMEM_TEAM_DEFAULT_CONFIGS,
|
||||
ROCSHMEM_TEAM_NUM_CONTEXTS
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
int num_contexts;
|
||||
} rocshmem_team_config_t;
|
||||
|
||||
constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024;
|
||||
constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024);
|
||||
constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256;
|
||||
constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256;
|
||||
// Internally calls sync function, which matches barrier implementation
|
||||
constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE;
|
||||
constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1;
|
||||
constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE;
|
||||
constexpr size_t ROCSHMEM_SYNC_VALUE = 0;
|
||||
|
||||
const int ROCSHMEM_CTX_ZERO = 0;
|
||||
const int ROCSHMEM_CTX_NOSTORE = 1;
|
||||
const int ROCSHMEM_CTX_SERIALIZED = 2;
|
||||
const int ROCSHMEM_CTX_WG_PRIVATE = 4;
|
||||
const int ROCSHMEM_CTX_SHARED = 8;
|
||||
|
||||
/**
|
||||
* @brief GPU side OpenSHMEM context created from each work-groups'
|
||||
* rocshmem_wg_handle_t
|
||||
*/
|
||||
typedef struct {
|
||||
void *ctx_opaque;
|
||||
void *team_opaque;
|
||||
} rocshmem_ctx_t;
|
||||
|
||||
/**
|
||||
* Shmem default context.
|
||||
*/
|
||||
extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT;
|
||||
|
||||
/**
|
||||
* Used internally to set default context.
|
||||
*/
|
||||
void set_internal_ctx(rocshmem_ctx_t *ctx);
|
||||
|
||||
typedef uint64_t *rocshmem_team_t;
|
||||
extern rocshmem_team_t ROCSHMEM_TEAM_WORLD;
|
||||
|
||||
const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr;
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
|
||||
@@ -35,7 +35,7 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "backend_type.hpp"
|
||||
#include "ipc_policy.hpp"
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
* functions are not supported at this time.
|
||||
*/
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_bc.hpp"
|
||||
#include "context_incl.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_bc.hpp"
|
||||
#include "context_incl.hpp"
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_
|
||||
#define LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_type.hpp"
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/context_ib_device.hpp"
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_CONTEXT_TMPL_HOST_HPP_
|
||||
#define LIBRARY_SRC_CONTEXT_TMPL_HOST_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_type.hpp"
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/context_ib_host.hpp"
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <infiniband/mlx5dv.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "dynamic_connection.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "infiniband_structs.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "backend_ib.hpp"
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "context_ib_device.hpp"
|
||||
#include "gpu_ib_team.hpp"
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../host/host_templates.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../atomic_return.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "backend_ib.hpp"
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "connection_policy.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_ib.hpp"
|
||||
#include "endian.hpp"
|
||||
#include "segment_builder.hpp"
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
|
||||
#include <infiniband/mlx5dv.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../atomic_return.hpp"
|
||||
#include "connection_policy.hpp"
|
||||
#include "thread_policy.hpp"
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
#include "thread_policy.hpp"
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "memory/hip_allocator.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "host_helpers.hpp"
|
||||
#include "../memory/window_info.hpp"
|
||||
#include "../util.hpp"
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "host_helpers.hpp"
|
||||
#include "../memory/window_info.hpp"
|
||||
#include "../team.hpp"
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "backend_ipc.hpp"
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "backend_ipc.hpp"
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_
|
||||
#define LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "context_ipc_device.hpp"
|
||||
#include "../util.hpp"
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_
|
||||
#define LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../host/host_templates.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_bc.hpp"
|
||||
#include "context_incl.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
#include <atomic>
|
||||
#include <vector>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "memory/hip_allocator.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_MEMORY_HEAP_TYPE_HPP_
|
||||
#define LIBRARY_SRC_MEMORY_HEAP_TYPE_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "hip_allocator.hpp"
|
||||
|
||||
/**
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../hdp_policy.hpp"
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "../host/host.hpp"
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_GPU_TEMPLATES_HPP_
|
||||
#define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_GPU_TEMPLATES_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "commands_types.hpp"
|
||||
#include "context_ro_device.hpp"
|
||||
#include "queue_proxy.hpp"
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
#ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_
|
||||
#define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../host/host_templates.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../device_proxy.hpp"
|
||||
#include "../memory/../memory/hip_allocator.hpp"
|
||||
#include "../stats.hpp"
|
||||
|
||||
@@ -41,7 +41,7 @@
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "backend_bc.hpp"
|
||||
#include "context_incl.hpp"
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
#include <cstdio>
|
||||
|
||||
#include "assembly.hpp"
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "constants.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -0,0 +1,639 @@
|
||||
"""
|
||||
******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
types = [
|
||||
("int", "int"),
|
||||
("long", "long"),
|
||||
("long long", "longlong"),
|
||||
("unsigned int", "uint"),
|
||||
("unsigned long", "ulong"),
|
||||
("unsigned long long", "ulonglong"),
|
||||
("int32_t", "int32"),
|
||||
("int64_t", "int64"),
|
||||
("uint32_t", "uint32"),
|
||||
("uint64_t", "uint64"),
|
||||
("size_t", "size"),
|
||||
("ptrdiff_t", "ptrdiff"),
|
||||
]
|
||||
|
||||
|
||||
float_types = [
|
||||
("float", "float"),
|
||||
("double", "double"),
|
||||
]
|
||||
|
||||
bitwise_types = types[3:10]
|
||||
|
||||
|
||||
def atomic_fetch_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *source, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch(\n"
|
||||
f" {T} *source, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *source, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch(\n"
|
||||
f" {T} *source, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_fetch_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_FETCH
|
||||
* @brief Atomically return the value of \p dest to the calling PE.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return The value of \p dest.
|
||||
*/\n"""
|
||||
|
||||
for type_, tname_ in float_types:
|
||||
expanded_code += atomic_fetch_api(type_, tname_)
|
||||
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_fetch_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_set_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_set(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_set(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_atomic_set(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_atomic_set(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_set_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_SET
|
||||
* @brief Atomically set the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
|
||||
for type_, tname_ in float_types:
|
||||
expanded_code += atomic_set_api(type_, tname_)
|
||||
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_set_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_compare_swap_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_compare_swap(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} cond, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_compare_swap(\n"
|
||||
f" {T} *dest, {T} cond, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_compare_swap(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} cond, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_compare_swap(\n"
|
||||
f" {T} *dest, {T} cond, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_compare_swap_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_COMPARE_SWAP
|
||||
* @brief Atomically compares if the value in \p dest with \p cond is equal
|
||||
* then put \p val in \p dest. The operation returns the older value of \p dest
|
||||
* to the calling PE.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] cond The value to be compare with.
|
||||
* @param[in] val The value to be atomically swapped.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return The old value of \p dest.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_compare_swap_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_swap_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_swap(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_swap(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_swap(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_swap(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_swap_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_SWAP
|
||||
* @brief Atomically swap the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return original value
|
||||
*/\n"""
|
||||
|
||||
for type_, tname_ in float_types:
|
||||
expanded_code += atomic_swap_api(type_, tname_)
|
||||
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_swap_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_fetch_inc_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_inc(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_inc(\n"
|
||||
f" {T} *dest, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_inc(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_inc(\n"
|
||||
f" {T} *dest, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_fetch_inc_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_FETCH_INC
|
||||
* @brief Atomically add 1 to \p dest on \p pe. The operation
|
||||
* returns the older value of \p dest to the calling PE.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return The old value of \p dest before it was incremented by 1.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_fetch_inc_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_inc_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_inc(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_inc(\n"
|
||||
f" {T} *dest, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_atomic_inc(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_atomic_inc(\n"
|
||||
f" {T} *dest, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_inc_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_INC
|
||||
* @brief Atomically add 1 to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_inc_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_fetch_add_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_add(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_add(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_add(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_add(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_fetch_add_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_FETCH_ADD
|
||||
* @brief Atomically add the value \p val to \p dest on \p pe. The operation
|
||||
* returns the older value of \p dest to the calling PE.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return The old value of \p dest before the \p val was added.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_fetch_add_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_add_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_add(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_add(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_atomic_add(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_atomic_add(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_add_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_ADD
|
||||
* @brief Atomically add the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += atomic_add_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_fetch_and_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_and(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_and(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_and(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_and(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_fetch_and_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_FETCH_AND
|
||||
* @brief Atomically bitwise-and the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return original value
|
||||
*/\n"""
|
||||
for type_, tname_ in bitwise_types:
|
||||
expanded_code += atomic_fetch_and_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_and_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_and(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_and(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_atomic_and(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_atomic_and(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_and_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_AND
|
||||
* @brief Atomically bitwise-and the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in bitwise_types:
|
||||
expanded_code += atomic_and_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_fetch_or_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_or(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_or(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_or(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_or(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_fetch_or_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_FETCH_OR
|
||||
* @brief Atomically bitwise-or the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return original value
|
||||
*/\n"""
|
||||
for type_, tname_ in bitwise_types:
|
||||
expanded_code += atomic_fetch_or_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_or_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_or(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_or(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_atomic_or(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_atomic_or(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_or_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_OR
|
||||
* @brief Atomically bitwise-or the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in bitwise_types:
|
||||
expanded_code += atomic_or_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_fetch_xor_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_atomic_fetch_xor(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_atomic_fetch_xor(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_atomic_fetch_xor(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_atomic_fetch_xor(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_fetch_xor_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_FETCH_XOR
|
||||
* @brief Atomically bitwise-xor the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return original value
|
||||
*/\n"""
|
||||
for type_, tname_ in bitwise_types:
|
||||
expanded_code += atomic_fetch_xor_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def atomic_xor_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_atomic_xor(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_atomic_xor(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_atomic_xor(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_atomic_xor(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_atomic_xor_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ATOMIC_XOR
|
||||
* @brief Atomically bitwise-xor the value \p val to \p dest on \p pe.
|
||||
*
|
||||
* The operation is blocking.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] val The value to be atomically added.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in bitwise_types:
|
||||
expanded_code += atomic_xor_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
def write_to_file(filename, content):
|
||||
with open(filename, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
|
||||
def generate_AMO_header(output_dir, copyright):
|
||||
expanded_code = copyright
|
||||
|
||||
expanded_code += """
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
"""
|
||||
|
||||
expanded_code += (
|
||||
generate_atomic_fetch_api() +
|
||||
generate_atomic_set_api() +
|
||||
generate_atomic_compare_swap_api() +
|
||||
generate_atomic_swap_api() +
|
||||
generate_atomic_fetch_inc_api() +
|
||||
generate_atomic_inc_api() +
|
||||
generate_atomic_fetch_add_api() +
|
||||
generate_atomic_add_api() +
|
||||
generate_atomic_fetch_and_api() +
|
||||
generate_atomic_and_api() +
|
||||
generate_atomic_fetch_or_api() +
|
||||
generate_atomic_or_api() +
|
||||
generate_atomic_fetch_xor_api() +
|
||||
generate_atomic_xor_api()
|
||||
)
|
||||
|
||||
expanded_code += """
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
|
||||
"""
|
||||
|
||||
output_file = os.path.join(
|
||||
output_dir, 'rocshmem_AMO.hpp'
|
||||
)
|
||||
|
||||
write_to_file(output_file, expanded_code)
|
||||
@@ -0,0 +1,246 @@
|
||||
"""
|
||||
******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
types = [
|
||||
("float", "float"),
|
||||
("double", "double"),
|
||||
("char", "char"),
|
||||
("signed char", "schar"),
|
||||
("short", "short"),
|
||||
("int", "int"),
|
||||
("long", "long"),
|
||||
("long long", "longlong"),
|
||||
("unsigned char", "uchar"),
|
||||
("unsigned short", "ushort"),
|
||||
("unsigned int", "uint"),
|
||||
("unsigned long", "ulong"),
|
||||
("unsigned long long", "ulonglong"),
|
||||
]
|
||||
|
||||
|
||||
def alltoall_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
|
||||
f" const {T} *source, int nelems);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_alltoall_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_ALLTOALL
|
||||
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
|
||||
* of PEs participating in the collective routine.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] team The team participating in the collective.
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nelems Number of data blocks transferred per pair of PEs.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += alltoall_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def broadcast_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
|
||||
f" const {T} *source, int nelems, int pe_root);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" int nelems, int pe_root, int pe_start, int log_pe_stride,\n"
|
||||
f" int pe_size, long *p_sync);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
|
||||
f" const {T} *source, int nelems, int pe_root);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_broadcast_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_BROADCAST
|
||||
* @brief Perform a broadcast between PEs in the active set. The caller
|
||||
* is blocked until the broadcase completes.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nelement Size of the buffer to participate in the broadcast.
|
||||
* @param[in] PE_root Zero-based ordinal of the PE, with respect to the
|
||||
active set, from which the data is copied
|
||||
* @param[in] PE_start PE to start the reduction.
|
||||
* @param[in] logPE_stride Stride of PEs participating in the reduction.
|
||||
* @param[in] PE_size Number PEs participating in the reduction.
|
||||
* @param[in] pSync Temporary sync buffer provided to ROCSHMEM. Must
|
||||
be of size at least ROCSHMEM_REDUCE_SYNC_SIZE.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += broadcast_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def fcollect_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
|
||||
f" const {T} *source, int nelems);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_fcollect_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_FCOLLECT
|
||||
* @brief Concatenates blocks of data from multiple PEs to an array in every
|
||||
* PE participating in the collective routine.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] team The team participating in the collective.
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nelems Number of data blocks in source array.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += fcollect_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def reduction_api(T, TNAME, Op_API):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
|
||||
f" int nreduce);\n"
|
||||
f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
|
||||
f" int nreduce);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def arith_reduction_api(T, TNAME):
|
||||
operations = ["sum", "min", "max", "prod"]
|
||||
return "".join([reduction_api(T, TNAME, op) for op in operations])
|
||||
|
||||
def bitwise_reduction_api(T, TNAME):
|
||||
operations = ["or", "and", "xor"]
|
||||
return "".join([reduction_api(T, TNAME, op) for op in operations])
|
||||
|
||||
|
||||
def generate_reduction_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_REDUCTIONS
|
||||
* @brief Perform an allreduce between PEs in the active set. The caller
|
||||
* is blocked until the reduction completes.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] team The team participating in the collective.
|
||||
* @param[in] dest Destination address. Must be an address on the
|
||||
* symmetric heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] nreduce Size of the buffer to participate in the reduction.
|
||||
*
|
||||
* @return int (Zero on successful local completion. Nonzero otherwise.)
|
||||
*/\n"""
|
||||
|
||||
int_types = [
|
||||
("short", "short"),
|
||||
("int", "int"),
|
||||
("long", "long"),
|
||||
("long long", "longlong")
|
||||
]
|
||||
|
||||
float_types = [
|
||||
("float", "float"),
|
||||
("double", "double")
|
||||
]
|
||||
|
||||
for type_, tname_ in int_types:
|
||||
expanded_code += arith_reduction_api(type_, tname_)
|
||||
expanded_code += bitwise_reduction_api(type_, tname_)
|
||||
|
||||
for type_, tname_ in float_types:
|
||||
expanded_code += arith_reduction_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def write_to_file(filename, content):
|
||||
with open(filename, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
|
||||
def generate_COLL_header(output_dir, copyright):
|
||||
expanded_code = copyright
|
||||
|
||||
expanded_code += """
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
"""
|
||||
|
||||
expanded_code += (
|
||||
generate_alltoall_api() +
|
||||
generate_broadcast_api() +
|
||||
generate_fcollect_api() +
|
||||
generate_reduction_api()
|
||||
)
|
||||
|
||||
expanded_code += """
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
|
||||
"""
|
||||
|
||||
output_file = os.path.join(
|
||||
output_dir, 'rocshmem_COLL.hpp'
|
||||
)
|
||||
|
||||
write_to_file(output_file, expanded_code)
|
||||
@@ -0,0 +1,176 @@
|
||||
"""
|
||||
******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
types = [
|
||||
("float", "float"),
|
||||
("double", "double"),
|
||||
("char", "char"),
|
||||
("signed char", "schar"),
|
||||
("short", "short"),
|
||||
("int", "int"),
|
||||
("long", "long"),
|
||||
("long long", "longlong"),
|
||||
("unsigned char", "uchar"),
|
||||
("unsigned short", "ushort"),
|
||||
("unsigned int", "uint"),
|
||||
("unsigned long", "ulong"),
|
||||
("unsigned long long", "ulonglong"),
|
||||
]
|
||||
|
||||
|
||||
def wait_until_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ void rocshmem_{TNAME}_wait_until(\n"
|
||||
f" {T} *ivars, int cmp, {T} val);\n"
|
||||
f"__device__ size_t rocshmem_{TNAME}_wait_until_any(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__device__ void rocshmem_{TNAME}_wait_until_all(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__device__ size_t rocshmem_{TNAME}_wait_until_some(\n"
|
||||
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__device__ size_t rocshmem_{TNAME}_wait_until_any_vector(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__device__ void rocshmem_{TNAME}_wait_until_all_vector(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__device__ size_t rocshmem_{TNAME}_wait_until_some_vector(\n"
|
||||
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_wait_until(\n"
|
||||
f" {T} *ivars, int cmp, {T} val);\n"
|
||||
f"__host__ size_t rocshmem_{TNAME}_wait_until_any(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_wait_until_all(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__host__ size_t rocshmem_{TNAME}_wait_until_some(\n"
|
||||
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__host__ size_t rocshmem_{TNAME}_wait_until_any_vector(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_wait_until_all_vector(\n"
|
||||
f" {T} *ivars, size_t nelems, const int* status,\n"
|
||||
f" int cmp, {T} val);\n"
|
||||
f"__host__ size_t rocshmem_{TNAME}_wait_until_some_vector(\n"
|
||||
f" {T} *ivars, size_t nelems, size_t* indices, const int* status,\n"
|
||||
f" int cmp, {T} val);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_wait_until_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_WAIT_UNTIL
|
||||
* @brief Block the caller until the condition (* \p ptr \p cmps \p val) is
|
||||
* true.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
|
||||
* @param[in] cmp Operation for the comparison.
|
||||
* @param[in] val Value to compare the memory at \p ptr to.
|
||||
*
|
||||
* @return void
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += wait_until_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def test_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ int rocshmem_{TNAME}_test(\n"
|
||||
f" {T} *ivars, int cmp, {T} val);\n"
|
||||
f"__host__ int rocshmem_{TNAME}_test(\n"
|
||||
f" {T} *ivars, int cmp, {T} val);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_test_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_TEST
|
||||
* @brief test if the condition (* \p ptr \p cmps \p val) is
|
||||
* true.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
|
||||
* @param[in] cmp Operation for the comparison.
|
||||
* @param[in] val Value to compare the memory at \p ptr to.
|
||||
*
|
||||
* @return 1 if the evaluation is true else 0
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += test_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def write_to_file(filename, content):
|
||||
with open(filename, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
|
||||
def generate_P2P_SYNC_header(output_dir, copyright):
|
||||
expanded_code = copyright
|
||||
|
||||
expanded_code += """
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
"""
|
||||
|
||||
expanded_code += (
|
||||
generate_wait_until_api() +
|
||||
generate_test_api()
|
||||
)
|
||||
|
||||
expanded_code += """
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
|
||||
"""
|
||||
|
||||
output_file = os.path.join(
|
||||
output_dir, 'rocshmem_P2P_SYNC.hpp'
|
||||
)
|
||||
|
||||
write_to_file(output_file, expanded_code)
|
||||
@@ -0,0 +1,335 @@
|
||||
"""
|
||||
******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
types = [
|
||||
("float", "float"),
|
||||
("double", "double"),
|
||||
("char", "char"),
|
||||
("signed char", "schar"),
|
||||
("short", "short"),
|
||||
("int", "int"),
|
||||
("long", "long"),
|
||||
("long long", "longlong"),
|
||||
("unsigned char", "uchar"),
|
||||
("unsigned short", "ushort"),
|
||||
("unsigned int", "uint"),
|
||||
("unsigned long", "ulong"),
|
||||
("unsigned long long", "ulonglong"),
|
||||
]
|
||||
|
||||
|
||||
def put_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_put(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_put({T} *dest,\n"
|
||||
f" const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_put_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_PUT
|
||||
* @brief Writes contiguous data of \p nelems elements from \p source on the
|
||||
* calling PE to \p dest at \p pe. The caller will block until the operation
|
||||
* completes locally (it is safe to reuse \p source). The caller must
|
||||
* call into rocshmem_quiet() if remote completion is required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in number of elements.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += put_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def get_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_get(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_get({T} *dest,\n"
|
||||
f" const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_get_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_GET
|
||||
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
|
||||
* to \p dest on the calling PE. The calling work-group will block until the
|
||||
* operation completes (data has been placed in \p dest).
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += get_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def p_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_p(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value,\n"
|
||||
f" int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_p(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_p(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, {T} value,\n"
|
||||
f" int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_p(\n"
|
||||
f" {T} *dest, {T} value, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_p_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_P
|
||||
* @brief Writes a single value to \p dest at \p pe PE to \p dst at \p pe.
|
||||
* The caller must call into rocshmem_quiet() if remote completion is
|
||||
* required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] value Value to write to dest at \p pe.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += p_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def g_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_ctx_{TNAME}_g(\n"
|
||||
f" rocshmem_ctx_t ctx, const {T} *source, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE {T} rocshmem_{TNAME}_g(\n"
|
||||
f" const {T} *source, int pe);\n"
|
||||
f"__host__ {T} rocshmem_ctx_{TNAME}_g(\n"
|
||||
f" rocshmem_ctx_t ctx, const {T} *source, int pe);\n"
|
||||
f"__host__ {T} rocshmem_{TNAME}_g(\n"
|
||||
f" const {T} *source, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_g_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_G
|
||||
* @brief reads and returns single value from \p source at \p pe.
|
||||
* The calling work-group/thread will block until the operation completes.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] source Source address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return the value read from remote \p source at \p pe.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += g_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def put_nbi_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_nbi(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_nbi(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_put_nbi(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_put_nbi(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_put_nbi_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_PUT_NBI
|
||||
* @brief Writes contiguous data of \p nelems elements from \p source on the
|
||||
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
|
||||
* will return as soon as the request is posted. The caller must call
|
||||
* rocshmem_quiet() on the same context if completion notification is
|
||||
* required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += put_nbi_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def get_nbi_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_nbi(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_nbi(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_get_nbi(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__host__ void rocshmem_{TNAME}_get_nbi({T} *dest,\n"
|
||||
f" const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_get_nbi_api():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @name SHMEM_GET_NBI
|
||||
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
|
||||
* to \p dest on the calling PE. The operation is not blocking. The caller will
|
||||
* return as soon as the request is posted. The caller must call
|
||||
* rocshmem_quiet() on the same context if completion notification is
|
||||
* required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-thread
|
||||
* granularity. However, performance may be improved if the caller can
|
||||
* coalesce contiguous messages and elect a leader thread to call into the
|
||||
* ROCSHMEM function.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += get_nbi_api(type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def write_to_file(filename, content):
|
||||
with open(filename, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
|
||||
def generate_RMA_header(output_dir, copyright):
|
||||
expanded_code = copyright
|
||||
|
||||
expanded_code += """
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
"""
|
||||
|
||||
expanded_code += (
|
||||
generate_put_api() +
|
||||
generate_p_api() +
|
||||
generate_get_api() +
|
||||
generate_g_api() +
|
||||
generate_put_nbi_api() +
|
||||
generate_get_nbi_api()
|
||||
)
|
||||
|
||||
expanded_code += """
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
|
||||
"""
|
||||
|
||||
output_file = os.path.join(
|
||||
output_dir, 'rocshmem_RMA.hpp'
|
||||
)
|
||||
|
||||
write_to_file(output_file, expanded_code)
|
||||
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
types = [
|
||||
("float", "float"),
|
||||
("double", "double"),
|
||||
("char", "char"),
|
||||
("signed char", "schar"),
|
||||
("short", "short"),
|
||||
("int", "int"),
|
||||
("long", "long"),
|
||||
("long long", "longlong"),
|
||||
("unsigned char", "uchar"),
|
||||
("unsigned short", "ushort"),
|
||||
("unsigned int", "uint"),
|
||||
("unsigned long", "ulong"),
|
||||
("unsigned long long", "ulonglong"),
|
||||
]
|
||||
|
||||
|
||||
def put_api_x(GRAN, T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_{GRAN}(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_{GRAN}(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_put_api_x():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @brief Writes contiguous data of \p nelems elements from \p source on the
|
||||
* calling PE to \p dest at \p pe. The caller will block until the operation
|
||||
* completes locally (it is safe to reuse \p source). The caller must
|
||||
* call into rocshmem_quiet() if remote completion is required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-wave
|
||||
* granularity. However, all threads in a wave must collectively participate
|
||||
* in the call using the same arguments
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in number of elements.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += put_api_x("wave", type_, tname_)
|
||||
|
||||
expanded_code += """
|
||||
/**
|
||||
* @brief Writes contiguous data of \p nelems elements from \p source on the
|
||||
* calling PE to \p dest at \p pe. The caller will block until the operation
|
||||
* completes locally (it is safe to reuse \p source). The caller must
|
||||
* call into rocshmem_quiet() if remote completion is required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-workgroup
|
||||
* (WG) granularity. However, All threads in a WG must collectively participate
|
||||
* in the call using the same arguments.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in number of elements.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += put_api_x("wg", type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def get_api_x(GRAN, T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_{GRAN}(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_{GRAN}(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_get_api_x():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
|
||||
* to \p dest on the calling PE. The calling work-group will block until the
|
||||
* operation completes (data has been placed in \p dest).
|
||||
*
|
||||
* This function can be called from divergent control paths at per-wave
|
||||
* granularity. However, all threads in the wave must participate in the
|
||||
* call using the same parameters
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += get_api_x("wave", type_, tname_)
|
||||
|
||||
expanded_code += """
|
||||
/**
|
||||
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
|
||||
* to \p dest on the calling PE. The calling work-group will block until the
|
||||
* operation completes (data has been placed in \p dest).
|
||||
*
|
||||
* This function can be called from divergent control paths at per-workgroup
|
||||
* granularity. However, all threads in the workgroup must participate in
|
||||
* the call using the same parameters
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += get_api_x("wg", type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def put_nbi_api_x(GRAN, T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_nbi_{GRAN}(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_nbi_{GRAN}(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_put_nbi_api_x():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @brief Writes contiguous data of \p nelems elements from \p source on the
|
||||
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
|
||||
* will return as soon as the request is posted. The caller must call
|
||||
* rocshmem_quiet() on the same context if completion notification is
|
||||
* required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-wave
|
||||
* granularity. However, all threads in the wave must call in with the same
|
||||
* arguments.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += put_nbi_api_x("wave", type_, tname_)
|
||||
|
||||
expanded_code += """
|
||||
/**
|
||||
* @brief Writes contiguous data of \p nelems elements from \p source on the
|
||||
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
|
||||
* will return as soon as the request is posted. The caller must call
|
||||
* rocshmem_quiet() on the same context if completion notification is
|
||||
* required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-workgroup
|
||||
* granularity. However, all threads in the WG must call in with the sameo
|
||||
* arguments.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += put_nbi_api_x("wg", type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def get_nbi_api_x(GRAN, T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_get_nbi_{GRAN}(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source,\n"
|
||||
f" size_t nelems, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_get_nbi_{GRAN}(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def generate_get_nbi_api_x():
|
||||
expanded_code = """
|
||||
/**
|
||||
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
|
||||
* to \p dest on the calling PE. The operation is not blocking. The caller
|
||||
* will return as soon as the request is posted. The caller must call
|
||||
* rocshmem_quiet() on the same context if completion notification is
|
||||
* required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-wave
|
||||
* granularity. However, all threads in the wave must call in with the same
|
||||
* arguments.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += get_nbi_api_x("wave", type_, tname_)
|
||||
|
||||
expanded_code += """
|
||||
/**
|
||||
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
|
||||
* to \p dest on the calling PE. The operation is not blocking. The caller
|
||||
* will return as soon as the request is posted. The caller must call
|
||||
* rocshmem_quiet() on the same context if completion notification is
|
||||
* required.
|
||||
*
|
||||
* This function can be called from divergent control paths at per-workgroup
|
||||
* granularity. However, all threads in the WG must call in with the same
|
||||
* arguments.
|
||||
*
|
||||
* @param[in] ctx Context with which to perform this operation.
|
||||
* @param[in] dest Destination address. Must be an address on the symmetric
|
||||
* heap.
|
||||
* @param[in] source Source address. Must be an address on the symmetric heap.
|
||||
* @param[in] nelems Size of the transfer in bytes.
|
||||
* @param[in] pe PE of the remote process.
|
||||
*
|
||||
* @return void.
|
||||
*/\n"""
|
||||
for type_, tname_ in types:
|
||||
expanded_code += get_nbi_api_x("wg", type_, tname_)
|
||||
|
||||
return expanded_code
|
||||
|
||||
|
||||
def write_to_file(filename, content):
|
||||
with open(filename, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
|
||||
def generate_RMA_X_header(output_dir, copyright):
|
||||
expanded_code = copyright
|
||||
|
||||
expanded_code += """
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
"""
|
||||
|
||||
expanded_code += (
|
||||
generate_put_api_x() +
|
||||
generate_get_api_x() +
|
||||
generate_put_nbi_api_x() +
|
||||
generate_get_nbi_api_x()
|
||||
)
|
||||
|
||||
expanded_code += """
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
|
||||
"""
|
||||
|
||||
output_file = os.path.join(
|
||||
output_dir, 'rocshmem_RMA_X.hpp'
|
||||
)
|
||||
|
||||
write_to_file(output_file, expanded_code)
|
||||
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
types = [
|
||||
("float", "float"),
|
||||
("double", "double"),
|
||||
("char", "char"),
|
||||
("signed char", "schar"),
|
||||
("short", "short"),
|
||||
("int", "int"),
|
||||
("long", "long"),
|
||||
("long long", "longlong"),
|
||||
("unsigned char", "uchar"),
|
||||
("unsigned short", "ushort"),
|
||||
("unsigned int", "uint"),
|
||||
("unsigned long", "ulong"),
|
||||
("unsigned long long", "ulonglong"),
|
||||
]
|
||||
|
||||
|
||||
def putmem_signal_dec(SUFFIX):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_putmem_signal{SUFFIX}(\n"
|
||||
f" void *dest, const void *source, size_t nelems, uint64_t *sig_addr,\n"
|
||||
f" uint64_t signal, int sig_op, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal{SUFFIX}(\n"
|
||||
f" rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,\n"
|
||||
f" uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def put_signal_typed_dec(T, TNAME, SUFFIX):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_put_signal{SUFFIX}(\n"
|
||||
f" rocshmem_ctx_t ctx, {T} *dest, const {T} *source, size_t nelems,\n"
|
||||
f" uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_{TNAME}_put_signal{SUFFIX}(\n"
|
||||
f" {T} *dest, const {T} *source, size_t nelems, uint64_t *sig_addr,\n"
|
||||
f" uint64_t signal, int sig_op, int pe);\n\n"
|
||||
)
|
||||
|
||||
|
||||
def put_signal_dec(SUFFIX):
|
||||
return "".join([put_signal_typed_dec(T, TNAME, SUFFIX) for T, TNAME in types])
|
||||
|
||||
|
||||
def signaling_api_dec(SUFFIX):
|
||||
return (putmem_signal_dec(SUFFIX) + put_signal_dec(SUFFIX))
|
||||
|
||||
|
||||
def generate_signal_api():
|
||||
|
||||
suffixes = ["", "_wg", "_wave", "_nbi", "_nbi_wg", "_nbi_wave"]
|
||||
|
||||
return "".join([signaling_api_dec(suffix) for suffix in suffixes])
|
||||
|
||||
|
||||
def write_to_file(filename, content):
|
||||
with open(filename, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
|
||||
def generate_SIG_OP_header(output_dir, copyright):
|
||||
expanded_code = copyright
|
||||
|
||||
expanded_code += """
|
||||
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
|
||||
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
|
||||
|
||||
namespace rocshmem {
|
||||
"""
|
||||
|
||||
expanded_code += generate_signal_api()
|
||||
|
||||
expanded_code += """
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
|
||||
"""
|
||||
|
||||
output_file = os.path.join(
|
||||
output_dir, 'rocshmem_SIG_OP.hpp'
|
||||
)
|
||||
|
||||
write_to_file(output_file, expanded_code)
|
||||
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from RMA import generate_RMA_header
|
||||
from AMO import generate_AMO_header
|
||||
from SIG_OP import generate_SIG_OP_header
|
||||
from COLL import generate_COLL_header
|
||||
from P2P_SYNC import generate_P2P_SYNC_header
|
||||
from RMA_X import generate_RMA_X_header
|
||||
|
||||
|
||||
copyright = """
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
"""
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Generate an expanded header files.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'directory', type=str, nargs='?', default='.',
|
||||
help='Directory to write the header files to (default: current directory)'
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
generate_RMA_header(args.directory, copyright)
|
||||
generate_AMO_header(args.directory, copyright)
|
||||
generate_SIG_OP_header(args.directory, copyright)
|
||||
generate_COLL_header(args.directory, copyright)
|
||||
generate_P2P_SYNC_header(args.directory, copyright)
|
||||
generate_RMA_X_header(args.directory, copyright)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Ссылка в новой задаче
Block a user