diff --git a/AUTHORS b/AUTHORS index 65bac892f5..56ba925ac9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# This is the list of ROCSHMEM's significant contributors. +# This is the list of rocSHMEM's significant contributors. # # This does not necessarily list everyone who has contributed code, # especially since many employees of one corporation may be contributing. @@ -10,4 +10,4 @@ Michael LeBeane Rohit Zambre Kishore Punniyamurthy Ruchi Shah -Muhammad A. Awad \ No newline at end of file +Muhammad A. Awad diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ff7ba6b6d..5b9a9b33e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,8 +137,8 @@ set( rocshmem VERSION ${ROCSHMEM_VERSION} LANGUAGES CXX - DESCRIPTION "ROCSHMEM" - HOMEPAGE_URL "https://github.com/ROCm-Developer-Tools/ROC_SHMEM") + DESCRIPTION "rocSHMEM" + HOMEPAGE_URL "https://github.com/ROCm-Developer-Tools/rocSHMEM") ############################################################################### # DEFAULT BUILD TYPE @@ -190,7 +190,7 @@ set_target_properties( ${PROJECT_NAME} PROPERTIES PUBLIC_HEADER - "${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/roc_shmem/roc_shmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/roc_shmem/debug.hpp" + "${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp" ) ############################################################################### diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 51ad5d4913..9d6cd92e28 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,11 +1,11 @@ ## How to fork from us -To keep our development fast and conflict free, we recommend you to [fork](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/fork) our repository and start your work from our `dev` branch in your private repository. 
+To keep our development fast and conflict free, we recommend you to [fork](https://github.com/ROCm-Developer-Tools/rocSHMEM/fork) our repository and start your work from our `dev` branch in your private repository. Afterwards, git clone your repository to your local machine. But that is not it! To keep track of the original develop repository, add it as another remote. ``` -git remote add mainline https://github.com/ROCm-Developer-Tools/ROC_SHMEM.git +git remote add mainline https://github.com/ROCm-Developer-Tools/rocSHMEM.git git checkout dev ``` @@ -17,23 +17,23 @@ git checkout -b topic- and apply your changes there. -## How to contribute to ROCSHMEM +## How to contribute to rocSHMEM ### Did you find a bug? -- Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/issues). +- Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/ROCm-Developer-Tools/rocSHMEM/issues). -- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/issues/new). +- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ROCm-Developer-Tools/rocSHMEM/issues/new). ### Did you write a patch that fixes a bug? -- Open a new GitHub [pull request](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/compare) with the patch. +- Open a new GitHub [pull request](https://github.com/ROCm-Developer-Tools/rocSHMEM/compare) with the patch. - Ensure the PR description clearly describes the problem and solution. If there is an existing GitHub issue open describing this bug, please include it in the description so we can close it. -- Ensure the PR is based on the `dev` branch of the ROCSHMEM GitHub repository. +- Ensure the PR is based on the `dev` branch of the rocSHMEM GitHub repository. 
-- ROCSHMEM requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the projects's [Developer's Certificate of Origin](https://developercertificate.org/) and compatability with the project [LICENSE](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/blob/main/LICENSE): +- rocSHMEM requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the project's [Developer's Certificate of Origin](https://developercertificate.org/) and compatibility with the project [LICENSE](https://github.com/ROCm-Developer-Tools/rocSHMEM/blob/main/LICENSE): > (a) The contribution was created in whole or in part by me and I diff --git a/README.md b/README.md index ba671ff51a..395b5ff6ab 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,19 @@ -# ROCm OpenSHMEM (ROC_SHMEM) +# ROCm OpenSHMEM (rocSHMEM) -The ROCm OpenSHMEM (ROC_SHMEM) runtime is part of an AMD Research +The ROCm OpenSHMEM (rocSHMEM) runtime is part of an AMD Research initiative to provide a unified runtime for heterogeneous systems. -ROC_SHMEM supports both host-centric (a traditional host-driven +rocSHMEM supports both host-centric (a traditional host-driven OpenSHMEM runtime) and GPU-centric networking (provided a GPU kernel the ability to perform network operations) through an OpenSHMEM-like interface. This intra-kernel networking simplifies application code complexity and enables more fine-grained communication/computation overlap than traditional host-driven networking. -ROC_SHMEM's primary target is heterogeneous computing; hence, for both -CPU-centric and GPU-centric communications, ROC_SHMEM uses a single +rocSHMEM's primary target is heterogeneous computing; hence, for both +CPU-centric and GPU-centric communications, rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories. 
-ROC_SHMEM's GPU-centric communication has two different backend designs. +rocSHMEM's GPU-centric communication has two different backend designs. The backends primarily differ in their implementations of intra-kernel networking. @@ -24,23 +24,23 @@ the doorbell on the NIC to send network commands. GPU-IB is the default and preferred backend design that offers the best performance. The second design will be referred to as the Reverse Offload (RO) backend. With -the RO backend, the GPU runtime forwards ROC_SHMEM networking operations to the +the RO backend, the GPU runtime forwards rocSHMEM networking operations to the host-side runtime, which calls into a traditional MPI or OpenSHMEM implementation. This forwarding of requests is transparent to the programmer, who only sees the GPU-side interface. Both designs of the GPU-centric interface coexist seamlessly with the -CPU-centric interface of the unified runtime. ROC_SHMEM ensures that CPU-centric +CPU-centric interface of the unified runtime. rocSHMEM ensures that CPU-centric updates to the SHEAP are consistent and visible to a GPU kernel that is executing in parallel to host-initiated communication. ## Limitations -ROC_SHMEM is an experimental prototype from AMD Research and not an official +rocSHMEM is an experimental prototype from AMD Research and not an official ROCm product. The software is provided as-is with no guarantees of support from AMD or AMD Research. -ROC_SHMEM base requirements: +rocSHMEM base requirements: * ROCm version 4.3.1 onwards * May work with other versions, but not tested * AMD GFX9 GPUs (e.g.: MI25, Vega 56, Vega 64, MI50, MI60, MI100, Radeon VII) @@ -51,16 +51,16 @@ ROC_SHMEM base requirements: * InfiniBand adaptor compatable with ROCm RDMA technology * UCX 1.6 or greater with ROCm support -ROC_SHMEM optional requirements +rocSHMEM optional requirements * For Documentation: * Doxygen -ROC_SHMEM only supports HIP applications. 
There are no plans to port to +rocSHMEM only supports HIP applications. There are no plans to port to OpenCL. ## Building and Installation -ROC_SHMEM uses the CMake build system. The CMakeLists file contains +rocSHMEM uses the CMake build system. The CMakeLists file contains additional details about library options. To create an out-of-source build: @@ -84,34 +84,34 @@ custom install path by supplying it as an argument. For example: ../scripts/build_configs/rc_single /path/to/install -## Compiling/linking and Running with ROC_SHMEM +## Compiling/linking and Running with rocSHMEM -ROC_SHMEM is built as a host and device side library that can be statically +rocSHMEM is built as a host and device side library that can be statically linked to your application during compilation using hipcc. -During the compilation of your application, include the ROC_SHMEM header files -and the ROC_SHMEM library when using hipcc: +During the compilation of your application, include the rocSHMEM header files +and the rocSHMEM library when using hipcc: -I/path/to/rocshmem/install/include -L/path/to/rocshmem/install/lib -lrocshmem -NOTE: ROC_SHMEM depends on MPI for its host code. So, you will need to link +NOTE: rocSHMEM depends on MPI for its host code. So, you will need to link to an MPI library. Since you must use the hipcc compiler, the arguments for MPI linkage must be added manually as opposed to using mpicc. Similary, -ROC_SHMEM depends on Verbs for its device code. So, you will need to link +rocSHMEM depends on Verbs for its device code. So, you will need to link to a Verbs library. When using hipcc directly (as opposed to through a build system), we recommend performing the compilation and linking steps separately. Here are the steps to build a standalone program, say -roc_shmem_hello.cpp. +rocshmem_hello.cpp. 
``` # Compile -/opt/rocm/bin/hipcc ./roc_shmem_hello.cpp -I/path/to/rocshmem/install/include -fgpu-rdc -o ./roc_shmem_hello.o -c +/opt/rocm/bin/hipcc ./rocshmem_hello.cpp -I/path/to/rocshmem/install/include -fgpu-rdc -o ./rocshmem_hello.o -c # Link -/opt/rocm/bin/hipcc ./roc_shmem_hello.o /path/to/rocshmem/install/lib/librocshmem.a -lmpi -lmlx5 -libverbs -lhsa-runtime64 -fgpu-rdc -o roc_shmem_hello +/opt/rocm/bin/hipcc ./rocshmem_hello.o /path/to/rocshmem/install/lib/librocshmem.a -lmpi -lmlx5 -libverbs -lhsa-runtime64 -fgpu-rdc -o rocshmem_hello ``` @@ -122,20 +122,20 @@ page useful. ## Runtime Parameters - ROC_SHMEM_HEAP_SIZE (default : 1 GB) + ROCSHMEM_HEAP_SIZE (default : 1 GB) Defines the size of the OpenSHMEM symmetric heap Note the heap is on the GPU memory. - ROC_SHMEM_SQ_SIZE (default 1024) + ROCSHMEM_SQ_SIZE (default 1024) Defines the size of the SQ as number of network packet (WQE). Each WQE is 64B. This only for GPU-IB conduit - ROC_SHMEM_USE_CQ_GPU_MEM (default : 1) + ROCSHMEM_USE_CQ_GPU_MEM (default : 1) Set the placement of CQ on GPU memory (1) or CPU memory (0) - ROC_SHMEM_USE_SQ_GPU_MEM (default : 1) + ROCSHMEM_USE_SQ_GPU_MEM (default : 1) Set the placement of SQ on GPU memory (1) or CPU memory (0) @@ -143,13 +143,13 @@ page useful. Force producer/consumer queues between CPU and GPU to be in CPU memory. RO backend only. -ROC_SHMEM also requires the following environment variable be set for ROCm: +rocSHMEM also requires the following environment variable be set for ROCm: export HSA_FORCE_FINE_GRAIN_PCIE=1 ## Documentation -To generate doxygen documentation for ROC_SHMEM's API, run the following +To generate doxygen documentation for rocSHMEM's API, run the following from the library's build directory: make docs @@ -158,13 +158,13 @@ The doxygen output will be in the `docs` folder of the build directory. 
## Examples -ROC_SHMEM is similar to OpenSHMEM and should be familiar to programmers who +rocSHMEM is similar to OpenSHMEM and should be familiar to programmers who have experience with OpenSHMEM or other PGAS network programming APIs in the -context of CPUs. The best way to learn how to use ROC_SHMEM is to read the +context of CPUs. The best way to learn how to use rocSHMEM is to read the autogenerated doxygen documentation for functions described in -`roc_shmem/roc_shmem.hpp`, or to look at the provided sample applications in the -`tests/` folder. ROC_SHMEM is shipped with a basic test suite for the -supported ROC_SHMEM API. The examples test Puts, Gets, nonblocking Puts, +`rocshmem/rocshmem.hpp`, or to look at the provided sample applications in the +`tests/` folder. rocSHMEM is shipped with a basic test suite for the +supported rocSHMEM API. The examples test Puts, Gets, nonblocking Puts, nonblocking Gets, Quiets, Atomics, Tests, Wai-untils, Broadcasts, and Reductions. @@ -178,7 +178,7 @@ Here are some example uses of the driver script: ## Building the Dependencies -ROC_SHMEM requires an MPI runtime on the host that supports ROCm-Aware MPI. +rocSHMEM requires an MPI runtime on the host that supports ROCm-Aware MPI. Currently all ROCm-Aware MPI runtimes require the usage of ROCm-Aware UCX. To build and configure ROCm-Aware UCX, you need to: diff --git a/cmake/rocshmem.lua.in b/cmake/rocshmem.lua.in index 2a6674685e..6adca9793d 100644 --- a/cmake/rocshmem.lua.in +++ b/cmake/rocshmem.lua.in @@ -1,6 +1,6 @@ local help_message = [[ -ROC_SHMEM is an open-source GPU initiated networking library +rocSHMEM is an open-source GPU initiated networking library for High Performance Computing and Machine Learning workloads. 
Version @ROCSHMEM_FULL_VERSION@ @@ -12,7 +12,7 @@ whatis("Name: rocshmem") whatis("Version: @ROCSHMEM_FULL_VERSION@") whatis("Keywords: GPU, PGAS, RMA, HPC") whatis("Description: tool for GPU initiated networking") -whatis("URL: https://github.com/ROCm-Developer-Tools/ROC_SHMEM") +whatis("URL: https://github.com/ROCm-Developer-Tools/rocSHMEM") -- Export environmental variables local topDir="@CMAKE_INSTALL_PREFIX@" diff --git a/docs/host_facing_support.txt b/docs/host_facing_support.txt index a01ebb16dc..ea9813b181 100644 --- a/docs/host_facing_support.txt +++ b/docs/host_facing_support.txt @@ -33,7 +33,7 @@ NIC (this does occur during MPI_Win_create), we are good. UCX claims to not support GPU-aware communication because they have not added in support for the different types of scenarios that could exist in a system (eg, when a system does not have GPU-direct). The -scope of ROC_SHMEM is currenlty limited to configurations that UCX +scope of rocSHMEM is currently limited to configurations that UCX already supports. ## But the main branch of MPICH does support HIP now? 
diff --git a/examples/rocshmem_allreduce_test.cc b/examples/rocshmem_allreduce_test.cc index 671130d1f7..0314e34cdb 100644 --- a/examples/rocshmem_allreduce_test.cc +++ b/examples/rocshmem_allreduce_test.cc @@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_allreduce_test.o -o rocshmem_allreduce_test $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \ -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 -ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test +ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test */ #include #include #include -#include +#include #define CHECK_HIP(condition) { \ hipError_t error = condition; \ @@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test using namespace rocshmem; __global__ void allreduce_test(int *source, int *dest, size_t nelem, - roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; + rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; int64_t ctx_type = 0; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); - int num_pes = roc_shmem_ctx_n_pes(ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); + int num_pes = rocshmem_ctx_n_pes(ctx); - roc_shmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem); + rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem); - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); __syncthreads(); - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } static void init_sendbuf (int *source, int nelem, int my_pe) @@ -82,23 +82,23 @@ int main (int argc, char **argv) nelem = atoi(argv[1]); } - int my_pe = roc_shmem_my_pe(); - int npes = roc_shmem_n_pes(); + int my_pe = rocshmem_my_pe(); + int npes = rocshmem_n_pes(); int ndevices, my_device = 0; CHECK_HIP(hipGetDeviceCount(&ndevices)); my_device = my_pe % ndevices; CHECK_HIP(hipSetDevice(my_device)); - roc_shmem_init(); + rocshmem_init(); - int *source = (int *)roc_shmem_malloc(nelem * 
sizeof(int)); - int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int)); + int *source = (int *)rocshmem_malloc(nelem * sizeof(int)); + int *dest = (int *)rocshmem_malloc(nelem * sizeof(int)); if (NULL == source || NULL == dest) { std::cout << "Error allocating memory from symmetric heap" << std::endl; std::cout << "source: " << source << ", dest: " << dest << ", size: " << sizeof(int) * nelem << std::endl; - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } init_sendbuf(source, nelem, my_pe); @@ -106,9 +106,9 @@ int main (int argc, char **argv) dest[i] = -1; } - roc_shmem_team_t team_reduce_world_dup; - team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, + rocshmem_team_t team_reduce_world_dup; + team_reduce_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, &team_reduce_world_dup); CHECK_HIP(hipDeviceSynchronize()); @@ -121,9 +121,9 @@ int main (int argc, char **argv) bool pass = check_recvbuf(dest, nelem, my_pe, npes); printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? 
"[PASS]" : "[FAIL]"); - roc_shmem_free(source); - roc_shmem_free(dest); + rocshmem_free(source); + rocshmem_free(dest); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/examples/rocshmem_alltoall_test.cc b/examples/rocshmem_alltoall_test.cc index 6f48f07653..30dcf5da7b 100644 --- a/examples/rocshmem_alltoall_test.cc +++ b/examples/rocshmem_alltoall_test.cc @@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_alltoall_test.o -o rocshmem_alltoall_test \ $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \ -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 -ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test +ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test */ #include #include #include -#include +#include #define CHECK_HIP(condition) { \ hipError_t error = condition; \ @@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test using namespace rocshmem; __global__ void alltoall_test(int *source, int *dest, size_t nelem, - roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; + rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; int64_t ctx_type = 0; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); - int num_pes = roc_shmem_ctx_n_pes(ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); + int num_pes = rocshmem_ctx_n_pes(ctx); - roc_shmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem); + rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem); - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); __syncthreads(); - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } static void init_sendbuf (int *source, int nelem, int my_pe, int npes) @@ -87,23 +87,23 @@ int main (int argc, char **argv) nelem = atoi(argv[1]); } - int my_pe = roc_shmem_my_pe(); - int npes = roc_shmem_n_pes(); + int my_pe = rocshmem_my_pe(); + int npes = rocshmem_n_pes(); int ndevices, my_device = 0; 
CHECK_HIP(hipGetDeviceCount(&ndevices)); my_device = my_pe % ndevices; CHECK_HIP(hipSetDevice(my_device)); - roc_shmem_init(); + rocshmem_init(); - int *source = (int *)roc_shmem_malloc(nelem * npes * sizeof(int)); - int *dest = (int *)roc_shmem_malloc(nelem * npes * sizeof(int)); + int *source = (int *)rocshmem_malloc(nelem * npes * sizeof(int)); + int *dest = (int *)rocshmem_malloc(nelem * npes * sizeof(int)); if (NULL == source || NULL == dest) { std::cout << "Error allocating memory from symmetric heap" << std::endl; std::cout << "source: " << source << ", dest: " << dest << ", size: " << sizeof(int) * nelem * npes << std::endl; - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } init_sendbuf(source, nelem, my_pe, npes); @@ -111,9 +111,9 @@ int main (int argc, char **argv) dest[i] = -1; } - roc_shmem_team_t team_reduce_world_dup; - team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, + rocshmem_team_t team_reduce_world_dup; + team_reduce_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, &team_reduce_world_dup); CHECK_HIP(hipDeviceSynchronize()); @@ -127,9 +127,9 @@ int main (int argc, char **argv) printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? 
"[PASS]" : "[FAIL]"); - roc_shmem_free(source); - roc_shmem_free(dest); + rocshmem_free(source); + rocshmem_free(dest); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/examples/rocshmem_broadcast_test.cc b/examples/rocshmem_broadcast_test.cc index c66eca48f7..b87f982fb4 100644 --- a/examples/rocshmem_broadcast_test.cc +++ b/examples/rocshmem_broadcast_test.cc @@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_broadcast_test.o -o rocshmem_broadcast_test $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \ -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 -ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test +ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test */ #include #include #include -#include +#include #define CHECK_HIP(condition) { \ hipError_t error = condition; \ @@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test using namespace rocshmem; __global__ void broadcast_test(int *source, int *dest, size_t nelem, - int root, roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; + int root, rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; int64_t ctx_type = 0; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); - int num_pes = roc_shmem_ctx_n_pes(ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); + int num_pes = rocshmem_ctx_n_pes(ctx); - roc_shmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root); + rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root); - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); __syncthreads(); - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } static void init_sendbuf(int *source, int nelem, int my_pe) @@ -80,23 +80,23 @@ int main(int argc, char **argv) nelem = atoi(argv[1]); } - int my_pe = roc_shmem_my_pe(); - int npes = roc_shmem_n_pes(); + int my_pe = rocshmem_my_pe(); + int npes = rocshmem_n_pes(); int ndevices, 
my_device = 0; CHECK_HIP(hipGetDeviceCount(&ndevices)); my_device = my_pe % ndevices; CHECK_HIP(hipSetDevice(my_device)); - roc_shmem_init(); + rocshmem_init(); - int *source = (int *)roc_shmem_malloc(nelem * sizeof(int)); - int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int)); + int *source = (int *)rocshmem_malloc(nelem * sizeof(int)); + int *dest = (int *)rocshmem_malloc(nelem * sizeof(int)); if (NULL == source || NULL == dest) { std::cout << "Error allocating memory from symmetric heap" << std::endl; std::cout << "source: " << source << ", dest: " << dest << ", size: " << sizeof(int) * nelem << std::endl; - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } init_sendbuf(source, nelem, my_pe); @@ -105,9 +105,9 @@ int main(int argc, char **argv) } int root = 0; - roc_shmem_team_t team_reduce_world_dup; - team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, + rocshmem_team_t team_reduce_world_dup; + team_reduce_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, &team_reduce_world_dup); CHECK_HIP(hipDeviceSynchronize()); @@ -122,9 +122,9 @@ int main(int argc, char **argv) printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? 
"[PASS]" : "[FAIL]"); } - roc_shmem_free(source); - roc_shmem_free(dest); + rocshmem_free(source); + rocshmem_free(dest); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/examples/rocshmem_getmem_test.cc b/examples/rocshmem_getmem_test.cc index e041520c01..75213802ab 100644 --- a/examples/rocshmem_getmem_test.cc +++ b/examples/rocshmem_getmem_test.cc @@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_getmem_test.o -o rocshmem_getmem_test \ $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \ -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 -ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 ./rocshmem_getmem_test +ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 ./rocshmem_getmem_test */ #include #include #include -#include +#include #define CHECK_HIP(condition) { \ hipError_t error = condition; \ @@ -31,25 +31,25 @@ using namespace rocshmem; __global__ void simple_getmem_test(int *src, int *dst, size_t nelem) { - roc_shmem_wg_init(); + rocshmem_wg_init(); int threadId = blockIdx.x * blockDim.x + threadIdx.x; if (threadId == 0) { - int rank = roc_shmem_my_pe(); + int rank = rocshmem_my_pe(); int peer = rank ? 
0 : 1; - roc_shmem_getmem(dst, src, nelem * sizeof(int), peer); - roc_shmem_quiet(); + rocshmem_getmem(dst, src, nelem * sizeof(int), peer); + rocshmem_quiet(); } __syncthreads(); - roc_shmem_wg_finalize(); + rocshmem_wg_finalize(); } #define MAX_ELEM 256 int main (int argc, char **argv) { - int rank = roc_shmem_my_pe(); + int rank = rocshmem_my_pe(); int ndevices, my_device = 0; CHECK_HIP(hipGetDeviceCount(&ndevices)); my_device = rank % ndevices; @@ -60,15 +60,15 @@ int main (int argc, char **argv) nelem = atoi(argv[1]); } - roc_shmem_init(); - int npes = roc_shmem_n_pes(); - int *src = (int *)roc_shmem_malloc(nelem * sizeof(int)); - int *dst = (int *)roc_shmem_malloc(nelem * sizeof(int)); + rocshmem_init(); + int npes = rocshmem_n_pes(); + int *src = (int *)rocshmem_malloc(nelem * sizeof(int)); + int *dst = (int *)rocshmem_malloc(nelem * sizeof(int)); if (NULL == src || NULL == dst) { std::cout << "Error allocating memory from symmetric heap" << std::endl; std::cout << "source: " << src << ", dest: " << dst << ", size: " << sizeof(int) * nelem << std::endl; - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } for (int i=0; i>>(src, dst, nelem); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); CHECK_HIP(hipDeviceSynchronize()); bool pass = true; @@ -93,8 +93,8 @@ int main (int argc, char **argv) } printf("Test %s \t %s\n", argv[0], pass ? 
"[PASS]" : "[FAIL]"); - roc_shmem_free(src); - roc_shmem_free(dst); - roc_shmem_finalize(); + rocshmem_free(src); + rocshmem_free(dst); + rocshmem_finalize(); return 0; } diff --git a/include/roc_shmem/debug.hpp b/include/rocshmem/debug.hpp similarity index 100% rename from include/roc_shmem/debug.hpp rename to include/rocshmem/debug.hpp diff --git a/include/roc_shmem/roc_shmem.hpp b/include/rocshmem/rocshmem.hpp similarity index 78% rename from include/roc_shmem/roc_shmem.hpp rename to include/rocshmem/rocshmem.hpp index d5e90ab0db..f766b28551 100644 --- a/include/roc_shmem/roc_shmem.hpp +++ b/include/rocshmem/rocshmem.hpp @@ -20,16 +20,16 @@ * IN THE SOFTWARE. *****************************************************************************/ -#ifndef LIBRARY_INCLUDE_ROC_SHMEM_HPP -#define LIBRARY_INCLUDE_ROC_SHMEM_HPP +#ifndef LIBRARY_INCLUDE_ROCSHMEM_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_HPP #include #include #include "config.h" /** - * @file roc_shmem.hpp - * @brief Public header for ROC_SHMEM device and host libraries. + * @file rocshmem.hpp + * @brief Public header for rocSHMEM device and host libraries. * * This file contains all the callable functions and data structures for both * the device-side runtime and host-side runtime. @@ -50,117 +50,117 @@ namespace rocshmem { #endif -enum ROC_SHMEM_STATUS { - ROC_SHMEM_SUCCESS = 0, - ROC_SHMEM_ERROR = 1, +enum ROCSHMEM_STATUS { + ROCSHMEM_SUCCESS = 0, + ROCSHMEM_ERROR = 1, }; -enum ROC_SHMEM_OP { - ROC_SHMEM_SUM, - ROC_SHMEM_MAX, - ROC_SHMEM_MIN, - ROC_SHMEM_PROD, - ROC_SHMEM_AND, - ROC_SHMEM_OR, - ROC_SHMEM_XOR, - ROC_SHMEM_REPLACE +enum ROCSHMEM_OP { + ROCSHMEM_SUM, + ROCSHMEM_MAX, + ROCSHMEM_MIN, + ROCSHMEM_PROD, + ROCSHMEM_AND, + ROCSHMEM_OR, + ROCSHMEM_XOR, + ROCSHMEM_REPLACE }; /** - * @brief Types defined for roc_shmem_wait() operations. + * @brief Types defined for rocshmem_wait() operations. 
*/ -enum roc_shmem_cmps { - ROC_SHMEM_CMP_EQ, - ROC_SHMEM_CMP_NE, - ROC_SHMEM_CMP_GT, - ROC_SHMEM_CMP_GE, - ROC_SHMEM_CMP_LT, - ROC_SHMEM_CMP_LE, +enum rocshmem_cmps { + ROCSHMEM_CMP_EQ, + ROCSHMEM_CMP_NE, + ROCSHMEM_CMP_GT, + ROCSHMEM_CMP_GE, + ROCSHMEM_CMP_LT, + ROCSHMEM_CMP_LE, }; -enum roc_shmem_thread_ops { - ROC_SHMEM_THREAD_SINGLE, - ROC_SHMEM_THREAD_FUNNELED, - ROC_SHMEM_THREAD_WG_FUNNELED, - ROC_SHMEM_THREAD_SERIALIZED, - ROC_SHMEM_THREAD_MULTIPLE +enum rocshmem_thread_ops { + ROCSHMEM_THREAD_SINGLE, + ROCSHMEM_THREAD_FUNNELED, + ROCSHMEM_THREAD_WG_FUNNELED, + ROCSHMEM_THREAD_SERIALIZED, + ROCSHMEM_THREAD_MULTIPLE }; /** * @brief Bitwise flags to mask configuration parameters. */ -enum roc_shmem_team_configs { - ROC_SHMEM_TEAM_DEFAULT_CONFIGS, - ROC_SHMEM_TEAM_NUM_CONTEXTS +enum rocshmem_team_configs { + ROCSHMEM_TEAM_DEFAULT_CONFIGS, + ROCSHMEM_TEAM_NUM_CONTEXTS }; typedef struct { int num_contexts; -} roc_shmem_team_config_t; +} rocshmem_team_config_t; -constexpr size_t ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024; -constexpr size_t ROC_SHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024); -constexpr size_t ROC_SHMEM_BARRIER_SYNC_SIZE = 256; -constexpr size_t ROC_SHMEM_REDUCE_SYNC_SIZE = 256; +constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024; +constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024); +constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256; +constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256; // Internally calls sync function, which matches barrier implementation -constexpr size_t ROC_SHMEM_BCAST_SYNC_SIZE = ROC_SHMEM_BARRIER_SYNC_SIZE; -constexpr size_t ROC_SHMEM_ALLTOALL_SYNC_SIZE = ROC_SHMEM_BARRIER_SYNC_SIZE + 1; -constexpr size_t ROC_SHMEM_FCOLLECT_SYNC_SIZE = ROC_SHMEM_ALLTOALL_SYNC_SIZE; -constexpr size_t ROC_SHMEM_SYNC_VALUE = 0; +constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE; +constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1; +constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = 
ROCSHMEM_ALLTOALL_SYNC_SIZE; +constexpr size_t ROCSHMEM_SYNC_VALUE = 0; -const int ROC_SHMEM_CTX_ZERO = 0; -const int ROC_SHMEM_CTX_NOSTORE = 1; -const int ROC_SHMEM_CTX_SERIALIZED = 2; -const int ROC_SHMEM_CTX_WG_PRIVATE = 4; -const int ROC_SHMEM_CTX_SHARED = 8; +const int ROCSHMEM_CTX_ZERO = 0; +const int ROCSHMEM_CTX_NOSTORE = 1; +const int ROCSHMEM_CTX_SERIALIZED = 2; +const int ROCSHMEM_CTX_WG_PRIVATE = 4; +const int ROCSHMEM_CTX_SHARED = 8; /** * @brief GPU side OpenSHMEM context created from each work-groups' - * roc_shmem_wg_handle_t + * rocshmem_wg_handle_t */ typedef struct { void *ctx_opaque; void *team_opaque; -} roc_shmem_ctx_t; +} rocshmem_ctx_t; /** * Shmem default context. */ -extern __constant__ roc_shmem_ctx_t ROC_SHMEM_CTX_DEFAULT; +extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT; /** * Used internally to set default context. */ -void set_internal_ctx(roc_shmem_ctx_t *ctx); +void set_internal_ctx(rocshmem_ctx_t *ctx); -typedef uint64_t *roc_shmem_team_t; -extern roc_shmem_team_t ROC_SHMEM_TEAM_WORLD; +typedef uint64_t *rocshmem_team_t; +extern rocshmem_team_t ROCSHMEM_TEAM_WORLD; -const roc_shmem_team_t ROC_SHMEM_TEAM_INVALID = nullptr; +const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr; /****************************************************************************** **************************** HOST INTERFACE ********************************** *****************************************************************************/ /** - * @brief Initialize the ROC_SHMEM runtime and underlying transport layer. + * @brief Initialize the rocSHMEM runtime and underlying transport layer. 
* - * @param[in] comm (Optional) MPI Communicator that ROC_SHMEM will be using - * If MPI_COMM_NULL, ROC_SHMEM will be using MPI_COMM_WORLD + * @param[in] comm (Optional) MPI Communicator that rocSHMEM will be using + * If MPI_COMM_NULL, rocSHMEM will be using MPI_COMM_WORLD */ -__host__ void roc_shmem_init(MPI_Comm comm = MPI_COMM_WORLD); +__host__ void rocshmem_init(MPI_Comm comm = MPI_COMM_WORLD); /** - * @brief Initialize the ROC_SHMEM runtime and underlying transport layer + * @brief Initialize the rocSHMEM runtime and underlying transport layer * with an attempt to enable the requested thread support. * - * @param[in] requested Requested thread mode (from roc_shmem_thread_ops) + * @param[in] requested Requested thread mode (from rocshmem_thread_ops) * for host-facing functions. * @param[out] provided Thread mode selected by the runtime. May not be equal * to requested thread mode. - * @param[in] comm (Optional) MPI Communicator that ROC_SHMEM will be using - * If MPI_COMM_NULL, ROC_SHMEM will be using MPI_COMM_WORLD + * @param[in] comm (Optional) MPI Communicator that rocSHMEM will be using + * If MPI_COMM_NULL, rocSHMEM will be using MPI_COMM_WORLD */ -__host__ void roc_shmem_init_thread(int requested, int *provided, +__host__ void rocshmem_init_thread(int requested, int *provided, MPI_Comm comm = MPI_COMM_WORLD); /** @@ -170,22 +170,22 @@ __host__ void roc_shmem_init_thread(int requested, int *provided, * * @return void. */ -__host__ void roc_shmem_query_thread(int *provided); +__host__ void rocshmem_query_thread(int *provided); /** * @brief Function that dumps internal stats to stdout. */ -__host__ void roc_shmem_dump_stats(); +__host__ void rocshmem_dump_stats(); /** * @brief Reset all internal stats. */ -__host__ void roc_shmem_reset_stats(); +__host__ void rocshmem_reset_stats(); /** - * @brief Finalize the ROC_SHMEM runtime. + * @brief Finalize the rocSHMEM runtime. 
*/ -__host__ void roc_shmem_finalize(); +__host__ void rocshmem_finalize(); /** * @brief Allocate memory of \p size bytes from the symmetric heap. @@ -197,7 +197,7 @@ __host__ void roc_shmem_finalize(); * * @todo Return error code instead of ptr. */ -__host__ void *roc_shmem_malloc(size_t size); +__host__ void *rocshmem_malloc(size_t size); /** * @brief Free a memory allocation from the symmetric heap. @@ -205,21 +205,21 @@ __host__ void *roc_shmem_malloc(size_t size); * * @param[in] ptr Pointer to previously allocated memory on the symmetric heap. */ -__host__ void roc_shmem_free(void *ptr); +__host__ void rocshmem_free(void *ptr); /** * @brief Query for the number of PEs. * * @return Number of PEs. */ -__host__ int roc_shmem_n_pes(); +__host__ int rocshmem_n_pes(); /** * @brief Query the PE ID of the caller. * * @return PE ID of the caller. */ -__host__ int roc_shmem_my_pe(); +__host__ int rocshmem_my_pe(); /** * @brief Creates an OpenSHMEM context. @@ -229,7 +229,7 @@ __host__ int roc_shmem_my_pe(); * * @return Zero on success and nonzero otherwise. */ -__host__ int roc_shmem_ctx_create(int64_t options, roc_shmem_ctx_t *ctx); +__host__ int rocshmem_ctx_create(int64_t options, rocshmem_ctx_t *ctx); /** * @brief Destroys an OpenSHMEM context. @@ -238,7 +238,7 @@ __host__ int roc_shmem_ctx_create(int64_t options, roc_shmem_ctx_t *ctx); * * @return void. */ -__host__ void roc_shmem_ctx_destroy(roc_shmem_ctx_t ctx); +__host__ void rocshmem_ctx_destroy(rocshmem_ctx_t ctx); /** * @brief Translate the PE in src_team to that in dest_team. @@ -251,8 +251,8 @@ __host__ void roc_shmem_ctx_destroy(roc_shmem_ctx_t ctx); * or if src_pe is not in both source and destination * teams, a value of -1 is returned. */ -__host__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, int src_pe, - roc_shmem_team_t dest_team); +__host__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe, + rocshmem_team_t dest_team); /** * @brief Query the number of PEs in a team. 
@@ -261,7 +261,7 @@ __host__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, int src_pe, * * @return Number of PEs in the provided team. */ -__host__ int roc_shmem_team_n_pes(roc_shmem_team_t team); +__host__ int rocshmem_team_n_pes(rocshmem_team_t team); /** * @brief Query the PE ID of the caller in a team. @@ -270,7 +270,7 @@ __host__ int roc_shmem_team_n_pes(roc_shmem_team_t team); * * @return PE ID of the caller in the provided team. */ -__host__ int roc_shmem_team_my_pe(roc_shmem_team_t team); +__host__ int rocshmem_team_my_pe(rocshmem_team_t team); /** * @brief Create a new a team of PEs. Must be called by all PEs @@ -291,15 +291,15 @@ __host__ int roc_shmem_team_my_pe(roc_shmem_team_t team); * @param[out] new_team Pointer to the newly created team. If an error * occurs during team creation, or if the PE in * the parent team is not in the new team, the - * value will be ROC_SHMEM_TEAM_INVALID. + * value will be ROCSHMEM_TEAM_INVALID. * * @return Zero upon successful team creation; non-zero if erroneous. */ -__host__ int roc_shmem_team_split_strided(roc_shmem_team_t parent_team, +__host__ int rocshmem_team_split_strided(rocshmem_team_t parent_team, int start, int stride, int size, - const roc_shmem_team_config_t *config, + const rocshmem_team_config_t *config, long config_mask, - roc_shmem_team_t *new_team); + rocshmem_team_t *new_team); /** * @brief Destroy a team. Must be called by all PEs in the team. @@ -309,19 +309,19 @@ __host__ int roc_shmem_team_split_strided(roc_shmem_team_t parent_team, * created from the referenced team. * * @param[in] team The team to destroy. The behavior is undefined if - * the input team is ROC_SHMEM_TEAM_WORLD or any other - * invalid team. If the input is ROC_SHMEM_TEAM_INVALID, + * the input team is ROCSHMEM_TEAM_WORLD or any other + * invalid team. If the input is ROCSHMEM_TEAM_INVALID, * this function will not perform any operation. * * @return None. 
*/ -__host__ void roc_shmem_team_destroy(roc_shmem_team_t team); +__host__ void rocshmem_team_destroy(rocshmem_team_t team); /** * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into __host__ roc_shmem_quiet() if remote completion is required. + * call into __host__ rocshmem_quiet() if remote completion is required. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -332,17 +332,17 @@ __host__ void roc_shmem_team_destroy(roc_shmem_team_t team); * * @return void. */ -__host__ void roc_shmem_ctx_putmem(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__host__ void roc_shmem_putmem(void *dest, const void *source, size_t nelems, +__host__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, int pe); /** * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * _host__ roc_shmem_quiet() if completion notification is required. + * _host__ rocshmem_quiet() if completion notification is required. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -353,11 +353,11 @@ __host__ void roc_shmem_putmem(void *dest, const void *source, size_t nelems, * * @return void. 
*/ -__host__ void roc_shmem_ctx_putmem_nbi(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__host__ void roc_shmem_putmem_nbi(void *dest, const void *source, +__host__ void rocshmem_putmem_nbi(void *dest, const void *source, size_t nelems, int pe); /** @@ -368,7 +368,7 @@ __host__ void roc_shmem_putmem_nbi(void *dest, const void *source, * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -379,23 +379,23 @@ __host__ void roc_shmem_putmem_nbi(void *dest, const void *source, * * @return void. */ -__host__ void roc_shmem_ctx_getmem(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__host__ void roc_shmem_getmem(void *dest, const void *source, size_t nelems, +__host__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, int pe); /** * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller will * return as soon as the request is posted. The caller must call - * __host__ roc_shmem_quiet() on the same context if completion notification is + * __host__ rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. 
* * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -406,11 +406,11 @@ __host__ void roc_shmem_getmem(void *dest, const void *source, size_t nelems, * * @return void. */ -__host__ void roc_shmem_ctx_getmem_nbi(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__host__ void roc_shmem_getmem_nbi(void *dest, const void *source, +__host__ void rocshmem_getmem_nbi(void *dest, const void *source, size_t nelems, int pe); /** @@ -421,9 +421,9 @@ __host__ void roc_shmem_getmem_nbi(void *dest, const void *source, * * @return void. */ -__host__ void roc_shmem_ctx_fence(roc_shmem_ctx_t ctx); +__host__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx); -__host__ void roc_shmem_fence(); +__host__ void rocshmem_fence(); /** * @brief Completes all previous operations posted on the host. @@ -432,9 +432,9 @@ __host__ void roc_shmem_fence(); * * @return void. */ -__host__ void roc_shmem_ctx_quiet(roc_shmem_ctx_t ctx); +__host__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx); -__host__ void roc_shmem_quiet(); +__host__ void rocshmem_quiet(); /** * @brief perform a collective barrier between all PEs in the system. @@ -442,7 +442,7 @@ __host__ void roc_shmem_quiet(); * * @return void */ -__host__ void roc_shmem_barrier_all(); +__host__ void rocshmem_barrier_all(); /** * @brief registers the arrival of a PE at a barrier. @@ -454,7 +454,7 @@ __host__ void roc_shmem_barrier_all(); * * @return void */ -__host__ void roc_shmem_sync_all(); +__host__ void rocshmem_sync_all(); /** * @brief allows any PE to force the termination of an entire program. 
@@ -463,39 +463,39 @@ __host__ void roc_shmem_sync_all(); * * @return void */ -__host__ void roc_shmem_global_exit(int status); +__host__ void rocshmem_global_exit(int status); /****************************************************************************** **************************** DEVICE INTERFACE ******************************** *****************************************************************************/ /** - * @brief Initializes device-side ROC_SHMEM resources. Must be called before - * any threads in this work-group invoke other ROC_SHMEM functions. + * @brief Initializes device-side rocSHMEM resources. Must be called before + * any threads in this work-group invoke other rocSHMEM functions. * * Must be called collectively by all threads in the work-group. * * @return void. */ -__device__ void roc_shmem_wg_init(); +__device__ void rocshmem_wg_init(); /** - * @brief Finalizes device-side ROC_SHMEM resources. Must be called before - * work-group completion if the work-group also called roc_shmem_wg_init(). + * @brief Finalizes device-side rocSHMEM resources. Must be called before + * work-group completion if the work-group also called rocshmem_wg_init(). * * Must be called collectively by all threads in the work-group. * * @return void. */ -__device__ void roc_shmem_wg_finalize(); +__device__ void rocshmem_wg_finalize(); /** - * @brief Initializes device-side ROC_SHMEM resources. Must be called before - * any threads in this work-group invoke other ROC_SHMEM functions. This is - * a variant of roc_shmem_wg_init that allows the caller to request a + * @brief Initializes device-side rocSHMEM resources. Must be called before + * any threads in this work-group invoke other rocSHMEM functions. This is + * a variant of rocshmem_wg_init that allows the caller to request a * threading mode. * - * @param[in] requested Requested thread mode from roc_shmem_thread_ops. + * @param[in] requested Requested thread mode from rocshmem_thread_ops. 
* @param[out] provided Thread mode selected by the runtime. May not be equal * to requested thread mode. * @@ -503,7 +503,7 @@ __device__ void roc_shmem_wg_finalize(); * * @return void. */ -__device__ void roc_shmem_wg_init_thread(int requested, int *provided); +__device__ void rocshmem_wg_init_thread(int requested, int *provided); /** * @brief Query the thread mode used by the runtime. @@ -512,7 +512,7 @@ __device__ void roc_shmem_wg_init_thread(int requested, int *provided); * * @return void. */ -__device__ void roc_shmem_query_thread(int *provided); +__device__ void rocshmem_query_thread(int *provided); /** * @brief Creates an OpenSHMEM context. By design, the context is private @@ -525,13 +525,13 @@ __device__ void roc_shmem_query_thread(int *provided); * * @return All threads returns 0 if the context was created successfully. If any * thread returns non-zero value, the operation failed and a higher number of - * `ROC_SHMEM_MAX_NUM_CONTEXTS` is required. + * `ROCSHMEM_MAX_NUM_CONTEXTS` is required. */ -__device__ ATTR_NO_INLINE int roc_shmem_wg_ctx_create(int64_t options, - roc_shmem_ctx_t *ctx); +__device__ ATTR_NO_INLINE int rocshmem_wg_ctx_create(int64_t options, + rocshmem_ctx_t *ctx); -__device__ ATTR_NO_INLINE int roc_shmem_wg_team_create_ctx( - roc_shmem_team_t team, long options, roc_shmem_ctx_t *ctx); +__device__ ATTR_NO_INLINE int rocshmem_wg_team_create_ctx( + rocshmem_team_t team, long options, rocshmem_ctx_t *ctx); /** * @brief Destroys an OpenSHMEM context. @@ -542,18 +542,18 @@ __device__ ATTR_NO_INLINE int roc_shmem_wg_team_create_ctx( * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_wg_ctx_destroy(roc_shmem_ctx_t *ctx); +__device__ ATTR_NO_INLINE void rocshmem_wg_ctx_destroy(rocshmem_ctx_t *ctx); /** * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). 
The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -564,12 +564,12 @@ __device__ ATTR_NO_INLINE void roc_shmem_wg_ctx_destroy(roc_shmem_ctx_t *ctx); * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_putmem(roc_shmem_ctx_t ctx, +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_putmem(void *dest, const void *source, +__device__ ATTR_NO_INLINE void rocshmem_putmem(void *dest, const void *source, size_t nelems, int pe); /** @@ -580,7 +580,7 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem(void *dest, const void *source, * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -591,25 +591,25 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem(void *dest, const void *source, * * @return void. 
*/ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_getmem(roc_shmem_ctx_t ctx, +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_getmem(void *dest, const void *source, +__device__ ATTR_NO_INLINE void rocshmem_getmem(void *dest, const void *source, size_t nelems, int pe); /** * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -620,12 +620,12 @@ __device__ ATTR_NO_INLINE void roc_shmem_getmem(void *dest, const void *source, * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_putmem_nbi(roc_shmem_ctx_t ctx, +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_putmem_nbi(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi(void *dest, const void *source, size_t nelems, int pe); @@ -633,13 +633,13 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem_nbi(void *dest, * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller will * return as soon as the request is posted. 
The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -650,12 +650,12 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem_nbi(void *dest, * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_getmem_nbi(roc_shmem_ctx_t ctx, +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_getmem_nbi(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi(void *dest, const void *source, size_t nelems, int pe); @@ -666,15 +666,15 @@ __device__ ATTR_NO_INLINE void roc_shmem_getmem_nbi(void *dest, * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_fence(roc_shmem_ctx_t ctx); +__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx); -__device__ ATTR_NO_INLINE void roc_shmem_fence(); +__device__ ATTR_NO_INLINE void rocshmem_fence(); /** * @brief Guarantees order between messages in this context in accordance with @@ -688,9 +688,9 @@ __device__ ATTR_NO_INLINE void roc_shmem_fence(); * * @return void. 
*/ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_fence(roc_shmem_ctx_t ctx, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_fence(int pe); +__device__ ATTR_NO_INLINE void rocshmem_fence(int pe); /** * @brief Completes all previous operations posted to this context. @@ -698,15 +698,15 @@ __device__ ATTR_NO_INLINE void roc_shmem_fence(int pe); * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_quiet(roc_shmem_ctx_t ctx); +__device__ ATTR_NO_INLINE void rocshmem_ctx_quiet(rocshmem_ctx_t ctx); -__device__ ATTR_NO_INLINE void roc_shmem_quiet(); +__device__ ATTR_NO_INLINE void rocshmem_quiet(); /** * @brief Query the total number of PEs. @@ -717,9 +717,9 @@ __device__ ATTR_NO_INLINE void roc_shmem_quiet(); * * @return Total number of PEs. */ -__device__ int roc_shmem_ctx_n_pes(roc_shmem_ctx_t ctx); +__device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx); -__device__ int roc_shmem_n_pes(); +__device__ int rocshmem_n_pes(); /** * @brief Query the PE ID of the caller. @@ -730,9 +730,9 @@ __device__ int roc_shmem_n_pes(); * * @return PE ID of the caller. */ -__device__ int roc_shmem_ctx_my_pe(roc_shmem_ctx_t ctx); +__device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx); -__device__ int roc_shmem_my_pe(); +__device__ int rocshmem_my_pe(); /** * @brief Translate the PE in src_team to that in dest_team. @@ -745,9 +745,9 @@ __device__ int roc_shmem_my_pe(); * or if src_pe is not in both source and destination * teams, a value of -1 is returned. 
*/ -__device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, +__device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe, - roc_shmem_team_t dest_team); + rocshmem_team_t dest_team); /** * @brief perform a collective barrier between all PEs in the system. @@ -759,10 +759,10 @@ __device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, * * @return void */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_wg_barrier_all( - roc_shmem_ctx_t ctx); +__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_barrier_all( + rocshmem_ctx_t ctx); -__device__ ATTR_NO_INLINE void roc_shmem_wg_barrier_all(); +__device__ ATTR_NO_INLINE void rocshmem_wg_barrier_all(); /** * @brief registers the arrival of a PE at a barrier. @@ -778,9 +778,9 @@ __device__ ATTR_NO_INLINE void roc_shmem_wg_barrier_all(); * * @return void */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_wg_sync_all(roc_shmem_ctx_t ctx); +__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx); -__device__ ATTR_NO_INLINE void roc_shmem_wg_sync_all(); +__device__ ATTR_NO_INLINE void rocshmem_wg_sync_all(); /** * @brief registers the arrival of a PE at a barrier. @@ -797,10 +797,10 @@ __device__ ATTR_NO_INLINE void roc_shmem_wg_sync_all(); * * @return void */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_wg_team_sync( - roc_shmem_ctx_t ctx, roc_shmem_team_t team); +__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_team_sync( + rocshmem_ctx_t ctx, rocshmem_team_t team); -__device__ ATTR_NO_INLINE void roc_shmem_wg_team_sync(roc_shmem_team_t team); +__device__ ATTR_NO_INLINE void rocshmem_wg_team_sync(rocshmem_team_t team); /** * @brief Query a local pointer to a symmetric data object on the @@ -809,18 +809,18 @@ __device__ ATTR_NO_INLINE void roc_shmem_wg_team_sync(roc_shmem_team_t team); * * Can be called per thread with no performance penalty. 
*/ -__device__ ATTR_NO_INLINE void *roc_shmem_ptr(const void *dest, int pe); +__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe); /** * @brief Query the current time. Similar to gettimeofday() on the CPU. To use - * this function, ROC_SHMEM must be configured with profiling support + * this function, rocSHMEM must be configured with profiling support * (--enable-profile). * * Can be called per thread with no performance penalty. * * @return Time in micro-seconds. */ -__device__ uint64_t roc_shmem_timer(); +__device__ uint64_t rocshmem_timer(); /** * @brief Make all uncacheable GPU data visible to other agents in the sytem. @@ -834,20 +834,20 @@ __device__ uint64_t roc_shmem_timer(); * * @return void */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_threadfence_system( - roc_shmem_ctx_t ctx); +__device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system( + rocshmem_ctx_t ctx); -__device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); +__device__ ATTR_NO_INLINE void rocshmem_threadfence_system(); /* * MACRO DECLARE SHMEM_REDUCTION APIs */ #define REDUCTION_API_GEN(T, TNAME, Op_API) \ - __device__ ATTR_NO_INLINE int roc_shmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + __device__ ATTR_NO_INLINE int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nreduce); \ - __host__ int roc_shmem_ctx_##TNAME##_##Op_API##_reduce( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + __host__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nreduce); #define ARITH_REDUCTION_API_GEN(T, TNAME) \ @@ -871,346 +871,346 @@ __device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); * MACRO DECLARE SHMEM_BROADCAST APIs */ #define BROADCAST_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void 
roc_shmem_ctx_##TNAME##_wg_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_broadcast( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nelem, int pe_root); /* NOLINT */ \ - __host__ void roc_shmem_ctx_##TNAME##_broadcast( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ + __host__ void rocshmem_ctx_##TNAME##_broadcast( \ + rocshmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ int pe_start, int log_pe_stride, int pe_size, \ long *p_sync); /* NOLINT */ \ - __host__ void roc_shmem_ctx_##TNAME##_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + __host__ void rocshmem_ctx_##TNAME##_broadcast( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nelem, int pe_root); /* NOLINT */ /* * MACRO DECLARE SHMEM_ALLTOALL APIs */ #define ALLTOALL_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_wg_alltoall( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_alltoall( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nelem); /* NOLINT */ /* * MACRO DECLARE SHMEM_FCOLLECT APIs */ #define FCOLLECT_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_wg_fcollect( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_fcollect( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nelem); /* NOLINT */ /* * MACRO DECLARE SHMEM_PUT APIs */ #define PUT_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_put( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_put( \ + __device__ 
ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put( \ T *dest, const T *source, size_t nelems, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_put( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void roc_shmem_##TNAME##_put(T *dest, const T *source, \ + __host__ void rocshmem_ctx_##TNAME##_put( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __host__ void rocshmem_##TNAME##_put(T *dest, const T *source, \ size_t nelems, int pe); /* * MACRO DECLARE SHMEM_P APIs */ #define P_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_p( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_p(T *dest, T value, \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_p( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_p(T *dest, T value, \ int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_p(roc_shmem_ctx_t ctx, T *dest, \ + __host__ void rocshmem_ctx_##TNAME##_p(rocshmem_ctx_t ctx, T *dest, \ T value, int pe); \ - __host__ void roc_shmem_##TNAME##_p(T *dest, T value, int pe); + __host__ void rocshmem_##TNAME##_p(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_GET APIs */ #define GET_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_get( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_get( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get( \ T *dest, const T *source, size_t nelems, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_get( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, 
size_t nelems, int pe); \ - __host__ void roc_shmem_##TNAME##_get(T *dest, const T *source, \ + __host__ void rocshmem_ctx_##TNAME##_get( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __host__ void rocshmem_##TNAME##_get(T *dest, const T *source, \ size_t nelems, int pe); /* * MACRO DECLARE SHMEM_G APIs */ #define G_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_g( \ - roc_shmem_ctx_t ctx, const T *source, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_g(const T *source, int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_g(roc_shmem_ctx_t ctx, const T *source, \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_g( \ + rocshmem_ctx_t ctx, const T *source, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_g(const T *source, int pe); \ + __host__ T rocshmem_ctx_##TNAME##_g(rocshmem_ctx_t ctx, const T *source, \ int pe); \ - __host__ T roc_shmem_##TNAME##_g(const T *source, int pe); + __host__ T rocshmem_##TNAME##_g(const T *source, int pe); /* * MACRO DECLARE SHMEM_PUT_NBI APIs */ #define PUT_NBI_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_put_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_put_nbi( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_nbi( \ T *dest, const T *source, size_t nelems, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_put_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void roc_shmem_##TNAME##_put_nbi(T *dest, const T *source, \ + __host__ void rocshmem_ctx_##TNAME##_put_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __host__ void rocshmem_##TNAME##_put_nbi(T *dest, const T *source, \ size_t nelems, int pe); /* * 
MACRO DECLARE SHMEM_GET_NBI APIs */ #define GET_NBI_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_get_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_get_nbi( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_nbi( \ T *dest, const T *source, size_t nelems, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_get_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void roc_shmem_##TNAME##_get_nbi(T *dest, const T *source, \ + __host__ void rocshmem_ctx_##TNAME##_get_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __host__ void rocshmem_##TNAME##_get_nbi(T *dest, const T *source, \ size_t nelems, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_FETCH_ADD APIs */ #define ATOMIC_FETCH_ADD_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_fetch_add( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_fetch_add( \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_add( \ T *dest, T value, int pe); \ - __host__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_fetch_add( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_fetch_add( \ + __host__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __host__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_add( \ T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_COMPARE_SWAP APIs */ #define ATOMIC_COMPARE_SWAP_API_GEN(T, TNAME) \ - __device__ 
ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_compare_swap( \ - roc_shmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_compare_swap( \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ + rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_compare_swap( \ T *dest, T cond, T value, int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_compare_swap( \ - roc_shmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ - __host__ T roc_shmem_##TNAME##_atomic_compare_swap(T *dest, T cond, T value, \ + __host__ T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ + rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ + __host__ T rocshmem_##TNAME##_atomic_compare_swap(T *dest, T cond, T value, \ int pe); /* * MACRO DECLARE SHMEM_ATOMIC_FETCH_INC APIs */ #define ATOMIC_FETCH_INC_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_fetch_inc( \ - roc_shmem_ctx_t ctx, T *dest, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_fetch_inc(T *dest, \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_inc( \ + rocshmem_ctx_t ctx, T *dest, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, \ int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_inc(roc_shmem_ctx_t ctx, \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_inc(rocshmem_ctx_t ctx, \ T *dest, int pe); \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_inc(T *dest, int pe); + __host__ T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_FETCH APIs */ #define ATOMIC_FETCH_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_fetch( \ - roc_shmem_ctx_t ctx, T *source, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_fetch(T *source, \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch( \ + 
rocshmem_ctx_t ctx, T *source, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch(T *source, \ int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch(roc_shmem_ctx_t ctx, \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch(rocshmem_ctx_t ctx, \ T *source, int pe); \ - __host__ T roc_shmem_##TNAME##_atomic_fetch(T *source, int pe); + __host__ T rocshmem_##TNAME##_atomic_fetch(T *source, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_ADD APIs */ #define ATOMIC_ADD_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_atomic_add( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_atomic_add( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_add( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_add( \ T *dest, T value, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_add(roc_shmem_ctx_t ctx, \ + __host__ void rocshmem_ctx_##TNAME##_atomic_add(rocshmem_ctx_t ctx, \ T *dest, T value, int pe); \ - __host__ void roc_shmem_##TNAME##_atomic_add(T *dest, T value, int pe); + __host__ void rocshmem_##TNAME##_atomic_add(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_SET APIs */ #define ATOMIC_SET_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_atomic_set( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_atomic_set( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_set( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_set( \ T *dest, T value, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_set(roc_shmem_ctx_t ctx, \ + __host__ void rocshmem_ctx_##TNAME##_atomic_set(rocshmem_ctx_t ctx, \ T *dest, T value, int pe); \ - __host__ void roc_shmem_##TNAME##_atomic_set(T *dest, T value, int pe); + __host__ void 
rocshmem_##TNAME##_atomic_set(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_SWAP APIs */ #define ATOMIC_SWAP_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_swap( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_swap( \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_swap( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_swap( \ T *dest, T value, int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_swap(roc_shmem_ctx_t ctx, T *dest, \ + __host__ T rocshmem_ctx_##TNAME##_atomic_swap(rocshmem_ctx_t ctx, T *dest, \ T value, int pe); \ - __host__ T roc_shmem_##TNAME##_atomic_swap(T *dest, T value, int pe); + __host__ T rocshmem_##TNAME##_atomic_swap(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_FETCH_AND APIs */ #define ATOMIC_FETCH_AND_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_fetch_and( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_fetch_and( \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_and( \ T *dest, T value, int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_and( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_and(T *dest, T value, int pe); + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __host__ T rocshmem_##TNAME##_atomic_fetch_and(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_AND APIs */ #define ATOMIC_AND_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_atomic_and( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void 
roc_shmem_##TNAME##_atomic_and( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_and( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_and( \ T *dest, T value, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_and(roc_shmem_ctx_t ctx, \ + __host__ void rocshmem_ctx_##TNAME##_atomic_and(rocshmem_ctx_t ctx, \ T *dest, T value, int pe); \ - __host__ void roc_shmem_##TNAME##_atomic_and(T *dest, T value, int pe); + __host__ void rocshmem_##TNAME##_atomic_and(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_FETCH_OR APIs */ #define ATOMIC_FETCH_OR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_fetch_or( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_fetch_or( \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_or( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_or( \ T *dest, T value, int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_or( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe); + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_or( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __host__ T rocshmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_OR APIs */ #define ATOMIC_OR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_atomic_or( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_atomic_or( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_or( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_or( \ T *dest, T value, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_or(roc_shmem_ctx_t ctx, \ 
+ __host__ void rocshmem_ctx_##TNAME##_atomic_or(rocshmem_ctx_t ctx, \ T *dest, T value, int pe); \ - __host__ void roc_shmem_##TNAME##_atomic_or(T *dest, T value, int pe); + __host__ void rocshmem_##TNAME##_atomic_or(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_FETCH_XOR APIs */ #define ATOMIC_FETCH_XOR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T roc_shmem_ctx_##TNAME##_atomic_fetch_xor( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T roc_shmem_##TNAME##_atomic_fetch_xor( \ + __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_xor( \ T *dest, T value, int pe); \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_xor( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_xor(T *dest, T value, int pe); + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __host__ T rocshmem_##TNAME##_atomic_fetch_xor(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_XOR APIs */ #define ATOMIC_XOR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_atomic_xor( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_atomic_xor( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_xor( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_xor( \ T *dest, T value, int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_xor(roc_shmem_ctx_t ctx, \ + __host__ void rocshmem_ctx_##TNAME##_atomic_xor(rocshmem_ctx_t ctx, \ T *dest, T value, int pe); \ - __host__ void roc_shmem_##TNAME##_atomic_xor(T *dest, T value, int pe); + __host__ void rocshmem_##TNAME##_atomic_xor(T *dest, T value, int pe); /* * MACRO DECLARE SHMEM_ATOMIC_INC APIs */ #define 
ATOMIC_INC_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_atomic_inc( \ - roc_shmem_ctx_t ctx, T *dest, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_atomic_inc(T *dest, \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_inc( \ + rocshmem_ctx_t ctx, T *dest, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_inc(T *dest, \ int pe); \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_inc(roc_shmem_ctx_t ctx, \ + __host__ void rocshmem_ctx_##TNAME##_atomic_inc(rocshmem_ctx_t ctx, \ T *dest, int pe); \ - __host__ void roc_shmem_##TNAME##_atomic_inc(T *dest, int pe); + __host__ void rocshmem_##TNAME##_atomic_inc(T *dest, int pe); /* * MACRO DECLARE SHMEM_WAIT_UNTIL APIs */ #define WAIT_UNTIL_API_GEN(T, TNAME) \ - __device__ void roc_shmem_##TNAME##_wait_until(T *ivars, \ + __device__ void rocshmem_##TNAME##_wait_until(T *ivars, \ int cmp, \ T val); \ - __device__ size_t roc_shmem_##TNAME##_wait_until_any(T *ivars, \ + __device__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T val); \ - __device__ void roc_shmem_##TNAME##_wait_until_all(T *ivars, \ + __device__ void rocshmem_##TNAME##_wait_until_all(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T val); \ - __device__ size_t roc_shmem_##TNAME##_wait_until_some(T *ivars, \ + __device__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, \ size_t nelems, \ size_t* indices, \ const int* status, \ int cmp, \ T val); \ - __device__ size_t roc_shmem_##TNAME##_wait_until_any_vector(T *ivars, \ + __device__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T* vals); \ - __device__ void roc_shmem_##TNAME##_wait_until_all_vector(T *ivars, \ + __device__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T* vals); \ - __device__ size_t 
roc_shmem_##TNAME##_wait_until_some_vector(T *ivars, \ + __device__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ size_t nelems, \ size_t* indices, \ const int* status, \ int cmp, \ T* vals); \ - __host__ void roc_shmem_##TNAME##_wait_until(T *ivars, \ + __host__ void rocshmem_##TNAME##_wait_until(T *ivars, \ int cmp, \ T val); \ - __host__ size_t roc_shmem_##TNAME##_wait_until_any(T *ivars, \ + __host__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T val); \ - __host__ void roc_shmem_##TNAME##_wait_until_all(T *ivars, \ + __host__ void rocshmem_##TNAME##_wait_until_all(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T val); \ - __host__ size_t roc_shmem_##TNAME##_wait_until_some(T *ivars, \ + __host__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, \ size_t nelems, \ size_t* indices, \ const int* status, \ int cmp, \ T val); \ - __host__ size_t roc_shmem_##TNAME##_wait_until_any_vector(T *ivars, \ + __host__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T* vals); \ - __host__ void roc_shmem_##TNAME##_wait_until_all_vector(T *ivars, \ + __host__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ size_t nelems, \ const int* status, \ int cmp, \ T* vals); \ - __host__ size_t roc_shmem_##TNAME##_wait_until_some_vector(T *ivars, \ + __host__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ size_t nelems, \ size_t* indices, \ const int* status, \ @@ -1221,8 +1221,8 @@ __device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); * MACRO DECLARE SHMEM_TEST APIs */ #define TEST_API_GEN(T, TNAME) \ - __device__ int roc_shmem_##TNAME##_test(T *ivars, int cmp, T val); \ - __host__ int roc_shmem_##TNAME##_test(T *ivars, int cmp, T val); + __device__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val); \ + __host__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val); /** * @name SHMEM_REDUCTIONS @@ 
-1239,11 +1239,11 @@ __device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); * @param[in] PE_start PE to start the reduction. * @param[in] logPE_stride Stride of PEs participating in the reduction. * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pWrk Temporary work buffer provided to ROC_SHMEM. Must + * @param[in] pWrk Temporary work buffer provided to rocSHMEM. Must * be of size at least max(size/2 + 1, - ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE). - * @param[in] pSync Temporary sync buffer provided to ROC_SHMEM. Must - be of size at least ROC_SHMEM_REDUCE_SYNC_SIZE. + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE). + * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must + be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. * @param[in] handle GPU side handle. * * @return void @@ -1277,8 +1277,8 @@ FLOAT_REDUCTION_API_GEN(double, double) * @param[in] PE_start PE to start the reduction. * @param[in] logPE_stride Stride of PEs participating in the reduction. * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pSync Temporary sync buffer provided to ROC_SHMEM. Must - be of size at least ROC_SHMEM_REDUCE_SYNC_SIZE. + * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must + be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. * * @return void */ @@ -1370,12 +1370,12 @@ FCOLLECT_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-thread * granularity. 
However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -1406,13 +1406,13 @@ PUT_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) /** * @name SHMEM_P * @brief Writes a single value to \p dest at \p pe PE to \p dst at \p pe. - * The caller must call into roc_shmem_quiet() if remote completion is + * The caller must call into rocshmem_quiet() if remote completion is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -1448,7 +1448,7 @@ P_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -1484,7 +1484,7 @@ GET_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. 
* @param[in] source sourcen address. Must be an address on the symmetric @@ -1515,13 +1515,13 @@ G_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -1554,13 +1554,13 @@ PUT_NBI_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * @brief Reads contiguous data of \p nelems elements from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller will * return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -2023,7 +2023,7 @@ ATOMIC_INC_API_GEN(ptrdiff_t, ptrdiff) * This function can be called from divergent control paths at per-thread * granularity. 
However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ivars Pointer to memory on the symmetric heap to wait for. * @param[in] cmp Operation for the comparison. @@ -2056,7 +2056,7 @@ WAIT_UNTIL_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ivars Pointer to memory on the symmetric heap to wait for. * @param[in] cmp Operation for the comparison. @@ -2089,43 +2089,43 @@ TEST_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * MACRO DECLARE SHMEM_PUT APIs */ #define PUT_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_put_##GRAN( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_put_##GRAN( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_##GRAN( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_##GRAN( \ T *dest, const T *source, size_t nelems, int pe); /* * MACRO DECLARE SHMEM_GET APIs */ #define GET_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_get_##GRAN( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_get_##GRAN( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_##GRAN( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_##GRAN( \ T *dest, const T *source, size_t nelems, int pe); /* * MACRO DECLARE 
SHMEM_PUT_NBI APIs */ #define PUT_NBI_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_put_nbi_##GRAN( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_put_nbi_##GRAN( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_nbi_##GRAN( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_nbi_##GRAN( \ T *dest, const T *source, size_t nelems, int pe); /* * MACRO DECLARE SHMEM_GET_NBI APIs */ #define GET_NBI_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_get_nbi_##GRAN( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void roc_shmem_##TNAME##_get_nbi_##GRAN( \ + __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_nbi_##GRAN( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ + __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_nbi_##GRAN( \ T *dest, const T *source, size_t nelems, int pe); /** * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-wave * granularity. However, all threads in a wave must participate in the @@ -2140,10 +2140,10 @@ TEST_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) * * @return void. 
*/ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_putmem_wave( - roc_shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_putmem_wave(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_putmem_wave(void *dest, const void *source, size_t nelems, int pe); @@ -2151,7 +2151,7 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem_wave(void *dest, * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-workgroup * (WG) granularity. However, all threads in the workgroup must participate in @@ -2166,12 +2166,12 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem_wave(void *dest, * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_putmem_wg(roc_shmem_ctx_t ctx, +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_putmem_wg(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_putmem_wg(void *dest, const void *source, size_t nelems, int pe); @@ -2179,7 +2179,7 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem_wg(void *dest, * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. 
* * This function can be called from divergent control paths at per-wave * granularity. However, all threads in a wave must collectively participate @@ -2215,7 +2215,7 @@ PUT_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT(runtime/int) * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-workgroub * (WG) granularity. However, All threads in a WG must collectively participate @@ -2265,10 +2265,10 @@ PUT_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_getmem_wave( - roc_shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_getmem_wave(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_getmem_wave(void *dest, const void *source, size_t nelems, int pe); @@ -2290,12 +2290,12 @@ __device__ ATTR_NO_INLINE void roc_shmem_getmem_wave(void *dest, * * @return void. 
*/ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_getmem_wg(roc_shmem_ctx_t ctx, +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_getmem_wg(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_getmem_wg(void *dest, const void *source, size_t nelems, int pe); @@ -2373,7 +2373,7 @@ GET_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-wave @@ -2389,10 +2389,10 @@ GET_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_putmem_nbi_wave( - roc_shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_putmem_nbi_wave(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wave(void *dest, const void *source, size_t nelems, int pe); @@ -2401,7 +2401,7 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem_nbi_wave(void *dest, * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. 
* * This function can be called from divergent control paths at per-wave @@ -2438,7 +2438,7 @@ PUT_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT * @brief Writes contiguous data of \p nelems bytes from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-workgroup @@ -2454,10 +2454,10 @@ PUT_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_putmem_nbi_wg( - roc_shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_putmem_nbi_wg(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wg(void *dest, const void *source, size_t nelems, int pe); @@ -2465,7 +2465,7 @@ __device__ ATTR_NO_INLINE void roc_shmem_putmem_nbi_wg(void *dest, * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-workgroup @@ -2502,7 +2502,7 @@ PUT_NBI_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. 
The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-wave @@ -2518,10 +2518,10 @@ PUT_NBI_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_getmem_nbi_wave( - roc_shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_getmem_nbi_wave(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wave(void *dest, const void *source, size_t nelems, int pe); @@ -2530,7 +2530,7 @@ __device__ ATTR_NO_INLINE void roc_shmem_getmem_nbi_wave(void *dest, * @brief Reads contiguous data of \p nelems elements from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-wave @@ -2567,7 +2567,7 @@ GET_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. 
* * This function can be called from divergent control paths at per-workgroup @@ -2583,10 +2583,10 @@ GET_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT * * @return void. */ -__device__ ATTR_NO_INLINE void roc_shmem_ctx_getmem_nbi_wg( - roc_shmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); -__device__ ATTR_NO_INLINE void roc_shmem_getmem_nbi_wg(void *dest, +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wg(void *dest, const void *source, size_t nelems, int pe); @@ -2594,7 +2594,7 @@ __device__ ATTR_NO_INLINE void roc_shmem_getmem_nbi_wg(void *dest, * @brief Reads contiguous data of \p nelems elements from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. 
* * This function can be called from divergent control paths at per-workgroup @@ -2629,4 +2629,4 @@ GET_NBI_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) } // namespace rocshmem -#endif // LIBRARY_INCLUDE_ROC_SHMEM_HPP +#endif // LIBRARY_INCLUDE_ROCSHMEM_HPP diff --git a/internal/clients/shmem_rccl/primitive_tester.cpp b/internal/clients/shmem_rccl/primitive_tester.cpp index 7982bc5356..7ecc9faeea 100644 --- a/internal/clients/shmem_rccl/primitive_tester.cpp +++ b/internal/clients/shmem_rccl/primitive_tester.cpp @@ -22,7 +22,7 @@ #include "primitive_tester.hpp" -#include +#include #include #include @@ -41,26 +41,26 @@ PrimitiveTest(int loop, int my_pe, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); int block_id = hipBlockIdx_x; for(int i =0; i< loop; i++){ - roc_shmem_ctx_putmem_nbi_wg(ctx, &r_buf[my_pe*size], &s_buf[block_id * size], size, block_id); + rocshmem_ctx_putmem_nbi_wg(ctx, &r_buf[my_pe*size], &s_buf[block_id * size], size, block_id); if(hipThreadIdx_x==0){ - //roc_shmem_ctx_quiet(ctx); - //roc_shmem_ctx_threadfence_system(ctx); - roc_shmem_ctx_int_p(ctx, &flag[my_pe], i+1, block_id); - //roc_shmem_ctx_quiet(ctx); - roc_shmem_int_wait_until(&flag[block_id], ROC_SHMEM_CMP_EQ, i+1); + //rocshmem_ctx_quiet(ctx); + //rocshmem_ctx_threadfence_system(ctx); + rocshmem_ctx_int_p(ctx, &flag[my_pe], i+1, block_id); + //rocshmem_ctx_quiet(ctx); + rocshmem_int_wait_until(&flag[block_id], ROCSHMEM_CMP_EQ, i+1); } __syncthreads(); } - roc_shmem_wg_ctx_destroy(ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -69,16 +69,16 @@ PrimitiveTest(int loop, PrimitiveTester::PrimitiveTester(TesterArguments args) : Tester(args) { - flag = (int*) 
roc_shmem_malloc(args.numprocs); + flag = (int*) rocshmem_malloc(args.numprocs); memset(flag, 0, args.numprocs*sizeof(int)); - // s_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); - // r_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); + // s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); + // r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); } PrimitiveTester::~PrimitiveTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); + rocshmem_free(r_buf); } void @@ -99,8 +99,8 @@ PrimitiveTester::launchKernel(dim3 gridSize, void* sendBuf = malloc(64); void* recvBuf = malloc(64 * nproc); - s_buf = (char *)roc_shmem_malloc(size * nproc); - r_buf = (char *)roc_shmem_malloc(size * nproc); + s_buf = (char *)rocshmem_malloc(size * nproc); + r_buf = (char *)rocshmem_malloc(size * nproc); resetBuffers(size); MPI_Allgather(sendBuf, 64, MPI_CHAR, @@ -108,7 +108,7 @@ PrimitiveTester::launchKernel(dim3 gridSize, MPI_COMM_WORLD); size_t shared_bytes; - roc_shmem_dynamic_shared(&shared_bytes); + rocshmem_dynamic_shared(&shared_bytes); hipLaunchKernelGGL(PrimitiveTest, gridSize, diff --git a/internal/clients/shmem_rccl/test_driver.cpp b/internal/clients/shmem_rccl/test_driver.cpp index 7bcd0e2388..511b42ac9d 100644 --- a/internal/clients/shmem_rccl/test_driver.cpp +++ b/internal/clients/shmem_rccl/test_driver.cpp @@ -22,7 +22,7 @@ #include -#include +#include #include "tester.hpp" #include "tester_arguments.hpp" @@ -39,7 +39,7 @@ int main(int argc, char * argv[]) /*** * Select a GPU */ - int rank = roc_shmem_my_pe(); + int rank = rocshmem_my_pe(); int ndevices, my_device=0; hipGetDeviceCount (&ndevices); my_device = rank % ndevices; @@ -48,7 +48,7 @@ int main(int argc, char * argv[]) /** * Must initialize rocshmem to access arguments needed by the tester. */ - roc_shmem_init(args.num_wgs); + rocshmem_init(args.num_wgs); /** * Now grab the arguments from rocshmem. 
@@ -78,7 +78,7 @@ int main(int argc, char * argv[]) * The rocshmem library needs to be cleaned up with this call. It pairs * with the init function above. */ - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/internal/clients/shmem_rccl/tester.cpp b/internal/clients/shmem_rccl/tester.cpp index 946d4dfa8e..d43ee09846 100644 --- a/internal/clients/shmem_rccl/tester.cpp +++ b/internal/clients/shmem_rccl/tester.cpp @@ -27,7 +27,7 @@ #include #include #include -#include +#include //#include "broadcast_tester.hpp" #include "primitive_tester.hpp" @@ -125,8 +125,8 @@ Tester::execute() printf("error = %d \n", err); } -// roc_shmem_dump_stats(); - // roc_shmem_reset_stats(); +// rocshmem_dump_stats(); + // rocshmem_reset_stats(); diff --git a/internal/clients/shmem_rccl/tester.hpp b/internal/clients/shmem_rccl/tester.hpp index 03030bf57f..831cc10064 100644 --- a/internal/clients/shmem_rccl/tester.hpp +++ b/internal/clients/shmem_rccl/tester.hpp @@ -25,7 +25,7 @@ #include -#include +#include #include "tester_arguments.hpp" diff --git a/internal/clients/shmem_rccl/tester_arguments.cpp b/internal/clients/shmem_rccl/tester_arguments.cpp index 9167329313..6c835169bb 100644 --- a/internal/clients/shmem_rccl/tester_arguments.cpp +++ b/internal/clients/shmem_rccl/tester_arguments.cpp @@ -26,7 +26,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -64,7 +64,7 @@ void TesterArguments::show_usage(std::string executable_name) { std::cout << "Usage: " << executable_name << std::endl; - std::cout << "\t-t \n"; + std::cout << "\t-t \n"; std::cout << "\t-w \n"; std::cout << "\t-s \n"; std::cout << "\t-a \n"; @@ -78,7 +78,7 @@ TesterArguments::show_usage(std::string executable_name) void TesterArguments::get_rocshmem_arguments() { - numprocs = roc_shmem_n_pes(); - myid = roc_shmem_my_pe(); + numprocs = rocshmem_n_pes(); + myid = rocshmem_my_pe(); } diff --git a/internal/clients/shmem_rccl/tester_arguments.hpp 
b/internal/clients/shmem_rccl/tester_arguments.hpp index a4e890eeb5..175470df5b 100644 --- a/internal/clients/shmem_rccl/tester_arguments.hpp +++ b/internal/clients/shmem_rccl/tester_arguments.hpp @@ -35,7 +35,7 @@ class TesterArguments /** * Initialize rocshmem members - * Valid after roc_shmem_init function called. + * Valid after rocshmem_init function called. */ void get_rocshmem_arguments(); @@ -54,7 +54,7 @@ class TesterArguments uint64_t min_msg_size = 1; uint64_t max_msg_size = 1 << 20; unsigned wg_size = 64; - unsigned shmem_context = 8; // ROC_SHMEM_CTX_WG_PRIVATE + unsigned shmem_context = 8; // ROCSHMEM_CTX_WG_PRIVATE /** * Arguments obtained from rocshmem diff --git a/internal/clients/spts/CMakeLists.txt b/internal/clients/spts/CMakeLists.txt index d951e086fd..172c667776 100644 --- a/internal/clients/spts/CMakeLists.txt +++ b/internal/clients/spts/CMakeLists.txt @@ -55,7 +55,7 @@ project(spts VERSION 1.1.0 LANGUAGES CXX) # CONFIGURATION OPTIONS ############################################################################### option(USE_HIP "Build HIP version of the solver" OFF) -option(USE_ROCSHMEM "Build ROC_SHMEM enabled version of the solver" OFF) +option(USE_ROCSHMEM "Build rocSHMEM enabled version of the solver" OFF) option(ALL_ANALYZE "Build analyze and solve algorithm" OFF) option(USE_DOUBLE "Use double precision floats for the data" OFF) option(ALL_LEVELSET "Build levelset algorithm" OFF) @@ -84,7 +84,7 @@ target_sources( ) ############################################################################### -# HIP / HIP + ROC_SHMEM +# HIP / HIP + rocSHMEM ############################################################################### if(USE_HIP) find_package(hip REQUIRED) @@ -95,7 +95,7 @@ if(USE_HIP) HIPHelper.cpp ) - if(USE_ROC_SHMEM) + if(USE_ROCSHMEM) find_package(rocshmem CONFIG REQUIRED) target_include_directories( @@ -118,8 +118,8 @@ if(USE_HIP) ############################################################################### else() - 
if(USE_ROC_SHMEM) - message(FATAL_ERROR "Cannot use ROC_SHMEM without USE_HIP") + if(USE_ROCSHMEM) + message(FATAL_ERROR "Cannot use rocSHMEM without USE_HIP") endif() target_sources( diff --git a/internal/clients/spts/GPUHelper.h b/internal/clients/spts/GPUHelper.h index 7659a20a84..7773726568 100644 --- a/internal/clients/spts/GPUHelper.h +++ b/internal/clients/spts/GPUHelper.h @@ -34,7 +34,7 @@ static int SPTS_BLOCK_SIZE = 0; -#ifdef USE_ROC_SHMEM +#ifdef USE_ROCSHMEM #define WF_PER_WG 1 #else #define WF_PER_WG 16 diff --git a/internal/clients/spts/Main.cpp b/internal/clients/spts/Main.cpp index 3dda6e491a..429e1a242c 100644 --- a/internal/clients/spts/Main.cpp +++ b/internal/clients/spts/Main.cpp @@ -179,7 +179,7 @@ int main(int argc, char *argv[]) else printf("%lf )", ((double)ns_per_levelsync_iter/1000000.)); -#ifdef USE_ROC_SHMEM +#ifdef USE_ROCSHMEM MPI_Allreduce(MPI_IN_PLACE, (void *) &ns_per_analysis_iter, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); diff --git a/internal/clients/spts/SpTS.h b/internal/clients/spts/SpTS.h index 3ddfcdccda..ccfaaed27f 100644 --- a/internal/clients/spts/SpTS.h +++ b/internal/clients/spts/SpTS.h @@ -40,8 +40,8 @@ #include -#ifdef USE_ROC_SHMEM -#include "roc_shmem.hpp" +#ifdef USE_ROCSHMEM +#include "rocshmem.hpp" #include "mpi.h" #endif @@ -100,8 +100,8 @@ class SparseTriangularSolve : int nCols; int numBlocks; /* - #ifdef USE_ROC_SHMEM - roc_shmem_t* handle; + #ifdef USE_ROCSHMEM + rocshmem_t* handle; #endif */ std::unordered_map *observed_errors; @@ -114,15 +114,15 @@ class SparseTriangularSolve : x = NULL; y = NULL; y_zero = NULL, yref = NULL, observed_errors = NULL, errors_seen = NULL; xDev = yDev = completedRowsDev = remoteInProgressArrayDev = rowBlocksDev = doneArrayDev = shadowDoneArrayDev = numRowsAtLevelDev = maxDepthDev = rowMapDev = totalSpinDev = oneBufDev = 0; - #ifdef USE_ROC_SHMEM - int roc_shmem_queues = (2560 / WF_PER_WG); + #ifdef USE_ROCSHMEM + int rocshmem_queues = (2560 / WF_PER_WG); if (2560 % 
WF_PER_WG) - roc_shmem_queues++; - printf("roc_shmem_queues %d WF_PER_WG %d \n",roc_shmem_queues, WF_PER_WG); - roc_shmem_init(roc_shmem_queues); + rocshmem_queues++; + printf("rocshmem_queues %d WF_PER_WG %d \n",rocshmem_queues, WF_PER_WG); + rocshmem_init(rocshmem_queues); - this->Set_total_pes(roc_shmem_n_pes()); - this->Set_this_pe(roc_shmem_my_pe()); + this->Set_total_pes(rocshmem_n_pes()); + this->Set_this_pe(rocshmem_my_pe()); #else this->Set_total_pes(1); this->Set_this_pe(0); @@ -173,7 +173,7 @@ class SparseTriangularSolve : if (remoteInProgressArrayDev != 0) this->GPU->FreeMem(remoteInProgressArrayDev); - #ifndef USE_ROC_SHMEM + #ifndef USE_ROCSHMEM if (yDev != 0) this->GPU->FreeMem(yDev); if (doneArrayDev != 0) @@ -184,14 +184,14 @@ class SparseTriangularSolve : this->GPU->FreeMem(shadowDoneArrayDev); #else if (yDev != 0) - roc_shmem_free(yDev); + rocshmem_free(yDev); if (doneArrayDev != 0) - roc_shmem_free(doneArrayDev); + rocshmem_free(doneArrayDev); if (reqUpdateArrayDev != 0) - roc_shmem_free(reqUpdateArrayDev); + rocshmem_free(reqUpdateArrayDev); if (shadowDoneArrayDev != 0) - roc_shmem_free(shadowDoneArrayDev); - roc_shmem_finalize(); + rocshmem_free(shadowDoneArrayDev); + rocshmem_finalize(); #endif } }; @@ -207,8 +207,8 @@ void SparseTriangularSolve::AddDerivedInputFlags() AddInputFlag("non_symmetric", 'n', "false", "Force the program to work on non-symmetric matrices. This will ignore the upper triangular entirely. 
(Default=false)", "bool"); AddInputFlag("levelsync_size", 'l', "0", "Number of rows to launch in a level-sync kernel invocation (Default = auto-tune)", "int"); AddInputFlag("verify", 'v', "false", "Verify results", "bool"); - AddInputFlag("rocshmem_algorithm", 'a', "0", "ROC_SHMEM algorithm type", "int"); - AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for ROC_SHMEM", "int"); + AddInputFlag("rocshmem_algorithm", 'a', "0", "rocSHMEM algorithm type", "int"); + AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for rocSHMEM", "int"); AddInputFlag("put_block_size", 'p', "1024", "Block size for puts", "int"); AddInputFlag("get_backoff_factor", 'g', "128", "Backoff factor for gets", "int"); } @@ -241,10 +241,10 @@ void SparseTriangularSolve::AllocateVectors( } xDev = this->GPU->AllocateMem("xDev", nCols*sizeof(FloatType), GPU_MEM_READ_ONLY, NULL); - #ifndef USE_ROC_SHMEM + #ifndef USE_ROCSHMEM yDev = this->GPU->AllocateMem("yDev", nRows*sizeof(FloatType), GPU_MEM_READ_WRITE, NULL); #else - yDev = (memPointer) roc_shmem_malloc(nRows*sizeof(FloatType)); + yDev = (memPointer) rocshmem_malloc(nRows*sizeof(FloatType)); #endif } @@ -742,10 +742,10 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 /****** SpTS Meta-Data Setup Code ******/ /* Set up the OpenCL buffers for the SpTS meta-data */ // TODO -- is this +1 in doneArray nRows+1 required? Why? 
- #ifdef USE_ROC_SHMEM - doneArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t)); - reqUpdateArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t)); - shadowDoneArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t)); + #ifdef USE_ROCSHMEM + doneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t)); + reqUpdateArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t)); + shadowDoneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t)); #else doneArrayDev = this->GPU->AllocateMem("doneArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); reqUpdateArrayDev = this->GPU->AllocateMem("reqUpdateArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); @@ -835,7 +835,7 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 bool syncfree_better = false; int total_workitems_per_workgroup = WF_SIZE * WF_PER_WG; - //bool roc_shmem_initialized = false; + //bool rocshmem_initialized = false; /*********************** Actual work of the benchmark *********************/ for(int i = 0; i < iter; i++) @@ -883,18 +883,18 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 #else int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) / total_workitems_per_workgroup; - #ifdef USE_ROC_SHMEM + #ifdef USE_ROCSHMEM global_work_size = this->nRows_p * WF_SIZE; num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) / total_workitems_per_workgroup; /* - int roc_shmem_queues = (2560 / WF_PER_WG); + int rocshmem_queues = (2560 / WF_PER_WG); if (2560 % WF_PER_WG) - roc_shmem_queues++; - if (!roc_shmem_initialized) { + rocshmem_queues++; + if (!rocshmem_initialized) { int num_threads = InputFlags::GetValueInt("num_roshmem_threads"); - roc_shmem_init(&handle, roc_shmem_queues); - roc_shmem_initialized = true; + rocshmem_init(&handle, rocshmem_queues); + rocshmem_initialized = true; } */ int rocshmem_algorithm = InputFlags::GetValueInt("rocshmem_algorithm"); @@ -915,11 +915,11 @@ float 
SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 printf("Using put/get hybrid intra-kernel algorithm\n"); break; default: - printf("Unknown ROC_SHMEM algoirthm\n"); + printf("Unknown rocSHMEM algorithm\n"); exit(-1); } size_t LDS_size; - roc_shmem_dynamic_shared(&LDS_size); + rocshmem_dynamic_shared(&LDS_size); printf("Work size %zu, wg size %d num workgroups %d LDS %zu thisPE %d Global %d \n", global_work_size, total_workitems_per_workgroup, num_of_workgroups, LDS_size, this->Get_this_pe(), this->Get_total_pes()); MPI_Barrier(MPI_COMM_WORLD); hipEventRecord(event_array[0], NULL); @@ -969,7 +969,7 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 hipEventRecord(event_array[1], NULL); hipEventSynchronize(event_array[1]); - #ifdef USE_ROC_SHMEM + #ifdef USE_ROCSHMEM // Wait for any outstanding network messages to finish up. We // can have straggler updates to the doneArray that we don't // have any dependencies for but we still eed it to finish so @@ -1000,7 +1000,7 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 this->GPU->CopyToHost(numRowsAtLevelDev, numRowsAtLevel, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL); this->GPU->Flush(); - #ifdef USE_ROC_SHMEM + #ifdef USE_ROCSHMEM // Combine global statistics MPI_Allreduce(MPI_IN_PLACE, (void *) &maxDepth, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, (void *) &totalSpin, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); @@ -1114,8 +1114,8 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 uint32_t current_iteration = 0; - #ifdef USE_ROC_SHMEM - fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n"); + #ifdef USE_ROCSHMEM + fprintf(stderr, "rocSHMEM not supported for selected algorithm\n"); exit(-1); #endif @@ -1215,8 +1215,8 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 level_sync_cutoff = 81920; } - #ifdef USE_ROC_SHMEM - fprintf(stderr, "ROC_SHMEM not supported for selected 
algorithm\n"); + #ifdef USE_ROCSHMEM + fprintf(stderr, "rocSHMEM not supported for selected algorithm\n"); exit(-1); #endif @@ -1346,8 +1346,8 @@ float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64 // Number of levels is maxDepth. */ levelset_iter++; - #ifdef USE_ROC_SHMEM - fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n"); + #ifdef USE_ROCSHMEM + fprintf(stderr, "rocSHMEM not supported for selected algorithm\n"); exit(-1); #endif diff --git a/internal/clients/spts/SparseMatrix.h b/internal/clients/spts/SparseMatrix.h index 101cc44840..bd36d65a00 100644 --- a/internal/clients/spts/SparseMatrix.h +++ b/internal/clients/spts/SparseMatrix.h @@ -77,8 +77,8 @@ class SparseMatrix d_vals = NULL; d_row_ptrs = NULL; - this_pe = -1;//roc_shmem_my_pe(handle); // this pe - total_pes = -1;//roc_shmem_n_pes(handle); // total number of pes + this_pe = -1;//rocshmem_my_pe(handle); // this pe + total_pes = -1;//rocshmem_n_pes(handle); // total number of pes } void AllocateSparseMatrix(MatrixMarketReader &mm_reader, diff --git a/internal/clients/spts/build_configs/analyze_single_hip b/internal/clients/spts/build_configs/analyze_single_hip index 62cc68bdc6..03b392ace3 100755 --- a/internal/clients/spts/build_configs/analyze_single_hip +++ b/internal/clients/spts/build_configs/analyze_single_hip @@ -5,7 +5,7 @@ src_path=$(dirname "$(realpath $0)")/.. cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DUSE_ROC_SHMEM=OFF \ + -DUSE_ROCSHMEM=OFF \ -DUSE_HIP=ON \ -DALL_ANALYZE=ON \ -DUSE_DOUBLE=OFF \ diff --git a/internal/clients/spts/build_configs/analyze_single_opencl b/internal/clients/spts/build_configs/analyze_single_opencl index 3716cdfc0d..41db75f17f 100755 --- a/internal/clients/spts/build_configs/analyze_single_opencl +++ b/internal/clients/spts/build_configs/analyze_single_opencl @@ -5,7 +5,7 @@ src_path=$(dirname "$(realpath $0)")/.. 
cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DUSE_ROC_SHMEM=OFF \ + -DUSE_ROCSHMEM=OFF \ -DUSE_HIP=OFF \ -DALL_ANALYZE=ON \ -DUSE_DOUBLE=OFF \ diff --git a/internal/clients/spts/build_configs/analyze_single_rocshmem b/internal/clients/spts/build_configs/analyze_single_rocshmem index f20fac1dd5..c542aec341 100755 --- a/internal/clients/spts/build_configs/analyze_single_rocshmem +++ b/internal/clients/spts/build_configs/analyze_single_rocshmem @@ -12,7 +12,7 @@ src_path=$(dirname "$(realpath $0)")/.. cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DUSE_ROC_SHMEM=ON \ + -DUSE_ROCSHMEM=ON \ -DUSE_HIP=ON \ -DALL_ANALYZE=ON \ -DUSE_DOUBLE=OFF \ diff --git a/internal/clients/spts/config.h.in b/internal/clients/spts/config.h.in index 268e54b01b..a9d4d814a2 100644 --- a/internal/clients/spts/config.h.in +++ b/internal/clients/spts/config.h.in @@ -1,4 +1,4 @@ -#cmakedefine USE_ROC_SHMEM +#cmakedefine USE_ROCSHMEM #cmakedefine USE_HIP #cmakedefine ALL_ANALYZE #cmakedefine USE_DOUBLE diff --git a/internal/clients/spts/spts_kernel.h b/internal/clients/spts/spts_kernel.h index 495acd4528..69a7c458bb 100644 --- a/internal/clients/spts/spts_kernel.h +++ b/internal/clients/spts/spts_kernel.h @@ -26,8 +26,8 @@ #include #include -#ifdef USE_ROC_SHMEM -#include "roc_shmem.hpp" +#ifdef USE_ROCSHMEM +#include "rocshmem.hpp" using namespace rocshmem; #endif @@ -991,7 +991,7 @@ inline FPTYPE cross_lane_reduction_three(FPTYPE temp_sum, unsigned int *row_max_ __global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1) amd_spts_analyze_and_solve( const size_t global_work_size, -#ifdef USE_ROC_SHMEM +#ifdef USE_ROCSHMEM const int this_pe, const int total_pes, unsigned int * __restrict__ shadowDoneArray, @@ -1002,9 +1002,9 @@ amd_spts_analyze_and_solve( // 1: Naive gets // 2: blocked puts // 3: put/get hybrid - int roc_shmem_algorithm, - int roc_shmem_put_block_size, - int roc_shmem_get_backoff_factor, + int rocshmem_algorithm, + int 
rocshmem_put_block_size, + int rocshmem_get_backoff_factor, int spts_block_size, #endif const FPTYPE * __restrict__ vals, @@ -1043,13 +1043,13 @@ amd_spts_analyze_and_solve( const unsigned int wg_lid = hipThreadIdx_x; const unsigned int lid = wg_lid % WF_SIZE; -#ifdef USE_ROC_SHMEM - __shared__ roc_shmem_ctx_t ctx; +#ifdef USE_ROCSHMEM + __shared__ rocshmem_ctx_t ctx; //if (wg_lid == OUTPUT_THREAD) { - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ROC_SHMEM_CTX_WG_PRIVATE, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx); __syncthreads(); #endif @@ -1061,7 +1061,7 @@ amd_spts_analyze_and_solve( // Actual row this wavefront will work on. const unsigned int local_row = local_first_row + local_offset; -#ifdef USE_ROC_SHMEM +#ifdef USE_ROCSHMEM // Get the global row for this wavefront assuming a row-cyclic // decomposition. Basically we need to account for other PEs here. int local_block_id = local_row / spts_block_size; @@ -1144,7 +1144,7 @@ amd_spts_analyze_and_solve( // While there are threads in this workgroup that have been unable to // get their input, loop and wait for the flag to exist. 
__asm__ volatile ("s_setprio 0"); -#ifdef USE_ROC_SHMEM +#ifdef USE_ROCSHMEM int target_pe = (local_col / spts_block_size) % total_pes; int backoff_counter = 0; bool need_remote_notify = true; @@ -1179,8 +1179,8 @@ amd_spts_analyze_and_solve( spin_times++; -#ifdef USE_ROC_SHMEM - if ((total_pes > 1) && (target_pe != this_pe) && (roc_shmem_algorithm == 1)) { +#ifdef USE_ROCSHMEM + if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 1)) { if (first_time) { if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0) need_comm = false; @@ -1188,12 +1188,12 @@ amd_spts_analyze_and_solve( first_time = false; if (need_comm) { - for (int i = 0; i < (backoff_counter * roc_shmem_get_backoff_factor); i++) + for (int i = 0; i < (backoff_counter * rocshmem_get_backoff_factor); i++) __asm__ volatile("s_sleep 127"); - roc_shmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe); - //roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe); + //rocshmem_ctx_quiet(ctx); __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n" "s_waitcnt vmcnt(0)" @@ -1203,7 +1203,7 @@ amd_spts_analyze_and_solve( if (local_done) { - roc_shmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe); + rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe); __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : @@ -1217,19 +1217,19 @@ amd_spts_analyze_and_solve( } } - if ((total_pes > 1) && (target_pe != this_pe) && (roc_shmem_algorithm == 3)) { + if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 3)) { if (need_remote_notify) { need_remote_notify = false; //if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0) //if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) == 0) { - roc_shmem_ctx_putmem_nbi(ctx, 
&reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe); + rocshmem_ctx_putmem_nbi(ctx, &reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe); //printf("Put 111 blockIDx %d threadID %d target_pe %d local_col %d oneBuf[0]= %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, oneBuf[0]); - roc_shmem_ctx_fence(ctx); + rocshmem_ctx_fence(ctx); //printf("fence 222 blockIDx %d threadID %d target_pe %d local_col %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col); - roc_shmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe); - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe); + rocshmem_ctx_quiet(ctx); //printf("Get 333 blockIDx %d threadID %d target_pe %d local_col %d shadowDone %d \n \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, shadowDoneArray[local_col]); __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n" @@ -1239,8 +1239,8 @@ amd_spts_analyze_and_solve( if (local_done) { - roc_shmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe); - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe); + rocshmem_ctx_quiet(ctx); __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[local_col]), @@ -1313,34 +1313,34 @@ amd_spts_analyze_and_solve( __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(row_max_depth)); asm volatile ("s_waitcnt vmcnt(0)\n\t"); -#ifdef USE_ROC_SHMEM - if (roc_shmem_algorithm == 2 && total_pes > 1) { - int CHUNK = roc_shmem_put_block_size; +#ifdef USE_ROCSHMEM + if (rocshmem_algorithm == 2 && total_pes > 1) { + int CHUNK = rocshmem_put_block_size; bool sendTime = true; int row_base = (row / CHUNK) * CHUNK; int num_done = atomicAdd(&shadowDoneArray[row_base], 1); 
sendTime = (num_done == (CHUNK - 1)); for(int p=0; p #include #include -#include +#include #include using namespace std; using namespace rocshmem; @@ -16,13 +16,13 @@ __device__ uint64_t timers[TIMERS] = {0}; __device__ uint64_t time_start; #define TIMERS_START() \ if(threadIdx.x == 0) {\ - time_start = roc_shmem_timer();\ + time_start = rocshmem_timer();\ } #define TIME(TIMER_NUM) \ if(threadIdx.x == 0) {\ - timers[TIMER_NUM] = roc_shmem_timer() - time_start;\ - time_start = roc_shmem_timer();\ + timers[TIMER_NUM] = rocshmem_timer() - time_start;\ + time_start = rocshmem_timer();\ } #define OUTPUT_TIME() \ @@ -41,11 +41,11 @@ __device__ uint64_t time_start; #define OUTPUT_TIME() #endif -__device__ __inline__ void alltoall(roc_shmem_ctx_t &ctx, - roc_shmem_team_t team, +__device__ __inline__ void alltoall(rocshmem_ctx_t &ctx, + rocshmem_team_t team, int *dst, int *src) { // Perform alltoall - roc_shmem_ctx_int_wg_alltoall(ctx, + rocshmem_ctx_int_wg_alltoall(ctx, team, dst, // T* dest src, // const T* source @@ -56,18 +56,18 @@ __global__ void sort(volatile int *keys, int *keyBuffer1, int *keyBuffer2, int *sendCount, int *recvCount, int *sendOffset, int *recvOffset, int *outputKeys, - size_t size, roc_shmem_team_t team, + size_t size, rocshmem_team_t team, int max_iters) { - __shared__ roc_shmem_ctx_t ctx; + __shared__ rocshmem_ctx_t ctx; __shared__ int bucketCounter[MAX_PES]; __shared__ int bucketPtr[MAX_PES]; __shared__ int total_size; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ROC_SHMEM_CTX_WG_PRIVATE, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx); - int n_pes = roc_shmem_ctx_n_pes(ctx); - int my_pe = roc_shmem_my_pe(); + int n_pes = rocshmem_ctx_n_pes(ctx); + int my_pe = rocshmem_my_pe(); int buckets = n_pes; int tid = threadIdx.x; // + blockDim.x * blockIdx.x; @@ -116,9 +116,9 @@ __global__ void sort(volatile int *keys, int *keyBuffer1, int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1; keyBuffer1[loc] = 
keys[i]; } - roc_shmem_ctx_threadfence_system(ctx); + rocshmem_ctx_threadfence_system(ctx); // Force sync to wait for all PEs to update bucket sizes - roc_shmem_ctx_wg_team_sync(ctx, team); + rocshmem_ctx_wg_team_sync(ctx, team); TIME(3) // Let all PEs know how many keys you wish to send alltoall(ctx, team, recvCount, sendCount); @@ -129,11 +129,11 @@ __global__ void sort(volatile int *keys, int *keyBuffer1, if(threadIdx.x == 0) { total_size = 0; for(int i = 0; i < buckets; ++i) { - roc_shmem_int_get_nbi(&keyBuffer2[total_size], + rocshmem_int_get_nbi(&keyBuffer2[total_size], &keyBuffer1[recvOffset[i]], recvCount[i], i); total_size += recvCount[i]; } - roc_shmem_quiet(); + rocshmem_quiet(); } for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x) outputKeys[i] = 0; @@ -163,14 +163,14 @@ __global__ void sort(volatile int *keys, int *keyBuffer1, TIME(7) } OUTPUT_TIME() - roc_shmem_wg_ctx_destroy(ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(ctx); + rocshmem_wg_finalize(); } bool verify(int *outputKeys, int *keyBuffer2, size_t size) { - int num_pes = roc_shmem_n_pes(); - int my_pe = roc_shmem_my_pe(); + int num_pes = rocshmem_n_pes(); + int my_pe = rocshmem_my_pe(); MPI_Status status; MPI_Request request; @@ -228,8 +228,8 @@ void initGPU() { // Calculation for local rank, taken from rccl-tests int localRank = 0; - int proc = roc_shmem_my_pe(); - int nProcs = roc_shmem_n_pes(); + int proc = rocshmem_my_pe(); + int nProcs = rocshmem_n_pes(); char hostname[1024]; gethostname(hostname, 1024); for (int i=0; i< 1024; i++) { @@ -261,12 +261,12 @@ void initGPU() int main(int argc, char *argv[]) { - // Init roc_shmem stuff + // Init rocshmem stuff initGPU(); - roc_shmem_init(NUM_WGS); - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); - roc_shmem_team_t team_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, + rocshmem_init(NUM_WGS); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + rocshmem_team_t 
team_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, @@ -278,8 +278,8 @@ int main(int argc, char *argv[]) if(argc > 1) iterations = atoi(argv[1]); - int num_pes = roc_shmem_n_pes(); - int my_pe = roc_shmem_my_pe(); + int num_pes = rocshmem_n_pes(); + int my_pe = rocshmem_my_pe(); // Configure input and outputs size_t size = 1024; //atoi(argv[2]); @@ -298,17 +298,17 @@ int main(int argc, char *argv[]) // Init buffers int *keyBuffer1, *keyBuffer2; - keyBuffer1 = (int*)roc_shmem_malloc(sizeof(int) * size); - keyBuffer2 = (int*)roc_shmem_malloc(sizeof(int) * size * 4); + keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size); + keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4); int *sendCount, *recvCount, *sendOffset, *recvOffset; - sendCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES); - recvCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES); - sendOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES); - recvOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES); + sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); + recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); + sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); + recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); // Untimed run - roc_shmem_barrier_all(); + rocshmem_barrier_all(); sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2, sendCount, recvCount, sendOffset, recvOffset, outputKeys, size, team_world_dup, 1); @@ -321,7 +321,7 @@ int main(int argc, char *argv[]) } // Timed run - roc_shmem_barrier_all(); + rocshmem_barrier_all(); auto time_start = TIME_NOW; sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2, sendCount, recvCount, sendOffset, recvOffset, @@ -347,12 +347,12 @@ int main(int argc, char *argv[]) // Clean up hipFree(keys); hipFree(outputKeys); - roc_shmem_free(keyBuffer1); - roc_shmem_free(keyBuffer2); - roc_shmem_free(sendCount); - roc_shmem_free(recvCount); - roc_shmem_free(sendOffset); - 
roc_shmem_free(recvOffset); - roc_shmem_finalize(); + rocshmem_free(keyBuffer1); + rocshmem_free(keyBuffer2); + rocshmem_free(sendCount); + rocshmem_free(recvCount); + rocshmem_free(sendOffset); + rocshmem_free(recvOffset); + rocshmem_finalize(); return 0; -} \ No newline at end of file +} diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh index 5fa93051ef..de02721463 100755 --- a/scripts/functional_tests/driver.sh +++ b/scripts/functional_tests/driver.sh @@ -47,88 +47,88 @@ case $2 in ########################################################################### *"serial") echo "get_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log check get_n2_w1_z1_1MB echo "getnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log check getnbi_n2_w1_z1_1MB echo "put_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log check put_n2_w1_z1_1MB echo "putnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log check putnbi_n2_w1_z1_1MB echo "wg_get_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 28 > $3/wg_get_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 28 > $3/wg_get_n2_w1_z64_1MB.log check wg_get_n2_w1_z1_1MB echo "wg_getnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 29 > 
$3/wg_getnbi_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_n2_w1_z64_1MB.log check wg_getnbi_n2_w1_z1_1MB echo "wg_put_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 30 > $3/wg_put_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 30 > $3/wg_put_n2_w1_z64_1MB.log check wg_put_n2_w1_z1_1MB echo "wg_putnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_n2_w1_z64_1MB.log check wg_putnbi_n2_w1_z1_1MB echo "wg_get_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 28 > $3/wg_get_tiled_n2_w2_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 28 > $3/wg_get_tiled_n2_w2_z64_1MB.log check wg_get_tiled_n2_w1_z1_1MB echo "wg_getnbi_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_tiled_n2_w2_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_tiled_n2_w2_z64_1MB.log check wg_getnbi_tiled_n2_w1_z1_1MB echo "wg_put_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 30 > $3/wg_put_tiled_n2_w2_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 30 > $3/wg_put_tiled_n2_w2_z64_1MB.log check wg_put_tiled_n2_w1_z1_1MB echo "wg_putnbi_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_tiled_n2_w2_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_tiled_n2_w2_z64_1MB.log check wg_putnbi_tiled_n2_w1_z1_1MB echo "wave_get_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 32 > 
$3/wave_get_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 32 > $3/wave_get_n2_w1_z64_1MB.log check wave_get_n2_w1_z1_1MB echo "wave_getnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 33 > $3/wave_getnbi_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 33 > $3/wave_getnbi_n2_w1_z64_1MB.log check wave_getnbi_n2_w1_z1_1MB echo "wave_put_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 34 > $3/wave_put_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 34 > $3/wave_put_n2_w1_z64_1MB.log check wave_put_n2_w1_z1_1MB echo "wave_putnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 35 > $3/wave_putnbi_n2_w1_z64_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 35 > $3/wave_putnbi_n2_w1_z64_1MB.log check wave_putnbi_n2_w1_z1_1MB echo "wave_get_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 32 > $3/wave_get_tiled_n2_w2_z128_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 32 > $3/wave_get_tiled_n2_w2_z128_1MB.log check wave_get_tiled_n2_w1_z1_1MB echo "wave_getnbi_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 33 > $3/wave_getnbi_tiled_n2_w2_z128_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 33 > $3/wave_getnbi_tiled_n2_w2_z128_1MB.log check wave_getnbi_tiled_n2_w1_z1_1MB echo "wave_put_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 34 > $3/wave_put_tiled_n2_w2_z128_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 34 > $3/wave_put_tiled_n2_w2_z128_1MB.log check wave_put_tiled_n2_w1_z1_1MB echo "wave_putnbi_tiled_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 
128 -s 1048576 -a 35 > $3/wave_putnbi_tiled_n2_w2_z128_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 35 > $3/wave_putnbi_tiled_n2_w2_z128_1MB.log check wave_putnbi_tiled_n2_w1_z1_1MB echo "amofadd_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log check amofadd_n2_w1_z1 echo "amofinc_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log check amofinc_n2_w1_z1 echo "amofetch_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log check amofetch_n2_w1_z1 echo "amofcswap_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log check amofcswap_n2_w1_z1 echo "amoadd_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log check amoadd_n2_w1_z1 echo "amoinc_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log check amoinc_n2_w1_z1 # echo "pingpong_n2_w1" - # ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log + # ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log # check pingpong_n2_w1 echo "amoset_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log check amoset_n2_w1_z1 ;; @@ 
-137,88 +137,88 @@ case $2 in ########################################################################### *"short") echo "get_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log check get_n2_w16_z128_8B echo "getnbi_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log check getnbi_n2_w16_z128_8B echo "put_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log check put_n2_w16_z128_8B echo "putnbi_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log check putnbi_n2_w16_z128_8B echo "wg_get_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 28 > $3/wg_get_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 28 > $3/wg_get_n2_w1_z64_8B.log check wg_get_n2_w1_z64_8B echo "wg_getnbi_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 29 > $3/wg_getnbi_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 29 > $3/wg_getnbi_n2_w1_z64_8B.log check wg_getnbi_n2_w1_z64_8B echo "wg_put_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 30 > $3/wg_put_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 30 > $3/wg_put_n2_w1_z64_8B.log check wg_put_n2_w1_z64_8B echo "wg_putnbi_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 31 > 
$3/wg_putnbi_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 31 > $3/wg_putnbi_n2_w1_z64_8B.log check wg_putnbi_n2_w1_z64_8B echo "wg_get_tiled_n2_w16_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 28 > $3/wg_get_tiled_n2_w16_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 28 > $3/wg_get_tiled_n2_w16_z64_8B.log check wg_get_tiled_n2_w16_z64_8B echo "wg_getnbi_tiled_n2_w16_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 29 > $3/wg_getnbi_tiled_n2_w16_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 29 > $3/wg_getnbi_tiled_n2_w16_z64_8B.log check wg_getnbi_tiled_n2_w16_z64_8B echo "wg_put_tiled_n2_w16_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 30 > $3/wg_put_tiled_n2_w16_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 30 > $3/wg_put_tiled_n2_w16_z64_8B.log check wg_put_tiled_n2_w16_z64_8B echo "wg_putnbi_tiled_n2_w16_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 31 > $3/wg_putnbi_tiled_n2_w16_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 31 > $3/wg_putnbi_tiled_n2_w16_z64_8B.log check wg_putnbi_tiled_n2_w16_z64_8B echo "wave_get_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 32 > $3/wave_get_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 32 > $3/wave_get_n2_w1_z64_8B.log check wave_get_n2_w1_z64_8B echo "wave_getnbi_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 33 > $3/wave_getnbi_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 33 > $3/wave_getnbi_n2_w1_z64_8B.log check wave_getnbi_n2_w1_z64_8B echo "wave_put_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 34 > $3/wave_put_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 
$1 -w 1 -z 64 -s 8 -a 34 > $3/wave_put_n2_w1_z64_8B.log check wave_put_n2_w1_z64_8B echo "wave_putnbi_n2_w1_z64_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 35 > $3/wave_putnbi_n2_w1_z64_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 35 > $3/wave_putnbi_n2_w1_z64_8B.log check wave_putnbi_n2_w1_z64_8B echo "wave_get_tiled_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 32 > $3/wave_get_tiled_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 32 > $3/wave_get_tiled_n2_w16_z128_8B.log check wave_get_tiled_n2_w16_z128_8B echo "wave_getnbi_tiled_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 33 > $3/wave_getnbi_tiled_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 33 > $3/wave_getnbi_tiled_n2_w16_z128_8B.log check wave_getnbi_tiled_n2_w16_z128_8B echo "wave_put_tiled_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 34 > $3/wave_put_tiled_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 34 > $3/wave_put_tiled_n2_w16_z128_8B.log check wave_put_tiled_n2_w16_z128_8B echo "wave_putnbi_tiled_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 35 > $3/wave_putnbi_tiled_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 35 > $3/wave_putnbi_tiled_n2_w16_z128_8B.log check wave_putnbi_tiled_n2_w16_z128_8B echo "amofadd_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log check amofadd_n2_w8_z1 echo "amofinc_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log check amofinc_n2_w8_z1 
echo "amofetch_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log check amofetch_n2_w8_z1 echo "amofcswap_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log check amofcswap_n2_w8_z1 echo "amoadd_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log check amoadd_n2_w8_z1 echo "amoinc_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log check amoinc_n2_w8_z1 # echo "pingpong_n2_w1" - # ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log + # ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log # check pingpong_n2_w1 echo "amoset_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log check amoset_n2_w8_z1 ;; @@ -228,246 +228,246 @@ case $2 in *"exhaustive") ############################### GET ################################### echo "get_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log check get_n2_w1_z1_1MB echo "get_n2_w1_z1024_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 0 > $3/get_n2_w1_z1024_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 0 > $3/get_n2_w1_z1024_512B.log check get_n2_w1_z1024_512B echo "get_n2_w8_z1_1MB" - 
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 0 > $3/get_n2_w8_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 0 > $3/get_n2_w8_z1_1MB.log check get_n2_w8_z1_1MB echo "get_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log check get_n2_w16_z128_8B echo "get_n2_w32_z256_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 0 > $3/get_n2_w32_z256_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 0 > $3/get_n2_w32_z256_512B.log check get_n2_w32_z256_512B echo "get_n2_w64_z1024_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 0 > $3/get_n2_w64_z1024_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 0 > $3/get_n2_w64_z1024_8B.log check get_n2_w64_z1024_8B ############################### GETNBI ################################ echo "getnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log check getnbi_n2_w1_z1_1MB echo "getnbi_n2_w1_z1024_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 1 > $3/getnbi_n2_w1_z1024_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 1 > $3/getnbi_n2_w1_z1024_512B.log check getnbi_n2_w1_z1024_512B echo "getnbi_n2_w8_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w8_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w8_z1_1MB.log check getnbi_n2_w8_z1_1MB echo "getnbi_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 
mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log check getnbi_n2_w16_z128_8B echo "getnbi_n2_w32_z256_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 1 > $3/getnbi_n2_w32_z256_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 1 > $3/getnbi_n2_w32_z256_512B.log check getnbi_n2_w32_z256_512B echo "getnbi_n2_w64_z1024_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 1 > $3/getnbi_n2_w64_z1024_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 1 > $3/getnbi_n2_w64_z1024_8B.log check getnbi_n2_w64_z1024_8B ############################### PUT ################################### echo "put_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log check put_n2_w1_z1_1MB echo "put_n2_w1_z1024_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 2 > $3/put_n2_w1_z1024_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 2 > $3/put_n2_w1_z1024_512B.log check put_n2_w1_z1024_512B echo "put_n2_w8_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 2 > $3/put_n2_w8_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 2 > $3/put_n2_w8_z1_1MB.log check put_n2_w8_z1_1MB echo "put_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log check put_n2_w16_z128_8B echo "put_n2_w32_z256_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 2 > $3/put_n2_w32_z256_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 2 > $3/put_n2_w32_z256_512B.log check put_n2_w32_z256_512B echo "put_n2_w64_z1024_8B" - 
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 2 > $3/put_n2_w64_z1024_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 2 > $3/put_n2_w64_z1024_8B.log check put_n2_w64_z1024_8B ############################### PUTNBI ################################ echo "putnbi_n2_w1_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log check putnbi_n2_w1_z1_1MB echo "putnbi_n2_w1_z1024_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 3 > $3/putnbi_n2_w1_z1024_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 3 > $3/putnbi_n2_w1_z1024_512B.log check putnbi_n2_w1_z1024_512B echo "putnbi_n2_w8_z1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w8_z1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w8_z1_1MB.log check putnbi_n2_w8_z1_1MB echo "putnbi_n2_w16_z128_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log check putnbi_n2_w16_z128_8B echo "putnbi_n2_w32_z256_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 3 > $3/putnbi_n2_w32_z256_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 3 > $3/putnbi_n2_w32_z256_512B.log check putnbi_n2_w32_z256_512B echo "putnbi_n2_w64_z1024_8B" - ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 3 > $3/putnbi_n2_w64_z1024_8B.log + ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 3 > $3/putnbi_n2_w64_z1024_8B.log check putnbi_n2_w64_z1024_8B ############################# REDUCTION ############################## echo "reduction_n2_w1_z1_32K" - 
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 32768 -a 5 > $3/reduction_n2_w1_z1_32K.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 32768 -a 5 > $3/reduction_n2_w1_z1_32K.log check reduction_n2_w1_z1_32K echo "reduction_n2_w8_z1_32K" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 32768 -a 5 > $3/reduction_n2_w8_z1_32K.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 32768 -a 5 > $3/reduction_n2_w8_z1_32K.log check reduction_n2_w8_z1_32K echo "reduction_n2_w32_z1_32K" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -s 32768 -a 5 > $3/reduction_n2_w32_z1_32K.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -s 32768 -a 5 > $3/reduction_n2_w32_z1_32K.log check reduction_n2_w32_z1_32K ############################## AMOFADD ############################### echo "amofadd_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log check amofadd_n2_w1_z1 echo "amofadd_n2_w1_z1024" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 6 > $3/amofadd_n2_w1_z1024.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 6 > $3/amofadd_n2_w1_z1024.log check amofadd_n2_w1_z1024 echo "amofadd_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log check amofadd_n2_w8_z1 echo "amofadd_n2_w32_z128" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 6 > $3/amofadd_n2_w32_z128.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 6 > $3/amofadd_n2_w32_z128.log check amofadd_n2_w32_z128 ############################## AMOFINC ############################### echo "amofinc_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 
-z 1 -a 7 > $3/amofinc_n2_w1_z1.log check amofinc_n2_w1_z1 echo "amofinc_n2_w1_z1024" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 7 > $3/amofinc_n2_w1_z1024.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 7 > $3/amofinc_n2_w1_z1024.log check amofinc_n2_w1_z1024 echo "amofinc_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log check amofinc_n2_w8_z1 echo "amofinc_n2_w32_z128" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 7 > $3/amofinc_n2_w32_z128.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 7 > $3/amofinc_n2_w32_z128.log check amofinc_n2_w32_z128 ############################ AMOFETCH ################################ echo "amofetch_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log check amofetch_n2_w1_z1 echo "amofetch_n2_w1_z1024" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 8 > $3/amofetch_n2_w1_z1024.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 8 > $3/amofetch_n2_w1_z1024.log check amofetch_n2_w1_z1024 echo "amofetch_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log check amofetch_n2_w8_z1 echo "amofetch_n2_w32_z128" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 8 > $3/amofetch_n2_w32_z128.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 8 > $3/amofetch_n2_w32_z128.log check amofetch_n2_w32_z128 ########################### AMOFCSWAP ################################ echo "amofcswap_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun 
-np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log check amofcswap_n2_w1_z1 echo "amofcswap_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log check amofcswap_n2_w8_z1 echo "amofcswap_n2_w32_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 9 > $3/amofcswap_n2_w32_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 9 > $3/amofcswap_n2_w32_z1.log check amofcswap_n2_w32_z1 ############################# AMOADD ################################ echo "amoadd_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log check amoadd_n2_w1_z1 echo "amoadd_n2_w1_z1024" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 10 > $3/amoadd_n2_w1_z1024.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 10 > $3/amoadd_n2_w1_z1024.log check amoadd_n2_w1_z1024 echo "amoadd_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log check amoadd_n2_w8_z1 echo "amoadd_n2_w32_z128" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 10 > $3/amoadd_n2_w32_z128.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 10 > $3/amoadd_n2_w32_z128.log check amoadd_n2_w32_z128 ############################# AMOINC ################################ echo "amoinc_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log check amoinc_n2_w1_z1 echo "amoinc_n2_w1_z1024" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 11 > $3/amoinc_n2_w1_z1024.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 
1024 -a 11 > $3/amoinc_n2_w1_z1024.log check amoinc_n2_w1_z1024 echo "amoinc_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log check amoinc_n2_w8_z1 echo "amoinc_n2_w32_z128" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 11 > $3/amoinc_n2_w32_z128.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 11 > $3/amoinc_n2_w32_z128.log check amoinc_n2_w32_z128 ############################## INIT ################################# echo "init_n2" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -a 13 > $3/init_n2.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -a 13 > $3/init_n2.log check init_n2 ########################### PINGPONG ################################ echo "pingpong_n2_w1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log check pingpong_n2_w1 echo "pingpong_n2_w8" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 14 > $3/pingpong_n2_w8.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 14 > $3/pingpong_n2_w8.log check pingpong_n2_w8 echo "pingpong_n2_w32" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 14 > $3/pingpong_n2_w32.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 14 > $3/pingpong_n2_w32.log check pingpong_n2_w32 ############################ BARRIER ################################ echo "barrier_n2_w1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 17 > $3/barrier_n2_w1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 17 > $3/barrier_n2_w1.log check barrier_n2_w1 echo "barrier_n2_w8" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 17 > $3/barrier_n2_w8.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 17 > $3/barrier_n2_w8.log check barrier_n2_w8 echo "barrier_n2_w32" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 
$1 -w 32 -a 17 > $3/barrier_n2_w32.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 17 > $3/barrier_n2_w32.log check barrier_n2_w32 ############################ SYNCALL ################################ echo "syncall_n2_w1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 18 > $3/syncall_n2_w1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 18 > $3/syncall_n2_w1.log check syncall_n2_w1 echo "syncall_n2_w8" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 18 > $3/syncall_n2_w8.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 18 > $3/syncall_n2_w8.log check syncall_n2_w8 echo "syncall_n2_w32" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 18 > $3/syncall_n2_w32.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 18 > $3/syncall_n2_w32.log check syncall_n2_w32 ############################# SYNC ################################## echo "sync_n2_w1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 19 > $3/sync_n2_w1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 19 > $3/sync_n2_w1.log check sync_n2_w1 echo "sync_n2_w8" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 19 > $3/sync_n2_w8.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 19 > $3/sync_n2_w8.log check sync_n2_w8 echo "sync_n2_w32" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 19 > $3/sync_n2_w32.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 19 > $3/sync_n2_w32.log check sync_n2_w32 ########################### FCOLLECT ################################ echo "fcollect_n2_w1_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 22 > $3/fcollect_n2_w1_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 22 > $3/fcollect_n2_w1_512B.log check fcollect_n2_w1_512B echo "fcollect_n2_w8_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 22 > $3/fcollect_n2_w8_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 22 > $3/fcollect_n2_w8_512B.log 
check fcollect_n2_w8_512B echo "fcollect_n2_w32_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 22 > $3/fcollect_n2_w32_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 22 > $3/fcollect_n2_w32_512B.log check fcollect_n2_w32_512B ########################### ALLTOALL ################################ echo "alltoall_n2_w1_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 23 > $3/alltoall_n2_w1_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 23 > $3/alltoall_n2_w1_512B.log check alltoall_n2_w1_512B echo "alltoall_n2_w8_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 23 > $3/alltoall_n2_w8_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 23 > $3/alltoall_n2_w8_512B.log check alltoall_n2_w8_512B echo "alltoall_n2_w32_512B" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 23 > $3/alltoall_n2_w32_512B.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 23 > $3/alltoall_n2_w32_512B.log check alltoall_n2_w32_512B ########################## TEAMGETNBI ############################### echo "teamgetnbi_n2_w1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 39 > $3/teamgetnbi_n2_w1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 39 > $3/teamgetnbi_n2_w1_1MB.log check teamgetnbi_n2_w1_1MB ########################## TEAMPUTNBI ############################### echo "teamputnbi_n2_w1_1MB" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 41 > $3/teamputnbi_n2_w1_1MB.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 41 > $3/teamputnbi_n2_w1_1MB.log check teamputnbi_n2_w1_1MB ############################ AMOSET ################################# echo "amoset_n2_w1_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log check 
amoset_n2_w1_z1 echo "amoset_n2_w8_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log check amoset_n2_w8_z1 echo "amoset_n2_w32_z1" - ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 44 > $3/amoset_n2_w32_z1.log + ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 44 > $3/amoset_n2_w32_z1.log check amoset_n2_w32_z1 ;; diff --git a/scripts/functional_tests/shmem_allLib_build_and_test.sh b/scripts/functional_tests/shmem_allLib_build_and_test.sh index e9c1af0c4d..e9c3c2ce01 100755 --- a/scripts/functional_tests/shmem_allLib_build_and_test.sh +++ b/scripts/functional_tests/shmem_allLib_build_and_test.sh @@ -82,10 +82,10 @@ do # test exeuction based on lib if [ "$libnm" == "ro_net" ] then - ROC_SHMEM_RO=1 + ROCSHMEM_RO=1 ROC_NET_CPU_QUEUE=1 UCX_TLS=rc - #echo $ROC_SHMEM_RO"--"$ROC_NET_CPU_QUEUE "--"$UCX_TLS + #echo $ROCSHMEM_RO"--"$ROC_NET_CPU_QUEUE "--"$UCX_TLS ../scripts/functional_tests/driver.sh tests/functional_tests/rocshmem_example_driver $threadType . else ../scripts/functional_tests/driver.sh tests/functional_tests/rocshmem_example_driver $threadType . 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5c31872765..3a02f783b5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,8 +31,8 @@ target_sources( context_host.cpp context_device.cpp mpi_init_singleton.cpp - roc_shmem_gpu.cpp - roc_shmem.cpp + rocshmem_gpu.cpp + rocshmem.cpp team.cpp team_tracker.cpp util.cpp diff --git a/src/backend_bc.cpp b/src/backend_bc.cpp index c55b1cd2d1..23d085070a 100644 --- a/src/backend_bc.cpp +++ b/src/backend_bc.cpp @@ -202,7 +202,7 @@ void Backend::reset_stats() { reset_backend_stats(); } -__device__ bool Backend::create_ctx(int64_t option, roc_shmem_ctx_t* ctx) { +__device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) { #ifdef USE_GPU_IB return static_cast(this)->create_ctx(option, ctx); #elif defined(USE_RO) @@ -212,7 +212,7 @@ __device__ bool Backend::create_ctx(int64_t option, roc_shmem_ctx_t* ctx) { #endif } -__device__ void Backend::destroy_ctx(roc_shmem_ctx_t* ctx) { +__device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) { #ifdef USE_GPU_IB static_cast(this)->destroy_ctx(ctx); #elif defined(USE_RO) diff --git a/src/backend_bc.hpp b/src/backend_bc.hpp index 88cc6b3ba3..d635f5bc5c 100644 --- a/src/backend_bc.hpp +++ b/src/backend_bc.hpp @@ -36,7 +36,7 @@ #include #include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "backend_type.hpp" #include "ipc_policy.hpp" #include "memory/symmetric_heap.hpp" @@ -56,7 +56,7 @@ class TeamInfo; * It uses this state to populate Context objects which the GPU may use to * perform networking operations. * - * The roc_shmem.cpp implementation file wraps many the Backend public + * The rocshmem.cpp implementation file wraps many the Backend public * members to implement the library's public API. 
*/ class Backend { @@ -74,8 +74,8 @@ class Backend { */ virtual ~Backend(); - __device__ bool create_ctx(int64_t option, roc_shmem_ctx_t* ctx); - __device__ void destroy_ctx(roc_shmem_ctx_t* ctx); + __device__ bool create_ctx(int64_t option, rocshmem_ctx_t* ctx); + __device__ void destroy_ctx(rocshmem_ctx_t* ctx); /** * @brief Create a new team object and initialize it. @@ -93,14 +93,14 @@ class Backend { TeamInfo* team_info_wrt_parent, TeamInfo* team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t* new_team) = 0; + rocshmem_team_t* new_team) = 0; /** * @brief Destruct a team * * @param[in] team Handle to the team to destroy. */ - virtual void team_destroy(roc_shmem_team_t team) = 0; + virtual void team_destroy(rocshmem_team_t team) = 0; /** * @brief Reports processing element number id. diff --git a/src/constants.hpp b/src/constants.hpp index 1c1cfc1e5c..706ed9ede2 100644 --- a/src/constants.hpp +++ b/src/constants.hpp @@ -26,7 +26,7 @@ /** * @file constants.hpp * - * @brief Contains global constants for ROCSHMEM library + * @brief Contains global constants for rocSHMEM library */ namespace rocshmem { diff --git a/src/containers/helper_macros.hpp b/src/containers/helper_macros.hpp index 50cc33b2d1..28a0ddbfeb 100644 --- a/src/containers/helper_macros.hpp +++ b/src/containers/helper_macros.hpp @@ -30,11 +30,11 @@ #include #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" -#define BARRIER() rocshmem::roc_shmem_wg_barrier_all() -#define RANK rocshmem::roc_shmem_my_pe() -#define NPES rocshmem::roc_shmem_n_pes() +#define BARRIER() rocshmem::rocshmem_wg_barrier_all() +#define RANK rocshmem::rocshmem_my_pe() +#define NPES rocshmem::rocshmem_n_pes() #define PE_BITS ((uint64_t)ceil(log(NPES) / log(2))) #define PE_OF(X) ((X) >> (64 - PE_BITS)) diff --git a/src/context.hpp b/src/context.hpp index 4516a5d0a1..850ab6ef0a 100644 --- a/src/context.hpp +++ b/src/context.hpp @@ -67,7 +67,7 @@ class Context { * 
just removing the dispatch implementations. * * No comments for these guys since its basically the same as in the - * roc_shmem.hpp public header. + * rocshmem.hpp public header. */ /************************************************************************** @@ -139,7 +139,7 @@ class Context { __device__ void sync_all(); - __device__ void sync(roc_shmem_team_t team); + __device__ void sync(rocshmem_team_t team); template __device__ T amo_fetch(void* dst, T value, T cond, int pe, uint8_t atomic_op); @@ -186,13 +186,13 @@ class Context { template __device__ T g(T* source, int pe); - template + template __device__ void to_all(T* dest, const T* source, int nreduce, int PE_start, int logPE_stride, int PE_size, T* pWrk, long* pSync); // NOLINT(runtime/int) - template - __device__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce); + template + __device__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce); template __device__ void put(T* dest, const T* source, size_t nelems, int pe); @@ -207,15 +207,15 @@ class Context { __device__ void get_nbi(T* dest, const T* source, size_t nelems, int pe); template - __device__ void alltoall(roc_shmem_team_t team, T* dest, const T* source, + __device__ void alltoall(rocshmem_team_t team, T* dest, const T* source, int nelems); template - __device__ void fcollect(roc_shmem_team_t team, T* dest, const T* source, + __device__ void fcollect(rocshmem_team_t team, T* dest, const T* source, int nelems); template - __device__ void broadcast(roc_shmem_team_t team, T* dest, const T* source, + __device__ void broadcast(rocshmem_team_t team, T* dest, const T* source, int nelems, int pe_root); template @@ -351,16 +351,16 @@ class Context { long* p_sync); // NOLINT(runtime/int) template - __host__ void broadcast(roc_shmem_team_t team, T* dest, const T* source, + __host__ void broadcast(rocshmem_team_t team, T* dest, const T* source, int nelems, int pe_root); - template + template __host__ void to_all(T* 
dest, const T* source, int nreduce, int PE_start, int logPE_stride, int PE_size, T* pWrk, long* pSync); // NOLINT(runtime/int) - template - __host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce); + template + __host__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce); template __host__ void wait_until(T *ivars, int cmp, T val); diff --git a/src/context_device.cpp b/src/context_device.cpp index 33f468d72d..33265396bb 100644 --- a/src/context_device.cpp +++ b/src/context_device.cpp @@ -154,7 +154,7 @@ __device__ void Context::sync_all() { DISPATCH(sync_all()); } -__device__ void Context::sync(roc_shmem_team_t team) { +__device__ void Context::sync(rocshmem_team_t team) { ctxStats.incStat(NUM_SYNC_ALL); DISPATCH(sync(team)); diff --git a/src/context_tmpl_device.hpp b/src/context_tmpl_device.hpp index b504fce229..5c32ba19db 100644 --- a/src/context_tmpl_device.hpp +++ b/src/context_tmpl_device.hpp @@ -62,7 +62,7 @@ __device__ T Context::g(T *source, int pe) { } // The only way to get multi-arg templates to feed into a macro -template +template __device__ void Context::to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, @@ -79,11 +79,11 @@ __device__ void Context::to_all(T *dest, const T *source, int nreduce, PE_size, pWrk, pSync)); } -template -__device__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source, +template +__device__ int Context::reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce) { if (nreduce == 0) { - return ROC_SHMEM_SUCCESS; + return ROCSHMEM_SUCCESS; } if (is_thread_zero_in_block()) { @@ -140,7 +140,7 @@ __device__ void Context::get_nbi(T *dest, const T *source, size_t nelems, } template -__device__ void Context::alltoall(roc_shmem_team_t team, T *dest, +__device__ void Context::alltoall(rocshmem_team_t team, T *dest, const T *source, int nelems) { if (nelems == 0) { return; @@ -154,7 +154,7 @@ __device__ void 
Context::alltoall(roc_shmem_team_t team, T *dest, } template -__device__ void Context::fcollect(roc_shmem_team_t team, T *dest, +__device__ void Context::fcollect(rocshmem_team_t team, T *dest, const T *source, int nelems) { if (nelems == 0) { return; @@ -168,7 +168,7 @@ __device__ void Context::fcollect(roc_shmem_team_t team, T *dest, } template -__device__ void Context::broadcast(roc_shmem_team_t team, T *dest, +__device__ void Context::broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root) { if (nelems == 0) { return; @@ -340,32 +340,32 @@ __device__ __forceinline__ int Context::test(T *ivars, int cmp, int ret = 0; volatile T *vol_ivars = reinterpret_cast(ivars); switch (cmp) { - case ROC_SHMEM_CMP_EQ: + case ROCSHMEM_CMP_EQ: if (uncached_load(vol_ivars) == val) { ret = 1; } break; - case ROC_SHMEM_CMP_NE: + case ROCSHMEM_CMP_NE: if (uncached_load(vol_ivars) != val) { ret = 1; } break; - case ROC_SHMEM_CMP_GT: + case ROCSHMEM_CMP_GT: if (uncached_load(vol_ivars) > val) { ret = 1; } break; - case ROC_SHMEM_CMP_GE: + case ROCSHMEM_CMP_GE: if (uncached_load(vol_ivars) >= val) { ret = 1; } break; - case ROC_SHMEM_CMP_LT: + case ROCSHMEM_CMP_LT: if (uncached_load(vol_ivars) < val) { ret = 1; } break; - case ROC_SHMEM_CMP_LE: + case ROCSHMEM_CMP_LE: if (uncached_load(vol_ivars) <= val) { ret = 1; } diff --git a/src/context_tmpl_host.hpp b/src/context_tmpl_host.hpp index e8cd2f23b9..ac226e1c83 100644 --- a/src/context_tmpl_host.hpp +++ b/src/context_tmpl_host.hpp @@ -194,7 +194,7 @@ __host__ void Context::broadcast(T *dest, const T *source, int nelems, } template -__host__ void Context::broadcast(roc_shmem_team_t team, T *dest, +__host__ void Context::broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root) { // NOLINT(runtime/int) if (nelems == 0) { @@ -206,7 +206,7 @@ __host__ void Context::broadcast(roc_shmem_team_t team, T *dest, HOST_DISPATCH(broadcast(team, dest, source, nelems, pe_root)); } -template 
+template __host__ void Context::to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, @@ -221,11 +221,11 @@ __host__ void Context::to_all(T *dest, const T *source, int nreduce, logPE_stride, PE_size, pWrk, pSync)); } -template -__host__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source, +template +__host__ int Context::reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce) { // NOLINT(runtime/int) if (nreduce == 0) { - return ROC_SHMEM_SUCCESS; + return ROCSHMEM_SUCCESS; } ctxHostStats.incStat(NUM_HOST_TO_ALL); diff --git a/src/fence_policy.hpp b/src/fence_policy.hpp index c1e57f24bc..56725837c3 100644 --- a/src/fence_policy.hpp +++ b/src/fence_policy.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_FENCE_POLICY_HPP_ #define LIBRARY_SRC_FENCE_POLICY_HPP_ -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" namespace rocshmem { @@ -43,7 +43,7 @@ class Fence { * @param[in] options interpreted as a bitfield using bitwise operations */ __host__ __device__ Fence(long option) { - if (option & ROC_SHMEM_CTX_NOSTORE) { + if (option & ROCSHMEM_CTX_NOSTORE) { flush_ = false; } } diff --git a/src/gpu_ib/backend_ib.cpp b/src/gpu_ib/backend_ib.cpp index 4692235738..7a757b67da 100644 --- a/src/gpu_ib/backend_ib.cpp +++ b/src/gpu_ib/backend_ib.cpp @@ -30,7 +30,7 @@ #include #include // NOLINT(build/c++11) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../backend_type.hpp" #include "../context_incl.hpp" #include "gpu_ib_team.hpp" @@ -47,10 +47,10 @@ namespace rocshmem { } \ } -extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT; +extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; -roc_shmem_team_t get_external_team(GPUIBTeam *team) { - return reinterpret_cast(team); +rocshmem_team_t get_external_team(GPUIBTeam *team) { + return reinterpret_cast(team); } int get_ls_non_zero_bit(char *bitmask, int mask_length) { @@ -68,7 +68,7 @@ int get_ls_non_zero_bit(char *bitmask, 
int mask_length) { } GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() { - if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) { + if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) { std::stringstream sstream(maximum_num_contexts_str); sstream >> maximum_num_contexts_; } @@ -96,7 +96,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() { setup_team_world(); - roc_shmem_collective_init(); + rocshmem_collective_init(); teams_init(); @@ -109,7 +109,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() { #ifdef USE_HOST_SIDE_HDP_FLUSH hdp_gpu_cpu_flush_flag_ = - static_cast(roc_shmem_malloc(sizeof(unsigned int))); + static_cast(rocshmem_malloc(sizeof(unsigned int))); hdp_policy->set_flush_polling_ptr(hdp_gpu_cpu_flush_flag_); hdp_flush_worker_thread = std::thread(&GPUIBBackend::hdp_flush_poll, this); @@ -125,7 +125,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() { } __device__ bool GPUIBBackend::create_ctx(int64_t options, - roc_shmem_ctx_t *ctx) { + rocshmem_ctx_t *ctx) { GPUIBContext *ctx_; auto pop_result = ctx_free_list.get()->pop_front(); @@ -154,7 +154,7 @@ void GPUIBBackend::ctx_destroy(Context *ctx) { delete gpu_ib_host_ctx; } -__device__ void GPUIBBackend::destroy_ctx(roc_shmem_ctx_t *ctx) { +__device__ void GPUIBBackend::destroy_ctx(rocshmem_ctx_t *ctx) { ctx_free_list.get()->push_back(static_cast(ctx->ctx_opaque)); } @@ -167,7 +167,7 @@ GPUIBBackend::~GPUIBBackend() { #ifdef USE_HOST_SIDE_HDP_FLUSH hdp_flush_worker_thread.join(); hdp_policy->set_flush_polling_ptr(nullptr); - roc_shmem_free(hdp_gpu_cpu_flush_flag_); + rocshmem_free(hdp_gpu_cpu_flush_flag_); #endif /** @@ -203,7 +203,7 @@ void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team, TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) { + rocshmem_team_t *new_team) { /** * Read the bit mask and find out a common index 
into * the pool of available work arrays. @@ -237,7 +237,7 @@ void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team, *new_team = get_external_team(new_team_obj); } -void GPUIBBackend::team_destroy(roc_shmem_team_t team) { +void GPUIBBackend::team_destroy(rocshmem_team_t team) { GPUIBTeam *team_obj = get_internal_gpu_ib_team(team); /* Mark the pool as available */ @@ -263,7 +263,7 @@ void GPUIBBackend::initialize_network() { networkImpl.networkHostSetup(this); } void GPUIBBackend::setup_default_host_ctx() { default_host_ctx_ = new GPUIBHostContext(this, 0); - ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_; + ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_; } void GPUIBBackend::setup_ctxs() { @@ -288,19 +288,19 @@ void GPUIBBackend::setup_default_ctx() { new (default_ctx_) GPUIBContext(this, true, 0); /* - * Set the ROC_SHMEM_CTX_DEFAULT in constant memory. + * Set the ROCSHMEM_CTX_DEFAULT in constant memory. */ int *symbol_address; CHECK_HIP(hipGetSymbolAddress(reinterpret_cast(&symbol_address), - HIP_SYMBOL(ROC_SHMEM_CTX_DEFAULT))); + HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT))); TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world; - roc_shmem_ctx_t ctx_default_host{default_ctx_, tinfo}; + rocshmem_ctx_t ctx_default_host{default_ctx_, tinfo}; hipStream_t stream; CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); CHECK_HIP(hipMemcpyAsync(symbol_address, &ctx_default_host, - sizeof(roc_shmem_ctx_t), hipMemcpyDefault, stream)); + sizeof(rocshmem_ctx_t), hipMemcpyDefault, stream)); CHECK_HIP(hipStreamSynchronize(stream)); CHECK_HIP(hipStreamDestroy(stream)); } @@ -328,9 +328,9 @@ void GPUIBBackend::setup_team_world() { team_tracker.set_team_world(team_world); /** - * Copy the address to ROC_SHMEM_TEAM_WORLD. + * Copy the address to ROCSHMEM_TEAM_WORLD. 
*/ - ROC_SHMEM_TEAM_WORLD = reinterpret_cast(team_world); + ROCSHMEM_TEAM_WORLD = reinterpret_cast(team_world); } void GPUIBBackend::init_mpi_once(MPI_Comm comm) { @@ -381,19 +381,19 @@ void GPUIBBackend::teams_init() { * Allocate pools for the teams sync and work arrary from the SHEAP. */ auto max_num_teams{team_tracker.get_max_num_teams()}; - barrier_pSync_pool = reinterpret_cast(roc_shmem_malloc( - sizeof(long) * ROC_SHMEM_BARRIER_SYNC_SIZE * max_num_teams)); - reduce_pSync_pool = reinterpret_cast(roc_shmem_malloc( - sizeof(long) * ROC_SHMEM_REDUCE_SYNC_SIZE * max_num_teams)); - bcast_pSync_pool = reinterpret_cast(roc_shmem_malloc( - sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE * max_num_teams)); - alltoall_pSync_pool = reinterpret_cast(roc_shmem_malloc( - sizeof(long) * ROC_SHMEM_ALLTOALL_SYNC_SIZE * max_num_teams)); + barrier_pSync_pool = reinterpret_cast(rocshmem_malloc( + sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams)); + reduce_pSync_pool = reinterpret_cast(rocshmem_malloc( + sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams)); + bcast_pSync_pool = reinterpret_cast(rocshmem_malloc( + sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams)); + alltoall_pSync_pool = reinterpret_cast(rocshmem_malloc( + sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE * max_num_teams)); /* Accommodating for largest possible data type for pWrk */ - pWrk_pool = roc_shmem_malloc( - sizeof(double) * ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams); - pAta_pool = roc_shmem_malloc(sizeof(double) * ROC_SHMEM_ATA_MAX_WRKDATA_SIZE * + pWrk_pool = rocshmem_malloc( + sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams); + pAta_pool = rocshmem_malloc(sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE * max_num_teams); /** @@ -402,25 +402,25 @@ void GPUIBBackend::teams_init() { long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync; for (int team_i = 0; team_i < max_num_teams; team_i++) { barrier_pSync = reinterpret_cast( - &barrier_pSync_pool[team_i * 
ROC_SHMEM_BARRIER_SYNC_SIZE]); + &barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]); reduce_pSync = reinterpret_cast( - &reduce_pSync_pool[team_i * ROC_SHMEM_REDUCE_SYNC_SIZE]); + &reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]); bcast_pSync = reinterpret_cast( - &bcast_pSync_pool[team_i * ROC_SHMEM_BCAST_SYNC_SIZE]); + &bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]); alltoall_pSync = reinterpret_cast( - &alltoall_pSync_pool[team_i * ROC_SHMEM_ALLTOALL_SYNC_SIZE]); + &alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]); - for (int i = 0; i < ROC_SHMEM_BARRIER_SYNC_SIZE; i++) { - barrier_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) { + barrier_pSync[i] = ROCSHMEM_SYNC_VALUE; } - for (int i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) { - reduce_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) { + reduce_pSync[i] = ROCSHMEM_SYNC_VALUE; } - for (int i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i++) { - bcast_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) { + bcast_pSync[i] = ROCSHMEM_SYNC_VALUE; } - for (int i = 0; i < ROC_SHMEM_ALLTOALL_SYNC_SIZE; i++) { - alltoall_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) { + alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE; } } @@ -457,30 +457,30 @@ void GPUIBBackend::teams_init() { } void GPUIBBackend::teams_destroy() { - roc_shmem_free(barrier_pSync_pool); - roc_shmem_free(reduce_pSync_pool); - roc_shmem_free(bcast_pSync_pool); - roc_shmem_free(alltoall_pSync_pool); - roc_shmem_free(pWrk_pool); - roc_shmem_free(pAta_pool); + rocshmem_free(barrier_pSync_pool); + rocshmem_free(reduce_pSync_pool); + rocshmem_free(bcast_pSync_pool); + rocshmem_free(alltoall_pSync_pool); + rocshmem_free(pWrk_pool); + rocshmem_free(pAta_pool); free(pool_bitmask_); free(reduced_bitmask_); } -void GPUIBBackend::roc_shmem_collective_init() { +void 
GPUIBBackend::rocshmem_collective_init() { /* * Allocate heap space for barrier_sync */ size_t one_sync_size_bytes{sizeof(*barrier_sync)}; - size_t sync_size_bytes{one_sync_size_bytes * ROC_SHMEM_BARRIER_SYNC_SIZE}; + size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE}; heap.malloc(reinterpret_cast(&barrier_sync), sync_size_bytes); /* * Initialize the barrier synchronization array with default values. */ for (int i = 0; i < num_pes; i++) { - barrier_sync[i] = ROC_SHMEM_SYNC_VALUE; + barrier_sync[i] = ROCSHMEM_SYNC_VALUE; } /* diff --git a/src/gpu_ib/backend_ib.hpp b/src/gpu_ib/backend_ib.hpp index 64c6a59e19..87b5dc85f1 100644 --- a/src/gpu_ib/backend_ib.hpp +++ b/src/gpu_ib/backend_ib.hpp @@ -72,19 +72,19 @@ class GPUIBBackend : public Backend { void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) override; + rocshmem_team_t *new_team) override; /** - * @copydoc Backend::team_destroy(roc_shmem_team_t) + * @copydoc Backend::team_destroy(rocshmem_team_t) */ - void team_destroy(roc_shmem_team_t team) override; + void team_destroy(rocshmem_team_t team) override; /** * @copydoc Backend::ctx_create */ void ctx_create(int64_t options, void **ctx) override; - __device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx); + __device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx); /** * @copydoc Backend::ctx_destroy @@ -94,7 +94,7 @@ class GPUIBBackend : public Backend { /** * @copydoc Backend::ctx_destroy */ - __device__ void destroy_ctx(roc_shmem_ctx_t *ctx); + __device__ void destroy_ctx(rocshmem_ctx_t *ctx); protected: /** @@ -151,10 +151,10 @@ class GPUIBBackend : public Backend { void initialize_ipc(); /** - * @brief Allocate and initialize the ROC_SHMEM_CTX_DEFAULT variable. + * @brief Allocate and initialize the ROCSHMEM_CTX_DEFAULT variable. 
* * @todo The default_ctx member looks unused after it is copied into - * the ROC_SHMEM_CTX_DEFAULT variable. + * the ROCSHMEM_CTX_DEFAULT variable. */ void setup_default_ctx(); void setup_ctxs(); @@ -187,7 +187,7 @@ class GPUIBBackend : public Backend { * When this method completes, the barrier_sync member will be available * for use. */ - void roc_shmem_collective_init(); + void rocshmem_collective_init(); #ifdef USE_HOST_SIDE_HDP_FLUSH /** @@ -245,8 +245,8 @@ class GPUIBBackend : public Backend { void *pAta_pool{nullptr}; /** - * @brief ROC_SHMEM's copy of MPI_COMM_WORLD (for interoperability - * with orthogonal MPI usage in an MPI+ROC_SHMEM program). + * @brief rocSHMEM's copy of MPI_COMM_WORLD (for interoperability + * with orthogonal MPI usage in an MPI+rocSHMEM program). */ MPI_Comm gpu_ib_comm_world{}; MPI_Comm backend_comm{}; @@ -334,7 +334,7 @@ class GPUIBBackend : public Backend { * specification). * * @todo Remove this member from the backend class. There is another - * copy stored in ROC_SHMEM_CTX_DEFAULT. + * copy stored in ROCSHMEM_CTX_DEFAULT. 
*/ GPUIBContext *default_ctx_{nullptr}; diff --git a/src/gpu_ib/connection.cpp b/src/gpu_ib/connection.cpp index b6b2ae4dcd..31f8eda4d3 100644 --- a/src/gpu_ib/connection.cpp +++ b/src/gpu_ib/connection.cpp @@ -39,19 +39,19 @@ int Connection::coherent_cq = 0; Connection::Connection(GPUIBBackend* b, int k) : backend(b), key_offset(k) { char* value = nullptr; - if ((value = getenv("ROC_SHMEM_USE_IB_HCA"))) { + if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) { requested_dev = value; } - if ((value = getenv("ROC_SHMEM_SQ_SIZE"))) { + if ((value = getenv("ROCSHMEM_SQ_SIZE"))) { sq_size = atoi(value); } - if ((value = getenv("ROC_SHMEM_USE_CQ_GPU_MEM")) != nullptr) { + if ((value = getenv("ROCSHMEM_USE_CQ_GPU_MEM")) != nullptr) { cq_use_gpu_mem = atoi(value); } - if ((value = getenv("ROC_SHMEM_USE_SQ_GPU_MEM")) != nullptr) { + if ((value = getenv("ROCSHMEM_USE_SQ_GPU_MEM")) != nullptr) { sq_use_gpu_mem = atoi(value); } } diff --git a/src/gpu_ib/connection.hpp b/src/gpu_ib/connection.hpp index ba29a685be..bdcf9eaeba 100644 --- a/src/gpu_ib/connection.hpp +++ b/src/gpu_ib/connection.hpp @@ -31,7 +31,7 @@ extern "C" { #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "connection_policy.hpp" namespace rocshmem { diff --git a/src/gpu_ib/context_ib_device.cpp b/src/gpu_ib/context_ib_device.cpp index 7bef0e7f4b..fed54fc6e8 100644 --- a/src/gpu_ib/context_ib_device.cpp +++ b/src/gpu_ib/context_ib_device.cpp @@ -25,7 +25,7 @@ #include #include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../backend_type.hpp" #include "../context_incl.hpp" #include "backend_ib.hpp" diff --git a/src/gpu_ib/context_ib_device.hpp b/src/gpu_ib/context_ib_device.hpp index 9e000cfb82..9748b1dac3 100644 --- a/src/gpu_ib/context_ib_device.hpp +++ b/src/gpu_ib/context_ib_device.hpp @@ -69,7 +69,7 @@ class GPUIBContext : public Context { __device__ void sync_all(); - __device__ void 
sync(roc_shmem_team_t team); + __device__ void sync(rocshmem_team_t team); template __device__ void amo_add(void *dst, T value, int pe); @@ -113,13 +113,13 @@ class GPUIBContext : public Context { template __device__ T g(const T *source, int pe); - template + template __device__ void to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); // NOLINT(runtime/int) - template - __device__ void to_all(roc_shmem_team_t team, T *dest, const T *source, + template + __device__ void to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce); template @@ -135,7 +135,7 @@ class GPUIBContext : public Context { __device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe); template - __device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source, + __device__ void broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root); template @@ -144,43 +144,43 @@ class GPUIBContext : public Context { long *p_sync); // NOLINT(runtime/int) template - __device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source, + __device__ void alltoall(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_broadcast(roc_shmem_team_t team, T *dest, + __device__ void alltoall_broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_brucks(roc_shmem_team_t team, T *dest, + __device__ void alltoall_brucks(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_gcen(roc_shmem_team_t team, T *dest, const T *source, + __device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_gcen2(roc_shmem_team_t team, T *dest, + __device__ void alltoall_gcen2(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source, + 
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_broadcast(roc_shmem_team_t team, T *dest, + __device__ void fcollect_broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_brucks(roc_shmem_team_t team, T *dest, + __device__ void fcollect_brucks(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_gcen(roc_shmem_team_t team, T *dest, const T *source, + __device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_gcen2(roc_shmem_team_t team, T *dest, + __device__ void fcollect_gcen2(rocshmem_team_t team, T *dest, const T *source, int nelems); __device__ void putmem_wg(void *dest, const void *source, size_t nelems, @@ -232,13 +232,13 @@ class GPUIBContext : public Context { __device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe); private: - template + template __device__ void internal_direct_allreduce( T *dst, const T *src, int nelems, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); // NOLINT(runtime/int) - template + template __device__ void internal_ring_allreduce(T *dst, const T *src, int nelems, int PE_start, int logPE_stride, int PE_size, T *pWrk, diff --git a/src/gpu_ib/context_ib_device_coll.cpp b/src/gpu_ib/context_ib_device_coll.cpp index bb3c35cc3e..61f1e97ca9 100644 --- a/src/gpu_ib/context_ib_device_coll.cpp +++ b/src/gpu_ib/context_ib_device_coll.cpp @@ -20,7 +20,7 @@ * IN THE SOFTWARE. 
*****************************************************************************/ -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../context_incl.hpp" #include "context_ib_tmpl_device.hpp" #include "../util.hpp" @@ -35,8 +35,8 @@ __device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start, // Go through all PE offsets (except current offset = 0) // and wait until they all reach for (size_t i = 1; i < n_pes; i++) { - wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val); - pSync[i] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val); + pSync[i] = ROCSHMEM_SYNC_VALUE; } threadfence_system(); // Announce to other PEs that all have reached @@ -48,8 +48,8 @@ __device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start, // Mark current PE offset as reached size_t pe_offset = (pe - PE_start) / stride; put_nbi(&pSync[pe_offset], &flag_val, 1, PE_start); - wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); } } @@ -59,16 +59,16 @@ __device__ void GPUIBContext::internal_atomic_barrier(int pe, int PE_start, int64_t *pSync) { int64_t flag_val = 1; if (pe == PE_start) { - wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(n_pes - 1)); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1)); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) { put_nbi(&pSync[0], &flag_val, 1, j); } } else { amo_add(&pSync[0], flag_val, PE_start); - wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); } } @@ -88,7 +88,7 @@ __device__ void GPUIBContext::internal_sync(int pe, int PE_start, int stride, __syncthreads(); } 
-__device__ void GPUIBContext::sync(roc_shmem_team_t team) { +__device__ void GPUIBContext::sync(rocshmem_team_t team) { GPUIBTeam *team_obj = reinterpret_cast(team); double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; diff --git a/src/gpu_ib/context_ib_host.hpp b/src/gpu_ib/context_ib_host.hpp index 2bf800a215..a38f18ce01 100644 --- a/src/gpu_ib/context_ib_host.hpp +++ b/src/gpu_ib/context_ib_host.hpp @@ -86,16 +86,16 @@ class GPUIBHostContext : public Context { long *p_sync); // NOLINT(runtime/int) template - __host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source, + __host__ void broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root); - template + template __host__ void to_all(T *dest, const T *source, int nreduce, int pe_start, int log_pe_stride, int pe_size, T *p_wrk, long *p_sync); // NOLINT(runtime/int) - template - __host__ void to_all(roc_shmem_team_t team, T *dest, const T *source, + template + __host__ void to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce); template diff --git a/src/gpu_ib/context_ib_tmpl_device.hpp b/src/gpu_ib/context_ib_tmpl_device.hpp index c575cb0832..44b1dd2291 100644 --- a/src/gpu_ib/context_ib_tmpl_device.hpp +++ b/src/gpu_ib/context_ib_tmpl_device.hpp @@ -24,16 +24,16 @@ #define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_ #include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "context_ib_device.hpp" #include "gpu_ib_team.hpp" #include "queue_pair.hpp" #include "../util.hpp" -#include "../roc_shmem_calc.hpp" +#include "../rocshmem_calc.hpp" namespace rocshmem { -template +template __device__ void compute_reduce(T *src, T *dst, int size, int wg_id, int wg_size) { for (size_t i = wg_id; i < size; i += wg_size) { @@ -47,7 +47,7 @@ __device__ void GPUIBContext::p(T *dest, T value, int pe) { putmem_nbi(dest, &value, sizeof(T), pe); } -template +template __device__ void 
GPUIBContext::internal_ring_allreduce( T *dst, const T *src, int nelems, [[maybe_unused]] int PE_start, [[maybe_unused]] int logPE_stride, [[maybe_unused]] int PE_size, T *pWrk, @@ -81,7 +81,7 @@ __device__ void GPUIBContext::internal_ring_allreduce( wait_val = seg + 100; p(&pSync[round], wait_val, send_pe); - wait_until(&pSync[round], ROC_SHMEM_CMP_EQ, wait_val); + wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val); __threadfence(); } __syncthreads(); @@ -99,19 +99,19 @@ __device__ void GPUIBContext::internal_ring_allreduce( fence(); wait_val = seg + 100; p(&pSync[round], wait_val, send_pe); - wait_until(&pSync[round], ROC_SHMEM_CMP_EQ, wait_val); + wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val); } __syncthreads(); } } __syncthreads(); for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + pSync[i] = ROCSHMEM_SYNC_VALUE; } __syncthreads(); } -template +template __device__ void GPUIBContext::internal_direct_allreduce( T *dst, const T *src, int nelems, int PE_start, int logPE_stride, int PE_size, T *pWrk, @@ -147,7 +147,7 @@ __device__ void GPUIBContext::internal_direct_allreduce( if (i != pe) { // Wait for leader thread to see that the buffer is ready. 
if (is_thread_zero_in_block()) { - wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, 1L); + wait_until(&pSync[i], ROCSHMEM_CMP_EQ, 1L); } __syncthreads(); @@ -159,14 +159,14 @@ __device__ void GPUIBContext::internal_direct_allreduce( __syncthreads(); for (int i = wg_id; i < num_pes; i += wg_size) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + pSync[i] = ROCSHMEM_SYNC_VALUE; } __syncthreads(); } -template -__device__ void GPUIBContext::to_all(roc_shmem_team_t team, T *dest, +template +__device__ void GPUIBContext::to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce) { GPUIBTeam *team_obj = reinterpret_cast(team); @@ -189,7 +189,7 @@ __device__ void GPUIBContext::to_all(roc_shmem_team_t team, T *dest, p_sync); } -template +template __device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, @@ -200,8 +200,8 @@ __device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce, size_t ring_pSync = 2 * num_pes; size_t provided_pWrk = - max(nreduce / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE); - size_t provided_pSync = ROC_SHMEM_REDUCE_SYNC_SIZE; + max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); + size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE; // TODO(bpotter): // We basically do a direct reduce if pWrk is big enough, else we @@ -212,12 +212,12 @@ __device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce, internal_direct_allreduce(dest, source, nreduce, PE_start, logPE_stride, PE_size, pWrk, pSync); } else { - if (ring_pSync <= ROC_SHMEM_REDUCE_SYNC_SIZE) { + if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) { int chunk_size = 1024; size_t ring_pWrk = chunk_size * num_pes; if (provided_pWrk < ring_pWrk) { ring_pWrk = max(nreduce / 2, // NOLINT - ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE); + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); chunk_size = ring_pWrk / num_pes; } int seg_size = ring_pWrk; @@ -434,7 +434,7 @@ __device__ void GPUIBContext::internal_get_broadcast( } template -__device__ 
void GPUIBContext::broadcast(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::broadcast(rocshmem_team_t team, T *dst, const T *src, int nelems, int pe_root) { GPUIBTeam *team_obj = reinterpret_cast(team); @@ -475,14 +475,14 @@ __device__ void GPUIBContext::broadcast(T *dst, const T *src, int nelems, } template -__device__ void GPUIBContext::alltoall(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::alltoall(rocshmem_team_t team, T *dst, const T *src, int nelems) { // Currently broadcast implementation performs the best alltoall_broadcast(team, dst, src, nelems); } template -__device__ void GPUIBContext::alltoall_broadcast(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::alltoall_broadcast(rocshmem_team_t team, T *dst, const T *src, int nelems) { // Broadcast implementation of alltoall collective GPUIBTeam *team_obj = reinterpret_cast(team); @@ -514,7 +514,7 @@ __device__ void GPUIBContext::alltoall_broadcast(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::alltoall_brucks(rocshmem_team_t team, T *dst, const T *src, int nelems) { // Brucks implementation of alltoall collective GPUIBTeam *team_obj = reinterpret_cast(team); @@ -537,7 +537,7 @@ __device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst, int blk_size = get_flat_block_size(); // Check if we have enough buffer space. If not, fail. 
- if (pe_size * nelems * 2 > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) { + if (pe_size * nelems * 2 > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n"); assert(false); } @@ -612,7 +612,7 @@ __device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::alltoall_gcen(rocshmem_team_t team, T *dst, const T *src, int nelems) { // GPU-centric implementation of alltoall collective GPUIBTeam *team_obj = reinterpret_cast(team); @@ -629,12 +629,12 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst, int stride = 1 << log_pe_stride; long *pSync = team_obj->alltoall_pSync; - int64_t *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE]; + int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE]; int my_pe_in_team = team_obj->my_pe; // Check if we have enough buffer space. If not, fail. T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) { + if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n"); assert(false); } @@ -677,29 +677,29 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst, if (dest_pe2 != my_pe) amo_add(&pSync[0], flag_val, dest_pe2); if (my_pe == dest_pe) { - wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1)); - pSync2[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1)); + pSync2[0] = ROCSHMEM_SYNC_VALUE; __threadfence_system(); for (int i = 1; i < clust_size; ++i) put_nbi(&pSync2[0], &flag_val, 1, team_obj->get_pe_in_world(my_pe_in_team + i)); } else { - wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val); - pSync2[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val); + pSync2[0] = ROCSHMEM_SYNC_VALUE; __threadfence_system(); } if (my_pe == dest_pe2) { - 
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(num_clust - 1)); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(num_clust - 1)); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); for (size_t i = 1, j = dest_pe2 + clust_size * stride; i < num_clust; ++i, j += clust_size * stride) { put_nbi(&pSync[0], &flag_val, 1, j); } } else { - wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); } } @@ -707,7 +707,7 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::alltoall_gcen2(rocshmem_team_t team, T *dst, const T *src, int nelems) { // GPU-centric implementation of alltoall collective // Uses in-place blocking sync @@ -725,12 +725,12 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst, int stride = 1 << log_pe_stride; long *pSync = team_obj->alltoall_pSync; - int64_t *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE]; + int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE]; int my_pe_in_team = team_obj->my_pe; // Check if we have enough buffer space. If not, fail. 
T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) { + if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n"); assert(false); } @@ -771,15 +771,15 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst, if (is_thread_zero_in_block()) { quiet(); if ((my_pe_in_team % clust_size) == 0) { - wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1)); - pSync2[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1)); + pSync2[0] = ROCSHMEM_SYNC_VALUE; __threadfence_system(); for (int i = 1; i < clust_size; ++i) put_nbi(&pSync2[0], &flag_val, 1, team_obj->get_pe_in_world(my_pe_in_team + i)); } else { - wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val); - pSync2[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val); + pSync2[0] = ROCSHMEM_SYNC_VALUE; __threadfence_system(); } } @@ -790,7 +790,7 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::fcollect(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::fcollect(rocshmem_team_t team, T *dst, const T *src, int nelems) { // Main function for fcollect // Broadcast version performs moderately well @@ -799,7 +799,7 @@ __device__ void GPUIBContext::fcollect(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::fcollect_broadcast(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::fcollect_broadcast(rocshmem_team_t team, T *dst, const T *src, int nelems) { // Broadcast implementation of fcollect collective GPUIBTeam *team_obj = reinterpret_cast(team); @@ -832,7 +832,7 @@ __device__ void GPUIBContext::fcollect_broadcast(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::fcollect_brucks(rocshmem_team_t team, T *dst, const T *src, int 
nelems) { // Brucks implementation of fcollect collective GPUIBTeam *team_obj = reinterpret_cast(team); @@ -855,7 +855,7 @@ __device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst, int blk_size = get_flat_block_size(); // Check if we have enough buffer space. If not, fail. - if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) { + if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n"); assert(false); } @@ -895,7 +895,7 @@ __device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::fcollect_gcen(rocshmem_team_t team, T *dst, const T *src, int nelems) { // GPU-centric implementation of fcollect collective GPUIBTeam *team_obj = reinterpret_cast(team); @@ -912,12 +912,12 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst, int stride = 1 << log_pe_stride; long *pSync = team_obj->alltoall_pSync; - long *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE]; + long *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE]; int my_pe_in_team = team_obj->my_pe; // Check if we have enough buffer space. If not, fail. 
T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) { + if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n"); assert(false); } @@ -957,15 +957,15 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst, if (is_thread_zero_in_block()) { quiet(); if ((my_pe_in_team % clust_size) == 0) { - wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1)); - pSync2[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1)); + pSync2[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); for (int i = 1; i < clust_size; ++i) put_nbi(&pSync2[0], &flag_val, 1, team_obj->get_pe_in_world(my_pe_in_team + i)); } else { - wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val); - pSync2[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val); + pSync2[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); } } @@ -976,7 +976,7 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst, } template -__device__ void GPUIBContext::fcollect_gcen2(roc_shmem_team_t team, T *dst, +__device__ void GPUIBContext::fcollect_gcen2(rocshmem_team_t team, T *dst, const T *src, int nelems) { // GPU-centric implementation of fcollect collective // Uses in-place blocking sync @@ -998,7 +998,7 @@ __device__ void GPUIBContext::fcollect_gcen2(roc_shmem_team_t team, T *dst, // Check if we have enough buffer space. If not, fail. 
T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) { + if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n"); assert(false); } diff --git a/src/gpu_ib/context_ib_tmpl_host.hpp b/src/gpu_ib/context_ib_tmpl_host.hpp index af1491b5f2..552a8c25a2 100644 --- a/src/gpu_ib/context_ib_tmpl_host.hpp +++ b/src/gpu_ib/context_ib_tmpl_host.hpp @@ -93,13 +93,13 @@ __host__ void GPUIBHostContext::broadcast( } template -__host__ void GPUIBHostContext::broadcast(roc_shmem_team_t team, T *dest, +__host__ void GPUIBHostContext::broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root) { host_interface->broadcast(team, dest, source, nelems, pe_root); } -template +template __host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce, int pe_start, int log_pe_stride, int pe_size, T *p_wrk, @@ -108,8 +108,8 @@ __host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce, pe_size, p_wrk, p_sync); } -template -__host__ void GPUIBHostContext::to_all(roc_shmem_team_t team, T *dest, +template +__host__ void GPUIBHostContext::to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce) { host_interface->to_all(team, dest, source, nreduce); } diff --git a/src/gpu_ib/debug.cpp b/src/gpu_ib/debug.cpp index 4045cbed38..227b75fd4e 100644 --- a/src/gpu_ib/debug.cpp +++ b/src/gpu_ib/debug.cpp @@ -20,7 +20,7 @@ * IN THE SOFTWARE. 
*****************************************************************************/ -#include "roc_shmem/debug.hpp" +#include "rocshmem/debug.hpp" #include "qe_dumper.hpp" diff --git a/src/gpu_ib/dynamic_connection.cpp b/src/gpu_ib/dynamic_connection.cpp index daa59216d8..7195601b24 100644 --- a/src/gpu_ib/dynamic_connection.cpp +++ b/src/gpu_ib/dynamic_connection.cpp @@ -31,11 +31,11 @@ namespace rocshmem { DynamicConnection::DynamicConnection(GPUIBBackend* b) : Connection(b, 4) { char* value = nullptr; - if ((value = getenv("ROC_SHMEM_NUM_DCIs"))) { + if ((value = getenv("ROCSHMEM_NUM_DCIs"))) { num_dcis = atoi(value); } - if ((value = getenv("ROC_SHMEM_NUM_DCT"))) { + if ((value = getenv("ROCSHMEM_NUM_DCT"))) { num_dct = atoi(value); } } diff --git a/src/gpu_ib/gpu_ib_team.cpp b/src/gpu_ib/gpu_ib_team.cpp index 3e3a9338c3..0aa04c8c83 100644 --- a/src/gpu_ib/gpu_ib_team.cpp +++ b/src/gpu_ib/gpu_ib_team.cpp @@ -38,17 +38,17 @@ GPUIBTeam::GPUIBTeam(Backend *backend, TeamInfo *team_info_parent, pool_index_ = pool_index; barrier_pSync = - &(b->barrier_pSync_pool[pool_index * ROC_SHMEM_BARRIER_SYNC_SIZE]); + &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]); reduce_pSync = - &(b->reduce_pSync_pool[pool_index * ROC_SHMEM_REDUCE_SYNC_SIZE]); - bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROC_SHMEM_BCAST_SYNC_SIZE]); + &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]); + bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]); alltoall_pSync = - &(b->alltoall_pSync_pool[pool_index * ROC_SHMEM_ALLTOALL_SYNC_SIZE]); + &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]); pWrk = reinterpret_cast(b->pWrk_pool) + - ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; pAta = reinterpret_cast(b->pAta_pool) + - ROC_SHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; + ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; } 
GPUIBTeam::~GPUIBTeam() {} diff --git a/src/gpu_ib/network_policy.cpp b/src/gpu_ib/network_policy.cpp index 0b51fdafc0..60ba379f49 100644 --- a/src/gpu_ib/network_policy.cpp +++ b/src/gpu_ib/network_policy.cpp @@ -297,7 +297,7 @@ void NetworkOnImpl::setup_gpu_qps(GPUIBBackend *B) { } } -void NetworkOnImpl::roc_shmem_g_init(SymmetricHeap *heap_handle, +void NetworkOnImpl::rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm) { init_g_ret(heap_handle, thread_comm, num_blocks, &g_ret); } @@ -327,7 +327,7 @@ __host__ void NetworkOnImpl::networkHostSetup(GPUIBBackend *B) { connection->initialize_gpu_policy(&connection_policy, heap_rkey); - roc_shmem_g_init(&B->heap, B->thread_comm); + rocshmem_g_init(&B->heap, B->thread_comm); connection->post_wqes(); diff --git a/src/gpu_ib/network_policy.hpp b/src/gpu_ib/network_policy.hpp index bdd3f0bc4d..6e1ceb9107 100644 --- a/src/gpu_ib/network_policy.hpp +++ b/src/gpu_ib/network_policy.hpp @@ -27,7 +27,7 @@ #include #include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "connection_policy.hpp" #include "queue_pair.hpp" #include "../hdp_policy.hpp" @@ -154,7 +154,7 @@ class NetworkOnImpl { * @brief Allocate and initialize device-side memory that will be used for * the return of g shmem ops (eg: shmem_int_g) */ - void roc_shmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm); + void rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm); /** * @brief The backend delegates some InfiniBand connection setup to diff --git a/src/gpu_ib/queue_pair.hpp b/src/gpu_ib/queue_pair.hpp index 4a0c60a806..649950c1dc 100644 --- a/src/gpu_ib/queue_pair.hpp +++ b/src/gpu_ib/queue_pair.hpp @@ -28,7 +28,7 @@ * * @section DESCRIPTION * An IB QueuePair (SQ and CQ) that the device can use to perform network - * operations. Most important ROC_SHMEM operations are performed by this + * operations. 
Most important rocSHMEM operations are performed by this * class. */ diff --git a/src/gpu_ib/thread_policy.hpp b/src/gpu_ib/thread_policy.hpp index 25ed3bcf06..6a7d5b6a1f 100644 --- a/src/gpu_ib/thread_policy.hpp +++ b/src/gpu_ib/thread_policy.hpp @@ -32,7 +32,7 @@ class QueuePair; /* * GPU single-thread policy class. Only a single work-item per work-group - * is allowed to call into a ROC_SHMEM function (unless it is specifically + * is allowed to call into a rocSHMEM function (unless it is specifically * called out as a collective API. This thread policy is the fastest but * is not as flexible. */ @@ -59,7 +59,7 @@ class SingleThreadImpl { /* * GPU multi-thread policy class. Multiple work-items per work-group are - * allowed to call into a ROC_SHMEM function. A bit slower than its + * allowed to call into a rocSHMEM function. A bit slower than its * single-thread counterpart but it enables a much more flexible user-facing * API. */ diff --git a/src/host/host.cpp b/src/host/host.cpp index 549f8d7f7e..3826fd92f9 100644 --- a/src/host/host.cpp +++ b/src/host/host.cpp @@ -83,13 +83,13 @@ int HostInterface::find_win_info_in_pool(WindowInfo* window_info) { } __host__ HostInterface::HostInterface(HdpPolicy* hdp_policy, - MPI_Comm roc_shmem_comm, + MPI_Comm rocshmem_comm, SymmetricHeap* heap) { /* * Duplicate a communicator from roc_shem's comm * world for the host interface */ - MPI_Comm_dup(roc_shmem_comm, &host_comm_world_); + MPI_Comm_dup(rocshmem_comm, &host_comm_world_); MPI_Comm_rank(host_comm_world_, &my_pe_); MPI_Comm_rank(host_comm_world_, &num_pes_); @@ -103,7 +103,7 @@ __host__ HostInterface::HostInterface(HdpPolicy* hdp_policy, * Allocate and initialize pool of windows for contexts */ char* value{nullptr}; - if ((value = getenv("ROC_SHMEM_MAX_NUM_HOST_CONTEXTS"))) { + if ((value = getenv("ROCSHMEM_MAX_NUM_HOST_CONTEXTS"))) { max_num_ctxs_ = atoi(value); } diff --git a/src/host/host.hpp b/src/host/host.hpp index 79d7a2eec5..dbdcec9297 100644 --- 
a/src/host/host.hpp +++ b/src/host/host.hpp @@ -36,7 +36,7 @@ #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../hdp_policy.hpp" #include "../memory/symmetric_heap.hpp" #include "../memory/window_info.hpp" @@ -104,7 +104,7 @@ class HostInterface { /** * @brief Primary constructor */ - __host__ HostInterface(HdpPolicy* hdp_policy, MPI_Comm roc_shmem_comm, + __host__ HostInterface(HdpPolicy* hdp_policy, MPI_Comm rocshmem_comm, SymmetricHeap* heap); /** @@ -198,16 +198,16 @@ class HostInterface { long* p_sync); // NOLINT(runtime/int) template - __host__ void broadcast(roc_shmem_team_t team, T* dest, const T* source, + __host__ void broadcast(rocshmem_team_t team, T* dest, const T* source, int nelems, int pe_root); - template + template __host__ void to_all(T* dest, const T* source, int nreduce, int pe_start, int log_pe_stride, int pe_size, T* p_wrk, long* p_sync); // NOLINT(runtime/int) - template - __host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce); + template + __host__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce); template __host__ void wait_until(T *ivars, int cmp, T val, @@ -288,7 +288,7 @@ class HostInterface { __host__ MPI_Comm get_mpi_comm(int pe_start, int log_pe_stride, int pe_size); - __host__ MPI_Op get_mpi_op(ROC_SHMEM_OP Op); + __host__ MPI_Op get_mpi_op(ROCSHMEM_OP Op); template __host__ MPI_Datatype get_mpi_type(); @@ -300,7 +300,7 @@ class HostInterface { __host__ int test_and_compare(MPI_Aint offset, MPI_Datatype mpi_type, int cmp, T val, MPI_Win win); - template + template __host__ void to_all_internal(MPI_Comm mpi_comm, T* dest, const T* source, int nreduce); diff --git a/src/host/host_templates.hpp b/src/host/host_templates.hpp index a7cce76ea9..522d1398b1 100644 --- a/src/host/host_templates.hpp +++ b/src/host/host_templates.hpp @@ -200,7 +200,7 @@ __host__ void HostInterface::broadcast(T* dest, const T* source, int nelems, } template -__host__ void 
HostInterface::broadcast(roc_shmem_team_t team, T* dest, +__host__ void HostInterface::broadcast(rocshmem_team_t team, T* dest, const T* source, int nelems, int pe_root) { DPRINTF("Function: Team-based host_broadcast\n"); @@ -216,24 +216,24 @@ __host__ void HostInterface::broadcast(roc_shmem_team_t team, T* dest, return; } -__host__ inline MPI_Op HostInterface::get_mpi_op(ROC_SHMEM_OP Op) { +__host__ inline MPI_Op HostInterface::get_mpi_op(ROCSHMEM_OP Op) { switch (Op) { - case ROC_SHMEM_SUM: + case ROCSHMEM_SUM: return MPI_SUM; - case ROC_SHMEM_MAX: + case ROCSHMEM_MAX: return MPI_MAX; - case ROC_SHMEM_MIN: + case ROCSHMEM_MIN: return MPI_MIN; - case ROC_SHMEM_PROD: + case ROCSHMEM_PROD: return MPI_PROD; - case ROC_SHMEM_AND: + case ROCSHMEM_AND: return MPI_BAND; - case ROC_SHMEM_OR: + case ROCSHMEM_OR: return MPI_BOR; - case ROC_SHMEM_XOR: + case ROCSHMEM_XOR: return MPI_BXOR; default: - fprintf(stderr, "Unknown ROC_SHMEM op MPI conversion %d\n", Op); + fprintf(stderr, "Unknown rocSHMEM op MPI conversion %d\n", Op); abort(); return 0; } @@ -330,7 +330,7 @@ __host__ T HostInterface::amo_fetch_cas(void* dst, T value, T cond, int pe, return ret; } -template +template __host__ void HostInterface::to_all_internal(MPI_Comm mpi_comm, T* dest, const T* source, int nreduce) { DPRINTF("Function: host_to_all_internal\n"); @@ -356,7 +356,7 @@ __host__ void HostInterface::to_all_internal(MPI_Comm mpi_comm, T* dest, return; } -template +template __host__ void HostInterface::to_all(T* dest, const T* source, int nreduce, int pe_start, int log_pe_stride, int pe_size, [[maybe_unused]] T* p_wrk, @@ -375,8 +375,8 @@ __host__ void HostInterface::to_all(T* dest, const T* source, int nreduce, return; } -template -__host__ int HostInterface::reduce(roc_shmem_team_t team, T* dest, +template +__host__ int HostInterface::reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce) { DPRINTF("Function: Team-based host_reduce\n"); @@ -388,7 +388,7 @@ __host__ int 
HostInterface::reduce(roc_shmem_team_t team, T* dest, to_all_internal(mpi_comm, dest, source, nreduce); - return ROC_SHMEM_SUCCESS; + return ROCSHMEM_SUCCESS; } template @@ -397,26 +397,26 @@ __host__ inline int HostInterface::compare(int cmp, T input_val, int cond_satisfied{0}; switch (cmp) { - case ROC_SHMEM_CMP_EQ: + case ROCSHMEM_CMP_EQ: cond_satisfied = (input_val == target_val) ? 1 : 0; break; - case ROC_SHMEM_CMP_NE: + case ROCSHMEM_CMP_NE: cond_satisfied = (input_val != target_val) ? 1 : 0; break; - case ROC_SHMEM_CMP_GT: + case ROCSHMEM_CMP_GT: cond_satisfied = (input_val > target_val) ? 1 : 0; break; - case ROC_SHMEM_CMP_GE: + case ROCSHMEM_CMP_GE: cond_satisfied = (input_val >= target_val) ? 1 : 0; break; - case ROC_SHMEM_CMP_LT: + case ROCSHMEM_CMP_LT: cond_satisfied = (input_val < target_val) ? 1 : 0; break; - case ROC_SHMEM_CMP_LE: + case ROCSHMEM_CMP_LE: cond_satisfied = (input_val <= target_val) ? 1 : 0; break; default: - assert(cmp >= ROC_SHMEM_CMP_EQ && cmp <= ROC_SHMEM_CMP_LE); + assert(cmp >= ROCSHMEM_CMP_EQ && cmp <= ROCSHMEM_CMP_LE); break; } diff --git a/src/ipc/backend_ipc.cpp b/src/ipc/backend_ipc.cpp index 4b5bd6ef08..5a89335746 100644 --- a/src/ipc/backend_ipc.cpp +++ b/src/ipc/backend_ipc.cpp @@ -33,10 +33,10 @@ namespace rocshmem { } \ } -extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT; +extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; -roc_shmem_team_t get_external_team(GPUIBTeam *team) { - return reinterpret_cast(team); +rocshmem_team_t get_external_team(GPUIBTeam *team) { + return reinterpret_cast(team); } int get_ls_non_zero_bit(char *bitmask, int mask_length) { @@ -57,7 +57,7 @@ IPCBackend::IPCBackend(MPI_Comm comm) : Backend() { type = BackendType::IPC_BACKEND; - if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) { + if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) { std::stringstream sstream(maximum_num_contexts_str); sstream >> maximum_num_contexts_; } @@ -82,7 +82,7 @@ 
IPCBackend::IPCBackend(MPI_Comm comm) default_host_ctx = std::make_unique(this, 0); - ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get(); + ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get(); init_g_ret(&heap, thread_comm, MAX_NUM_BLOCKS, &bp->g_ret); @@ -92,7 +92,7 @@ IPCBackend::IPCBackend(MPI_Comm comm) init_wrk_sync_buffer(); - roc_shmem_collective_init(); + rocshmem_collective_init(); setup_fence_buffer(); @@ -143,7 +143,7 @@ void IPCBackend::setup_ctxs() { } } -__device__ bool IPCBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) { +__device__ bool IPCBackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) { IPCContext *ctx_{nullptr}; auto pop_result = ctx_free_list.get()->pop_front(); @@ -158,7 +158,7 @@ __device__ bool IPCBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) { return true; } -__device__ void IPCBackend::destroy_ctx(roc_shmem_ctx_t *ctx) { +__device__ void IPCBackend::destroy_ctx(rocshmem_ctx_t *ctx) { ctx_free_list.get()->push_back(static_cast(ctx->ctx_opaque)); } @@ -182,9 +182,9 @@ void IPCBackend::setup_team_world() { team_tracker.set_team_world(team_world); /** - * Copy the address to ROC_SHMEM_TEAM_WORLD. + * Copy the address to ROCSHMEM_TEAM_WORLD. 
*/ - ROC_SHMEM_TEAM_WORLD = reinterpret_cast(team_world); + ROCSHMEM_TEAM_WORLD = reinterpret_cast(team_world); } void IPCBackend::init_mpi_once(MPI_Comm comm) { @@ -205,7 +205,7 @@ void IPCBackend::init_mpi_once(MPI_Comm comm) { NET_CHECK(MPI_Comm_rank(thread_comm, &my_pe)); } -void IPCBackend::team_destroy(roc_shmem_team_t team) { +void IPCBackend::team_destroy(rocshmem_team_t team) { IPCTeam *team_obj = get_internal_ipc_team(team); /* Mark the pool as available */ @@ -221,7 +221,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team, TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) { + rocshmem_team_t *new_team) { /** * Read the bit mask and find out a common index into * the pool of available work arrays. @@ -303,24 +303,24 @@ void IPCBackend::init_wrk_sync_buffer() { /** * size of barrier sync */ - Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROC_SHMEM_BARRIER_SYNC_SIZE; + Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE; /** * Size of sync arrays for the teams */ Wrk_Sync_buffer_size_ += sizeof(long) * max_num_teams * - (ROC_SHMEM_BARRIER_SYNC_SIZE + - ROC_SHMEM_REDUCE_SYNC_SIZE + - ROC_SHMEM_BCAST_SYNC_SIZE + - ROC_SHMEM_ALLTOALL_SYNC_SIZE); + (ROCSHMEM_BARRIER_SYNC_SIZE + + ROCSHMEM_REDUCE_SYNC_SIZE + + ROCSHMEM_BCAST_SYNC_SIZE + + ROCSHMEM_ALLTOALL_SYNC_SIZE); /** * Size of work arrays for the teams * Accommodate largest possible data type for pWrk */ Wrk_Sync_buffer_size_ += sizeof(double) * max_num_teams * - (ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE + - ROC_SHMEM_ATA_MAX_WRKDATA_SIZE); + (ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE + + ROCSHMEM_ATA_MAX_WRKDATA_SIZE); /** * Size of fence array @@ -397,12 +397,12 @@ void IPCBackend::setup_fence_buffer() { temp_Wrk_Sync_buff_ptr_ += sizeof(int) * num_pes; } -void IPCBackend::roc_shmem_collective_init() { +void IPCBackend::rocshmem_collective_init() { /* * Allocate heap space for 
barrier_sync */ size_t one_sync_size_bytes{sizeof(*barrier_sync)}; - size_t sync_size_bytes{one_sync_size_bytes * ROC_SHMEM_BARRIER_SYNC_SIZE}; + size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE}; barrier_sync = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); temp_Wrk_Sync_buff_ptr_ += sync_size_bytes; @@ -410,7 +410,7 @@ void IPCBackend::roc_shmem_collective_init() { * Initialize the barrier synchronization array with default values. */ for (int i = 0; i < num_pes; i++) { - barrier_sync[i] = ROC_SHMEM_SYNC_VALUE; + barrier_sync[i] = ROCSHMEM_SYNC_VALUE; } /* @@ -427,29 +427,29 @@ void IPCBackend::teams_init() { auto max_num_teams{team_tracker.get_max_num_teams()}; barrier_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BARRIER_SYNC_SIZE + temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams; reduce_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_REDUCE_SYNC_SIZE + temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams; bcast_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE + temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams; alltoall_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE + temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams; /* Accommodating for largest possible data type for pWrk */ pWrk_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE + temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams; pAta_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROC_SHMEM_ATA_MAX_WRKDATA_SIZE 
+ temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE * max_num_teams; /** @@ -458,25 +458,25 @@ void IPCBackend::teams_init() { long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync; for (int team_i = 0; team_i < max_num_teams; team_i++) { barrier_pSync = reinterpret_cast( - &barrier_pSync_pool[team_i * ROC_SHMEM_BARRIER_SYNC_SIZE]); + &barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]); reduce_pSync = reinterpret_cast( - &reduce_pSync_pool[team_i * ROC_SHMEM_REDUCE_SYNC_SIZE]); + &reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]); bcast_pSync = reinterpret_cast( - &bcast_pSync_pool[team_i * ROC_SHMEM_BCAST_SYNC_SIZE]); + &bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]); alltoall_pSync = reinterpret_cast( - &alltoall_pSync_pool[team_i * ROC_SHMEM_ALLTOALL_SYNC_SIZE]); + &alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]); - for (int i = 0; i < ROC_SHMEM_BARRIER_SYNC_SIZE; i++) { - barrier_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) { + barrier_pSync[i] = ROCSHMEM_SYNC_VALUE; } - for (int i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) { - reduce_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) { + reduce_pSync[i] = ROCSHMEM_SYNC_VALUE; } - for (int i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i++) { - bcast_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) { + bcast_pSync[i] = ROCSHMEM_SYNC_VALUE; } - for (int i = 0; i < ROC_SHMEM_ALLTOALL_SYNC_SIZE; i++) { - alltoall_pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) { + alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE; } } diff --git a/src/ipc/backend_ipc.hpp b/src/ipc/backend_ipc.hpp index f029a32fbb..9d80f030c7 100644 --- a/src/ipc/backend_ipc.hpp +++ b/src/ipc/backend_ipc.hpp @@ -48,13 +48,13 @@ class IPCBackend : public Backend { */ virtual ~IPCBackend(); - __device__ bool create_ctx(int64_t options, roc_shmem_ctx_t 
*ctx); + __device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx); /** - * @brief Destroy a `roc_shmem_ctx_t` context and returns it back to the + * @brief Destroy a `rocshmem_ctx_t` context and returns it back to the * context free list. */ - __device__ void destroy_ctx(roc_shmem_ctx_t *ctx); + __device__ void destroy_ctx(rocshmem_ctx_t *ctx); /** * @copydoc Backend::ctx_create @@ -103,12 +103,12 @@ class IPCBackend : public Backend { void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) override; + rocshmem_team_t *new_team) override; /** - * @copydoc Backend::team_destroy(roc_shmem_team_t) + * @copydoc Backend::team_destroy(rocshmem_team_t) */ - void team_destroy(roc_shmem_team_t team) override; + void team_destroy(rocshmem_team_t team) override; /** * @brief Accessor for work/sync bases @@ -213,7 +213,7 @@ class IPCBackend : public Backend { * When this method completes, the barrier_sync member will be available * for use. 
*/ - void roc_shmem_collective_init(); + void rocshmem_collective_init(); /** * @brief Allocate buffer for fence/quiet operation diff --git a/src/ipc/context_ipc_device.cpp b/src/ipc/context_ipc_device.cpp index 47d45565ef..b1a97542b1 100644 --- a/src/ipc/context_ipc_device.cpp +++ b/src/ipc/context_ipc_device.cpp @@ -31,7 +31,7 @@ #include #include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "backend_ipc.hpp" namespace rocshmem { diff --git a/src/ipc/context_ipc_device.hpp b/src/ipc/context_ipc_device.hpp index 03b72efe89..87d21361fe 100644 --- a/src/ipc/context_ipc_device.hpp +++ b/src/ipc/context_ipc_device.hpp @@ -63,7 +63,7 @@ class IPCContext : public Context { __device__ void sync_all(); - __device__ void sync(roc_shmem_team_t team); + __device__ void sync(rocshmem_team_t team); template __device__ void p(T *dest, T value, int pe); @@ -121,18 +121,18 @@ class IPCContext : public Context { __device__ T amo_fetch_cas(void *dst, T value, T cond, int pe); // Collectives - template - __device__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce); + template + __device__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce); template - __device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source, + __device__ void broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root); template - __device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source, + __device__ void alltoall(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source, + __device__ void fcollect(rocshmem_team_t team, T *dest, const T *source, int nelems); @@ -211,11 +211,11 @@ class IPCContext : public Context { int pe_root); // NOLINT(runtime/int) template - __device__ void fcollect_linear(roc_shmem_team_t team, T *dest, + __device__ void 
fcollect_linear(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_linear(roc_shmem_team_t team, T *dest, + __device__ void alltoall_linear(rocshmem_team_t team, T *dest, const T *source, int nelems); __device__ void internal_sync(int pe, int PE_start, int stride, int PE_size, @@ -227,10 +227,10 @@ class IPCContext : public Context { __device__ void internal_atomic_barrier(int pe, int PE_start, int stride, int n_pes, int64_t *pSync); - template + template __device__ void internal_direct_allreduce(T *dst, const T *src, int nelems, IPCTeam *team_obj); - template + template __device__ void internal_ring_allreduce(T *dst, const T *src, int nelems, IPCTeam *team_obj, int n_seg, int seg_size, int chunk_size); diff --git a/src/ipc/context_ipc_device_coll.cpp b/src/ipc/context_ipc_device_coll.cpp index 7a03233d89..13745e719d 100644 --- a/src/ipc/context_ipc_device_coll.cpp +++ b/src/ipc/context_ipc_device_coll.cpp @@ -20,7 +20,7 @@ * IN THE SOFTWARE. 
*****************************************************************************/ -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../context_incl.hpp" #include "context_ipc_tmpl_device.hpp" #include "../util.hpp" @@ -39,8 +39,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start, __threadfence_system(); #endif /* __gfx90a__ */ for (size_t i = 1; i < n_pes; i++) { - wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val); - pSync[i] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val); + pSync[i] = ROCSHMEM_SYNC_VALUE; } threadfence_system(); @@ -58,8 +58,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start, #if defined(__gfx90a__) __threadfence_system(); #endif /* __gfx90a__ */ - wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); } } @@ -69,8 +69,8 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start, int64_t *pSync) { int64_t flag_val = 1; if (pe == PE_start) { - wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(n_pes - 1)); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1)); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) { @@ -78,8 +78,8 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start, } } else { amo_add(&pSync[0], flag_val, PE_start); - wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val); - pSync[0] = ROC_SHMEM_SYNC_VALUE; + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); + pSync[0] = ROCSHMEM_SYNC_VALUE; threadfence_system(); } } @@ -98,7 +98,7 @@ __device__ void IPCContext::internal_sync(int pe, int PE_start, int stride, __syncthreads(); } -__device__ void IPCContext::sync(roc_shmem_team_t team) { +__device__ void IPCContext::sync(rocshmem_team_t team) 
{ IPCTeam *team_obj = reinterpret_cast(team); int pe = team_obj->my_pe_in_world; diff --git a/src/ipc/context_ipc_host.hpp b/src/ipc/context_ipc_host.hpp index 8e2c9f48d2..506fa52258 100644 --- a/src/ipc/context_ipc_host.hpp +++ b/src/ipc/context_ipc_host.hpp @@ -86,16 +86,16 @@ class IPCHostContext : public Context { long *p_sync); template - __host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source, + __host__ void broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root); - template + template __host__ void to_all(T *dest, const T *source, int nreduce, int pe_start, int log_pe_stride, int pe_size, T *p_wrk, long *p_sync); - template - __host__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce); + template + __host__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce); template __host__ void wait_until(T *ivars, int cmp, T val); diff --git a/src/ipc/context_ipc_tmpl_device.hpp b/src/ipc/context_ipc_tmpl_device.hpp index 4da49b6f7b..038002d9c5 100644 --- a/src/ipc/context_ipc_tmpl_device.hpp +++ b/src/ipc/context_ipc_tmpl_device.hpp @@ -24,11 +24,11 @@ #define LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_ #include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "context_ipc_device.hpp" #include "../util.hpp" #include "ipc_team.hpp" -#include "../roc_shmem_calc.hpp" +#include "../rocshmem_calc.hpp" namespace rocshmem { @@ -153,7 +153,7 @@ __device__ T IPCContext::amo_fetch_cas(void *dest, T value, T cond, int pe) { } // Collectives -template +template __device__ void compute_reduce(T *src, T *dst, int size, int wg_id, int wg_size) { for (size_t i = wg_id; i < size; i += wg_size) { @@ -162,7 +162,7 @@ __device__ void compute_reduce(T *src, T *dst, int size, int wg_id, __syncthreads(); } -template +template __device__ void IPCContext::internal_direct_allreduce( T *dst, const T *src, int nelems, IPCTeam *team_obj) { // 
NOLINT(runtime/int) @@ -203,7 +203,7 @@ __device__ void IPCContext::internal_direct_allreduce( if (i != pe) { // Wait for leader thread to see that the buffer is ready. if (is_thread_zero_in_block()) { - wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val); + wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val); } __syncthreads(); @@ -216,7 +216,7 @@ __device__ void IPCContext::internal_direct_allreduce( __syncthreads(); for (int i = wg_id; i < num_pes; i += wg_size) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + pSync[i] = ROCSHMEM_SYNC_VALUE; } threadfence_system(); __syncthreads(); @@ -278,7 +278,7 @@ __device__ void IPCContext::internal_direct_allreduce( * [02+12+22+32] [02+12+22+32] [02+12+22+32] [02+12+22+32] * [03+13+23+33] [03+13+23+33] [03+13+23+33] [03+13+23+33] */ -template +template __device__ void IPCContext::internal_ring_allreduce( T *dst, const T *src, int nelems, IPCTeam *team_obj, // NOLINT(runtime/int) int n_seg, int seg_size, int chunk_size) { @@ -323,7 +323,7 @@ __device__ void IPCContext::internal_ring_allreduce( #if defined(__gfx90a__) __threadfence_system(); #endif /* __gfx90a__ */ - wait_until(&pSync[iter], ROC_SHMEM_CMP_EQ, wait_val); + wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val); } __syncthreads(); compute_reduce(&pWrk[off_recv], &dst[off_seg + off_recv], @@ -344,7 +344,7 @@ __device__ void IPCContext::internal_ring_allreduce( #if defined(__gfx90a__) __threadfence_system(); #endif /* __gfx90a__ */ - wait_until(&pSync[iter], ROC_SHMEM_CMP_EQ, wait_val); + wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val); } __syncthreads(); } @@ -352,13 +352,13 @@ __device__ void IPCContext::internal_ring_allreduce( __syncthreads(); for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + pSync[i] = ROCSHMEM_SYNC_VALUE; } __syncthreads(); } -template -__device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest, +template +__device__ int IPCContext::reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce) { 
IPCTeam *team_obj = reinterpret_cast(team); @@ -367,14 +367,14 @@ __device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest, size_t direct_pWrk = PE_size * nreduce; size_t direct_pSync = PE_size; size_t ring_pSync = 2 * PE_size; - size_t provided_pWrk = max(nreduce / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE); - size_t provided_pSync = ROC_SHMEM_REDUCE_SYNC_SIZE; + size_t provided_pWrk = max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); + size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE; if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) { internal_direct_allreduce(dest, source, nreduce, team_obj); } else { - if (ring_pSync <= ROC_SHMEM_REDUCE_SYNC_SIZE) { - size_t ring_pWrk = ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE; + if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) { + size_t ring_pWrk = ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE; // integer division truncating value int chunk_size = ring_pWrk / PE_size; int seg_size = chunk_size * PE_size; @@ -410,10 +410,10 @@ __device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest, } } else { GPU_DPRINTF("Unsupported reduction size for IPC conduit.\n"); - return ROC_SHMEM_ERROR; + return ROCSHMEM_ERROR; } } - return ROC_SHMEM_SUCCESS; + return ROCSHMEM_SUCCESS; } template @@ -439,7 +439,7 @@ __device__ void IPCContext::internal_get_broadcast( } template -__device__ void IPCContext::broadcast(roc_shmem_team_t team, T *dst, +__device__ void IPCContext::broadcast(rocshmem_team_t team, T *dst, const T *src, int nelems, int pe_root) { IPCTeam *team_obj = reinterpret_cast(team); @@ -471,13 +471,13 @@ __device__ void IPCContext::internal_broadcast(T *dst, const T *src, int nelems, } template -__device__ void IPCContext::alltoall(roc_shmem_team_t team, T *dst, +__device__ void IPCContext::alltoall(rocshmem_team_t team, T *dst, const T *src, int nelems) { alltoall_linear(team, dst, src, nelems); } template -__device__ void IPCContext::alltoall_linear(roc_shmem_team_t team, T *dst, +__device__ void 
IPCContext::alltoall_linear(rocshmem_team_t team, T *dst, const T *src, int nelems) { IPCTeam *team_obj = reinterpret_cast(team); @@ -500,13 +500,13 @@ __device__ void IPCContext::alltoall_linear(roc_shmem_team_t team, T *dst, } template -__device__ void IPCContext::fcollect(roc_shmem_team_t team, T *dst, +__device__ void IPCContext::fcollect(rocshmem_team_t team, T *dst, const T *src, int nelems) { fcollect_linear(team, dst, src, nelems); } template -__device__ void IPCContext::fcollect_linear(roc_shmem_team_t team, T *dst, +__device__ void IPCContext::fcollect_linear(rocshmem_team_t team, T *dst, const T *src, int nelems) { IPCTeam *team_obj = reinterpret_cast(team); diff --git a/src/ipc/context_ipc_tmpl_host.hpp b/src/ipc/context_ipc_tmpl_host.hpp index e559ca7068..9e79849631 100644 --- a/src/ipc/context_ipc_tmpl_host.hpp +++ b/src/ipc/context_ipc_tmpl_host.hpp @@ -93,13 +93,13 @@ __host__ void IPCHostContext::broadcast( } template -__host__ void IPCHostContext::broadcast(roc_shmem_team_t team, T *dest, +__host__ void IPCHostContext::broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root) { host_interface->broadcast(team, dest, source, nelems, pe_root); } -template +template __host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce, int pe_start, int log_pe_stride, int pe_size, T *p_wrk, @@ -108,8 +108,8 @@ __host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce, pe_size, p_wrk, p_sync); } -template -__host__ int IPCHostContext::reduce(roc_shmem_team_t team, T *dest, +template +__host__ int IPCHostContext::reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce) { return host_interface->reduce(team, dest, source, nreduce); } diff --git a/src/ipc/ipc_context_proxy.hpp b/src/ipc/ipc_context_proxy.hpp index 29261b62c6..7370761f5b 100644 --- a/src/ipc/ipc_context_proxy.hpp +++ b/src/ipc/ipc_context_proxy.hpp @@ -46,7 +46,7 @@ class IPCDefaultContextProxy { auto ctx{proxy_.get()}; new (ctx) 
IPCContext(reinterpret_cast(backend)); ctx->tinfo = tinfo; - roc_shmem_ctx_t local{ctx, tinfo}; + rocshmem_ctx_t local{ctx, tinfo}; set_internal_ctx(&local); } @@ -89,4 +89,4 @@ using IPCDefaultContextProxyT = IPCDefaultContextProxy; } // namespace rocshmem -#endif // LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_ \ No newline at end of file +#endif // LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_ diff --git a/src/ipc/ipc_team.cpp b/src/ipc/ipc_team.cpp index 433ca32d4f..04bae3578e 100644 --- a/src/ipc/ipc_team.cpp +++ b/src/ipc/ipc_team.cpp @@ -38,17 +38,17 @@ IPCTeam::IPCTeam(Backend *backend, TeamInfo *team_info_parent, pool_index_ = pool_index; barrier_pSync = - &(b->barrier_pSync_pool[pool_index * ROC_SHMEM_BARRIER_SYNC_SIZE]); + &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]); reduce_pSync = - &(b->reduce_pSync_pool[pool_index * ROC_SHMEM_REDUCE_SYNC_SIZE]); - bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROC_SHMEM_BCAST_SYNC_SIZE]); + &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]); + bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]); alltoall_pSync = - &(b->alltoall_pSync_pool[pool_index * ROC_SHMEM_ALLTOALL_SYNC_SIZE]); + &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]); pWrk = reinterpret_cast(b->pWrk_pool) + - ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; pAta = reinterpret_cast(b->pAta_pool) + - ROC_SHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; + ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; } IPCTeam::~IPCTeam() {} diff --git a/src/ipc_policy.hpp b/src/ipc_policy.hpp index 1b84c52613..74052bbe54 100644 --- a/src/ipc_policy.hpp +++ b/src/ipc_policy.hpp @@ -123,7 +123,7 @@ class IpcOffImpl { __device__ bool isIpcAvailable(int my_pe, int target_pe) { return false; } - __device__ void ipcGpuInit(Backend *roc_shmem_handle, Context *ctx, + __device__ void ipcGpuInit(Backend 
*rocshmem_handle, Context *ctx, int thread_id) {} __device__ void ipcCopy(void *dst, void *src, size_t size) {} diff --git a/src/memory/single_heap.cpp b/src/memory/single_heap.cpp index 1e95711401..eeea75db02 100644 --- a/src/memory/single_heap.cpp +++ b/src/memory/single_heap.cpp @@ -27,7 +27,7 @@ namespace rocshmem { SingleHeap::SingleHeap() { - if (auto heap_size_cstr = getenv("ROC_SHMEM_HEAP_SIZE")) { + if (auto heap_size_cstr = getenv("ROCSHMEM_HEAP_SIZE")) { std::stringstream sstream(heap_size_cstr); size_t heap_size; sstream >> heap_size; diff --git a/src/memory/slab_heap.cpp b/src/memory/slab_heap.cpp index faf3a84b1b..c126111770 100644 --- a/src/memory/slab_heap.cpp +++ b/src/memory/slab_heap.cpp @@ -29,7 +29,7 @@ namespace rocshmem { SlabHeap::SlabHeap() { - if (auto slab_size_cstr = getenv("ROC_SHMEM_SLAB_SIZE")) { + if (auto slab_size_cstr = getenv("ROCSHMEM_SLAB_SIZE")) { std::stringstream sstream(slab_size_cstr); size_t slab_size; sstream >> slab_size; diff --git a/src/mpi_init_singleton.hpp b/src/mpi_init_singleton.hpp index 2bc16cc76b..b7cc26c8fd 100644 --- a/src/mpi_init_singleton.hpp +++ b/src/mpi_init_singleton.hpp @@ -81,7 +81,7 @@ class MPIInitSingleton { int nprocs_{-1}; /** - * @brief Was MPI initialized before ROCSHMEM_init call + * @brief Was MPI initialized before rocshmem_init call */ int pre_init_done{0}; diff --git a/src/reverse_offload/backend_ro.cpp b/src/reverse_offload/backend_ro.cpp index 96471181e3..f7ce31c72d 100644 --- a/src/reverse_offload/backend_ro.cpp +++ b/src/reverse_offload/backend_ro.cpp @@ -32,7 +32,7 @@ #include #include // NOLINT -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../atomic_return.hpp" #include "../backend_type.hpp" #include "../context_incl.hpp" @@ -42,13 +42,13 @@ namespace rocshmem { -extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT; +extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; ROBackend::ROBackend(MPI_Comm comm) : profiler_proxy_(MAX_NUM_BLOCKS), Backend() { type 
= BackendType::RO_BACKEND; - if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) { + if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) { std::stringstream sstream(maximum_num_contexts_str); sstream >> maximum_num_contexts_; } @@ -83,14 +83,14 @@ ROBackend::ROBackend(MPI_Comm comm) default_host_ctx = std::make_unique(this, 0); - ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get(); + ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get(); team_world_proxy_ = new ROTeamProxy( this, transport_->get_world_comm(), my_pe, num_pes); team_tracker.set_team_world(team_world_proxy_->get()); - ROC_SHMEM_TEAM_WORLD = - reinterpret_cast(team_world_proxy_->get()); + ROCSHMEM_TEAM_WORLD = + reinterpret_cast(team_world_proxy_->get()); default_block_handle_proxy_ = DefaultBlockHandleProxyT( bp->g_ret, bp->atomic_ret, &queue_, &ipcImpl, hdp_proxy_.get()); @@ -120,7 +120,7 @@ ROBackend::~ROBackend() { CHECK_HIP(hipFree(ctx_array)); } -__device__ bool ROBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) { +__device__ bool ROBackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) { ROContext *ctx_; auto pop_result = ctx_free_list.get()->pop_front(); @@ -133,11 +133,11 @@ __device__ bool ROBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) { return true; } -__device__ void ROBackend::destroy_ctx(roc_shmem_ctx_t *ctx) { +__device__ void ROBackend::destroy_ctx(rocshmem_ctx_t *ctx) { ctx_free_list.get()->push_back(static_cast(ctx->ctx_opaque)); } -void ROBackend::team_destroy(roc_shmem_team_t team) { +void ROBackend::team_destroy(rocshmem_team_t team) { ROTeam *team_obj{get_internal_ro_team(team)}; team_obj->~ROTeam(); @@ -148,7 +148,7 @@ void ROBackend::create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) { + rocshmem_team_t *new_team) { transport_->createNewTeam(this, parent_team, 
team_info_wrt_parent, team_info_wrt_world, num_pes, my_pe_in_new_team, team_comm, new_team); diff --git a/src/reverse_offload/backend_ro.hpp b/src/reverse_offload/backend_ro.hpp index 72c79d0946..4f41a4f7d3 100644 --- a/src/reverse_offload/backend_ro.hpp +++ b/src/reverse_offload/backend_ro.hpp @@ -85,20 +85,20 @@ class ROBackend : public Backend { void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) override; + rocshmem_team_t *new_team) override; /** - * @copydoc Backend::team_destroy(roc_shmem_team_t) + * @copydoc Backend::team_destroy(rocshmem_team_t) */ - void team_destroy(roc_shmem_team_t team) override; + void team_destroy(rocshmem_team_t team) override; - __device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx); + __device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx); /** - * @brief Destroy a `roc_shmem_ctx_t` context and returns it back to the + * @brief Destroy a `rocshmem_ctx_t` context and returns it back to the * context free list. 
*/ - __device__ void destroy_ctx(roc_shmem_ctx_t *ctx); + __device__ void destroy_ctx(rocshmem_ctx_t *ctx); /** * @copydoc Backend::ctx_create diff --git a/src/reverse_offload/context_proxy.hpp b/src/reverse_offload/context_proxy.hpp index 4ae94d3851..a5b589a3b4 100644 --- a/src/reverse_offload/context_proxy.hpp +++ b/src/reverse_offload/context_proxy.hpp @@ -23,7 +23,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_ -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../device_proxy.hpp" #include "../memory/hip_allocator.hpp" #include "context_ro_device.hpp" @@ -46,7 +46,7 @@ class DefaultContextProxy { : constructed_{true} { auto ctx{proxy_.get()}; new (ctx) ROContext(reinterpret_cast(backend), -1); - roc_shmem_ctx_t local{ctx, tinfo}; + rocshmem_ctx_t local{ctx, tinfo}; set_internal_ctx(&local); } diff --git a/src/reverse_offload/context_ro_device.cpp b/src/reverse_offload/context_ro_device.cpp index 86e12c7c38..43ce155cac 100644 --- a/src/reverse_offload/context_ro_device.cpp +++ b/src/reverse_offload/context_ro_device.cpp @@ -30,7 +30,7 @@ #include #include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "../backend_type.hpp" #include "../hdp_policy.hpp" #include "backend_proxy.hpp" @@ -176,7 +176,7 @@ __device__ void ROContext::sync_all() { __syncthreads(); } -__device__ void ROContext::sync(roc_shmem_team_t team) { +__device__ void ROContext::sync(rocshmem_team_t team) { ROTeam *team_obj = reinterpret_cast(team); if (is_thread_zero_in_block()) { build_queue_element(RO_NET_SYNC, nullptr, nullptr, 0, 0, 0, 0, 0, nullptr, @@ -472,7 +472,7 @@ __device__ void build_queue_element( ro_net_cmds type, void *dst, void *src, size_t size, int pe, int logPE_stride, int PE_size, int PE_root, void *pWrk, long *pSync, MPI_Comm team_comm, int ro_net_win_id, BlockHandle *handle, - bool blocking, ROC_SHMEM_OP op, 
ro_net_types datatype) { + bool blocking, ROCSHMEM_OP op, ro_net_types datatype) { auto write_slot{next_write_slot(handle)}; auto queue_element = &handle->queue[write_slot]; diff --git a/src/reverse_offload/context_ro_device.hpp b/src/reverse_offload/context_ro_device.hpp index 8ea7ad3fea..ceccf5e776 100644 --- a/src/reverse_offload/context_ro_device.hpp +++ b/src/reverse_offload/context_ro_device.hpp @@ -34,7 +34,7 @@ __device__ void build_queue_element( ro_net_cmds type, void *dst, void *src, size_t size, int pe, int logPE_stride, int PE_size, int PE_root, void *pWrk, long *pSync, MPI_Comm team_comm, int ro_net_win_id, BlockHandle *handle, - bool blocking, ROC_SHMEM_OP op = ROC_SHMEM_SUM, + bool blocking, ROCSHMEM_OP op = ROCSHMEM_SUM, ro_net_types datatype = RO_NET_INT); class ROContext : public Context { @@ -67,7 +67,7 @@ class ROContext : public Context { __device__ void sync_all(); - __device__ void sync(roc_shmem_team_t team); + __device__ void sync(rocshmem_team_t team); template __device__ void p(T *dest, T value, int pe); @@ -75,13 +75,13 @@ class ROContext : public Context { template __device__ T g(const T *source, int pe); - template + template __device__ void to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); // NOLINT(runtime/int) - template - __device__ void to_all(roc_shmem_team_t team, T *dest, const T *source, + template + __device__ void to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce); template @@ -133,7 +133,7 @@ class ROContext : public Context { __device__ void amo_xor(void *dst, T value, int pe); template - __device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source, + __device__ void broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root); template @@ -142,43 +142,43 @@ class ROContext : public Context { long *p_sync); // NOLINT(runtime/int) template - __device__ void alltoall(roc_shmem_team_t team, T *dest, const T 
*source, + __device__ void alltoall(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_broadcast(roc_shmem_team_t team, T *dest, + __device__ void alltoall_broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_mpi(roc_shmem_team_t team, T *dest, const T *source, + __device__ void alltoall_mpi(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_gcen(roc_shmem_team_t team, T *dest, const T *source, + __device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void alltoall_gcen2(roc_shmem_team_t team, T *dest, + __device__ void alltoall_gcen2(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source, + __device__ void fcollect(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_broadcast(roc_shmem_team_t team, T *dest, + __device__ void fcollect_broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_mpi(roc_shmem_team_t team, T *dest, const T *source, + __device__ void fcollect_mpi(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_gcen(roc_shmem_team_t team, T *dest, const T *source, + __device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source, int nelems); template - __device__ void fcollect_gcen2(roc_shmem_team_t team, T *dest, + __device__ void fcollect_gcen2(rocshmem_team_t team, T *dest, const T *source, int nelems); __device__ void putmem_wg(void *dest, const void *source, size_t nelems, diff --git a/src/reverse_offload/context_ro_host.hpp b/src/reverse_offload/context_ro_host.hpp index 62710da395..dd3abf0ee6 100644 --- a/src/reverse_offload/context_ro_host.hpp +++ b/src/reverse_offload/context_ro_host.hpp @@ -135,16 
+135,16 @@ class ROHostContext : public Context { long *p_sync); // NOLINT(runtime/int) template - __host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source, + __host__ void broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root); - template + template __host__ void to_all(T *dest, const T *source, int nreduce, int pe_start, int log_pe_stride, int pe_size, T *p_wrk, long *p_sync); // NOLINT(runtime/int) - template - __host__ void to_all(roc_shmem_team_t team, T *dest, const T *source, + template + __host__ void to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce); template diff --git a/src/reverse_offload/context_ro_tmpl_device.hpp b/src/reverse_offload/context_ro_tmpl_device.hpp index 3ec0d855f9..003ddae928 100644 --- a/src/reverse_offload/context_ro_tmpl_device.hpp +++ b/src/reverse_offload/context_ro_tmpl_device.hpp @@ -108,8 +108,8 @@ struct GetROType { ********************************* DEVICE API ********************************* *****************************************************************************/ -template -__device__ void ROContext::to_all(roc_shmem_team_t team, T *dest, +template +__device__ void ROContext::to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce) { if (!is_thread_zero_in_block()) { __syncthreads(); @@ -125,7 +125,7 @@ __device__ void ROContext::to_all(roc_shmem_team_t team, T *dest, __syncthreads(); } -template +template __device__ void ROContext::to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) { @@ -210,7 +210,7 @@ __device__ T ROContext::amo_fetch_cas(void *dst, T value, T cond, int pe) { value, pe, 0, 0, 0, reinterpret_cast(static_cast(cond)), nullptr, (MPI_Comm)NULL, ro_net_win_id, block_handle, true, - ROC_SHMEM_SUM, GetROType::Type); + ROCSHMEM_SUM, GetROType::Type); __threadfence(); return *source; } @@ -225,7 +225,7 @@ __device__ T ROContext::amo_fetch_add(void *dst, T value, int pe) { auto 
source{get_unused_atomic()}; build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast(source), value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL, - ro_net_win_id, block_handle, true, ROC_SHMEM_SUM, + ro_net_win_id, block_handle, true, ROCSHMEM_SUM, GetROType::Type); __threadfence(); return *source; @@ -241,7 +241,7 @@ __device__ T ROContext::amo_swap(void *dst, T value, int pe) { auto source{get_unused_atomic()}; build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast(source), value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL, - ro_net_win_id, block_handle, true, ROC_SHMEM_REPLACE, + ro_net_win_id, block_handle, true, ROCSHMEM_REPLACE, GetROType::Type); __threadfence(); return *source; @@ -257,7 +257,7 @@ __device__ T ROContext::amo_fetch_and(void *dst, T value, int pe) { auto source{get_unused_atomic()}; build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast(source), value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL, - ro_net_win_id, block_handle, true, ROC_SHMEM_AND, + ro_net_win_id, block_handle, true, ROCSHMEM_AND, GetROType::Type); __threadfence(); return *source; @@ -273,7 +273,7 @@ __device__ T ROContext::amo_fetch_or(void *dst, T value, int pe) { auto source{get_unused_atomic()}; build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast(source), value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL, - ro_net_win_id, block_handle, true, ROC_SHMEM_OR, + ro_net_win_id, block_handle, true, ROCSHMEM_OR, GetROType::Type); __threadfence(); return *source; @@ -289,7 +289,7 @@ __device__ T ROContext::amo_fetch_xor(void *dst, T value, int pe) { auto source{get_unused_atomic()}; build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast(source), value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL, - ro_net_win_id, block_handle, true, ROC_SHMEM_XOR, + ro_net_win_id, block_handle, true, ROCSHMEM_XOR, GetROType::Type); __threadfence(); return *source; @@ -301,7 +301,7 @@ __device__ void ROContext::amo_xor(void *dst, T value, int pe) { } template -__device__ 
void ROContext::broadcast(roc_shmem_team_t team, T *dest, +__device__ void ROContext::broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root) { if (!is_thread_zero_in_block()) { __syncthreads(); @@ -313,7 +313,7 @@ __device__ void ROContext::broadcast(roc_shmem_team_t team, T *dest, build_queue_element(RO_NET_TEAM_BROADCAST, dest, const_cast(source), nelems, 0, 0, 0, pe_root, nullptr, nullptr, team_obj->mpi_comm, ro_net_win_id, block_handle, true, - ROC_SHMEM_SUM, GetROType::Type); + ROCSHMEM_SUM, GetROType::Type); __syncthreads(); } @@ -331,13 +331,13 @@ __device__ void ROContext::broadcast(T *dest, const T *source, int nelems, build_queue_element(RO_NET_BROADCAST, dest, const_cast(source), nelems, pe_start, log_pe_stride, pe_size, pe_root, nullptr, p_sync, (MPI_Comm)NULL, ro_net_win_id, block_handle, true, - ROC_SHMEM_SUM, GetROType::Type); + ROCSHMEM_SUM, GetROType::Type); __syncthreads(); } template -__device__ void ROContext::alltoall(roc_shmem_team_t team, T *dest, +__device__ void ROContext::alltoall(rocshmem_team_t team, T *dest, const T *source, int nelems) { if (!is_thread_zero_in_block()) { __syncthreads(); @@ -349,13 +349,13 @@ __device__ void ROContext::alltoall(roc_shmem_team_t team, T *dest, build_queue_element(RO_NET_ALLTOALL, dest, const_cast(source), nelems, 0, 0, 0, 0, team_obj->ata_buffer, nullptr, team_obj->mpi_comm, ro_net_win_id, block_handle, true, - ROC_SHMEM_SUM, GetROType::Type); + ROCSHMEM_SUM, GetROType::Type); __syncthreads(); } template -__device__ void ROContext::fcollect(roc_shmem_team_t team, T *dest, +__device__ void ROContext::fcollect(rocshmem_team_t team, T *dest, const T *source, int nelems) { if (!is_thread_zero_in_block()) { __syncthreads(); @@ -367,7 +367,7 @@ __device__ void ROContext::fcollect(roc_shmem_team_t team, T *dest, build_queue_element(RO_NET_FCOLLECT, dest, const_cast(source), nelems, 0, 0, 0, 0, team_obj->ata_buffer, nullptr, team_obj->mpi_comm, ro_net_win_id, block_handle, true, - 
ROC_SHMEM_SUM, GetROType::Type); + ROCSHMEM_SUM, GetROType::Type); __syncthreads(); } diff --git a/src/reverse_offload/context_ro_tmpl_host.hpp b/src/reverse_offload/context_ro_tmpl_host.hpp index 25d2e72c35..3bffda1cd0 100644 --- a/src/reverse_offload/context_ro_tmpl_host.hpp +++ b/src/reverse_offload/context_ro_tmpl_host.hpp @@ -114,7 +114,7 @@ __host__ void ROHostContext::broadcast(T *dest, const T *source, int nelems, } template -__host__ void ROHostContext::broadcast(roc_shmem_team_t team, T *dest, +__host__ void ROHostContext::broadcast(rocshmem_team_t team, T *dest, const T *source, int nelems, int pe_root) { DPRINTF("Function: Team-based ro_net_host_broadcast\n"); @@ -122,7 +122,7 @@ __host__ void ROHostContext::broadcast(roc_shmem_team_t team, T *dest, host_interface->broadcast(team, dest, source, nelems, pe_root); } -template +template __host__ void ROHostContext::to_all(T *dest, const T *source, int nreduce, int pe_start, int log_pe_stride, int pe_size, T *p_wrk, long *p_sync) { @@ -132,8 +132,8 @@ __host__ void ROHostContext::to_all(T *dest, const T *source, int nreduce, pe_size, p_wrk, p_sync); } -template -__host__ void ROHostContext::to_all(roc_shmem_team_t team, T *dest, +template +__host__ void ROHostContext::to_all(rocshmem_team_t team, T *dest, const T *source, int nreduce) { DPRINTF("Function: Team-based ro_net_host_to_all\n"); diff --git a/src/reverse_offload/mpi_transport.cpp b/src/reverse_offload/mpi_transport.cpp index 154b688581..9aa660d30c 100644 --- a/src/reverse_offload/mpi_transport.cpp +++ b/src/reverse_offload/mpi_transport.cpp @@ -142,7 +142,7 @@ void MPITransport::submitRequestsToMPI() { const_cast(&next_element.ol1.atomic_value), next_element.PE, next_element.ro_net_win_id, queue_idx, next_element.threadId, true, - static_cast(next_element.op), + static_cast(next_element.op), static_cast(next_element.datatype)); DPRINTF("Received AMO dst %p src %p Val %llu pe %d\n", next_element.dst, next_element.src, next_element.ol1.atomic_value, 
next_element.PE); @@ -163,7 +163,7 @@ void MPITransport::submitRequestsToMPI() { team_reduction(next_element.dst, next_element.src, next_element.ol1.size, next_element.ro_net_win_id, queue_idx, next_element.team_comm, - static_cast(next_element.op), + static_cast(next_element.op), static_cast(next_element.datatype), next_element.threadId, true); DPRINTF("Received FLOAT_SUM_TEAM_TO_ALL dst %p src %p size %lu team %d\n", @@ -175,7 +175,7 @@ void MPITransport::submitRequestsToMPI() { next_element.PE, next_element.ro_net_win_id, queue_idx, next_element.PE, next_element.logPE_stride, next_element.PE_size, next_element.ol2.pWrk, next_element.pSync, - static_cast(next_element.op), + static_cast(next_element.op), static_cast(next_element.datatype), next_element.threadId, true); DPRINTF( @@ -275,15 +275,15 @@ void MPITransport::finalizeTransport() { delete host_interface; } -roc_shmem_team_t get_external_team(ROTeam *team) { - return reinterpret_cast(team); +rocshmem_team_t get_external_team(ROTeam *team) { + return reinterpret_cast(team); } void MPITransport::createNewTeam(ROBackend *backend, Team *parent_team, TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) { + rocshmem_team_t *new_team) { ROTeam *new_team_obj{nullptr}; CHECK_HIP(hipMalloc(&new_team_obj, sizeof(ROTeam))); @@ -342,26 +342,26 @@ void MPITransport::barrier(int blockId, int threadId, bool blocking, outstanding[blockId]++; } -MPI_Op MPITransport::get_mpi_op(ROC_SHMEM_OP op) { +MPI_Op MPITransport::get_mpi_op(ROCSHMEM_OP op) { switch (op) { - case ROC_SHMEM_SUM: + case ROCSHMEM_SUM: return MPI_SUM; - case ROC_SHMEM_MAX: + case ROCSHMEM_MAX: return MPI_MAX; - case ROC_SHMEM_MIN: + case ROCSHMEM_MIN: return MPI_MIN; - case ROC_SHMEM_PROD: + case ROCSHMEM_PROD: return MPI_PROD; - case ROC_SHMEM_AND: + case ROCSHMEM_AND: return MPI_BAND; - case ROC_SHMEM_OR: + case ROCSHMEM_OR: return MPI_BOR; - case ROC_SHMEM_XOR: 
+ case ROCSHMEM_XOR: return MPI_BXOR; - case ROC_SHMEM_REPLACE: + case ROCSHMEM_REPLACE: return MPI_REPLACE; default: - fprintf(stderr, "Unknown ROC_SHMEM op MPI conversion %d\n", op); + fprintf(stderr, "Unknown rocSHMEM op MPI conversion %d\n", op); abort(); } } @@ -383,7 +383,7 @@ static MPI_Datatype convertType(ro_net_types type) { case RO_NET_LONG_DOUBLE: return MPI_LONG_DOUBLE; default: - fprintf(stderr, "Unknown ROC_SHMEM type MPI conversion %d\n", type); + fprintf(stderr, "Unknown rocSHMEM type MPI conversion %d\n", type); abort(); } } @@ -391,7 +391,7 @@ static MPI_Datatype convertType(ro_net_types type) { void MPITransport::reduction(void *dst, void *src, int size, int pe, int win_id, int blockId, int start, int logPstride, int sizePE, void *pWrk, long *pSync, - ROC_SHMEM_OP op, ro_net_types type, int threadId, + ROCSHMEM_OP op, ro_net_types type, int threadId, bool blocking) { MPI_Request request{}; MPI_Op mpi_op{get_mpi_op(op)}; @@ -435,7 +435,7 @@ void MPITransport::broadcast(void *dst, void *src, int size, int pe, } void MPITransport::team_reduction(void *dst, void *src, int size, int win_id, - int blockId, MPI_Comm team, ROC_SHMEM_OP op, + int blockId, MPI_Comm team, ROCSHMEM_OP op, ro_net_types type, int threadId, bool blocking) { MPI_Request request{}; @@ -1046,7 +1046,7 @@ void MPITransport::putMem(void *dst, void *src, int size, int pe, int win_id, void MPITransport::amoFOP(void *dst, void *src, void *val, int pe, int win_id, int blockId, int threadId, bool blocking, - ROC_SHMEM_OP op, ro_net_types type) { + ROCSHMEM_OP op, ro_net_types type) { queue->flush_hdp(); auto *bp{backend_proxy->get()}; diff --git a/src/reverse_offload/mpi_transport.hpp b/src/reverse_offload/mpi_transport.hpp index 11a98ef3ff..fdd8e5a133 100644 --- a/src/reverse_offload/mpi_transport.hpp +++ b/src/reverse_offload/mpi_transport.hpp @@ -49,18 +49,18 @@ class MPITransport : public Transport { TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int 
my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) override; + rocshmem_team_t *new_team) override; void barrier(int blockId, int threadId, bool blocking, MPI_Comm team) override; void reduction(void *dst, void *src, int size, int pe, int win_id, int blockId, int start, int logPstride, int sizePE, void *pWrk, - long *pSync, ROC_SHMEM_OP op, ro_net_types type, + long *pSync, ROCSHMEM_OP op, ro_net_types type, int threadId, bool blocking) override; void team_reduction(void *dst, void *src, int size, int win_id, int blockId, - MPI_Comm team, ROC_SHMEM_OP op, ro_net_types type, + MPI_Comm team, ROCSHMEM_OP op, ro_net_types type, int threadId, bool blocking) override; void broadcast(void *dst, void *src, int size, int pe, int win_id, @@ -116,7 +116,7 @@ class MPITransport : public Transport { int threadId, bool blocking, bool inline_data = false) override; void amoFOP(void *dst, void *src, void *val, int pe, int win_id, int blockId, - int threadId, bool blocking, ROC_SHMEM_OP op, + int threadId, bool blocking, ROCSHMEM_OP op, ro_net_types type) override; void amoFCAS(void *dst, void *src, void *val, int pe, int win_id, int blockId, @@ -193,7 +193,7 @@ class MPITransport : public Transport { void submitRequestsToMPI(); - MPI_Op get_mpi_op(ROC_SHMEM_OP op); + MPI_Op get_mpi_op(ROCSHMEM_OP op); Queue *queue{nullptr}; diff --git a/src/reverse_offload/transport.hpp b/src/reverse_offload/transport.hpp index 0dcc5acd22..108bdcbf8f 100644 --- a/src/reverse_offload/transport.hpp +++ b/src/reverse_offload/transport.hpp @@ -27,7 +27,7 @@ #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "backend_proxy.hpp" #include "ro_net_team.hpp" @@ -48,18 +48,18 @@ class Transport { TeamInfo *team_info_wrt_parent, TeamInfo *team_info_wrt_world, int num_pes, int my_pe_in_new_team, MPI_Comm team_comm, - roc_shmem_team_t *new_team) = 0; + rocshmem_team_t *new_team) = 0; virtual void barrier(int wg_id, int threadId, bool blocking, MPI_Comm 
team) = 0; virtual void reduction(void *dst, void *src, int size, int pe, int win_id, int wg_id, int start, int logPstride, int sizePE, - void *pWrk, long *pSync, ROC_SHMEM_OP op, + void *pWrk, long *pSync, ROCSHMEM_OP op, ro_net_types type, int threadId, bool blocking) = 0; virtual void team_reduction(void *dst, void *src, int size, int win_id, - int wg_id, MPI_Comm team, ROC_SHMEM_OP op, + int wg_id, MPI_Comm team, ROCSHMEM_OP op, ro_net_types type, int threadId, bool blocking) = 0; @@ -89,7 +89,7 @@ class Transport { int wg_id, int threadId, bool blocking) = 0; virtual void amoFOP(void *dst, void *src, void *val, int pe, int win_id, - int wg_id, int threadId, bool blocking, ROC_SHMEM_OP op, + int wg_id, int threadId, bool blocking, ROCSHMEM_OP op, ro_net_types type) = 0; virtual void amoFCAS(void *dst, void *src, void *val, int pe, int win_id, diff --git a/src/roc_shmem_gpu.cpp b/src/roc_shmem_gpu.cpp deleted file mode 100644 index 2deff5d0b7..0000000000 --- a/src/roc_shmem_gpu.cpp +++ /dev/null @@ -1,1540 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -/** - * @file roc_shmem.cpp - * @brief Public header for ROC_SHMEM device and host libraries. - * - * This is the implementation for the public roc_shmem.hpp header file. This - * guy just extracts the transport from the opaque public handles and delegates - * to the appropriate backend. - * - * The device-side delegation is nasty because we can't use polymorphism with - * our current shader compiler stack. Maybe one day..... - * - * TODO: Could probably autogenerate many of these functions from macros. - * - * TODO: Support runtime backend detection. 
- * - */ - -#include - -#include - -#include "config.h" // NOLINT(build/include_subdir) -#include "roc_shmem/roc_shmem.hpp" -#include "backend_bc.hpp" -#include "context_incl.hpp" -#include "team.hpp" -#include "templates.hpp" -#include "util.hpp" - -#ifdef USE_GPU_IB -#include "gpu_ib/context_ib_tmpl_device.hpp" -#elif defined(USE_RO) -#include "reverse_offload/context_ro_tmpl_device.hpp" -#else -#include "ipc/context_ipc_tmpl_device.hpp" -#endif - -/****************************************************************************** - **************************** Device Vars And Init **************************** - *****************************************************************************/ - -namespace rocshmem { - -__device__ __constant__ roc_shmem_ctx_t ROC_SHMEM_CTX_DEFAULT{}; - -__constant__ Backend *device_backend_proxy; - -__device__ void roc_shmem_wg_init() { - int provided; - - /* - * Non-threaded init is allowed to select any thread mode, so don't worry - * if provided is different. 
- */ - roc_shmem_wg_init_thread(ROC_SHMEM_THREAD_WG_FUNNELED, &provided); -} - -__device__ void roc_shmem_wg_init_thread([[maybe_unused]] int requested, - int *provided) { - roc_shmem_query_thread(provided); -} - -__device__ void roc_shmem_query_thread(int *provided) { -#ifdef USE_THREADS - *provided = ROC_SHMEM_THREAD_MULTIPLE; -#else - *provided = ROC_SHMEM_THREAD_WG_FUNNELED; -#endif -} - -__device__ void roc_shmem_wg_finalize() {} - -/****************************************************************************** - ************************** Default Context Wrappers ************************** - *****************************************************************************/ - -__device__ void roc_shmem_putmem(void *dest, const void *source, size_t nelems, - int pe) { - roc_shmem_ctx_putmem(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_put(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_put(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_p(T *dest, T value, int pe) { - roc_shmem_p(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ T roc_shmem_g(const T *source, int pe) { - return roc_shmem_g(ROC_SHMEM_CTX_DEFAULT, source, pe); -} - -__device__ void roc_shmem_getmem(void *dest, const void *source, size_t nelems, - int pe) { - roc_shmem_ctx_getmem(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_get(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_get(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -__device__ void roc_shmem_putmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { - roc_shmem_ctx_putmem_nbi(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_put_nbi(T *dest, const T *source, size_t nelems, - int pe) { - roc_shmem_put_nbi(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -__device__ void 
roc_shmem_getmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { - roc_shmem_ctx_getmem_nbi(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_get_nbi(T *dest, const T *source, size_t nelems, - int pe) { - roc_shmem_get_nbi(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); -} - -__device__ void roc_shmem_fence() { - roc_shmem_ctx_fence(ROC_SHMEM_CTX_DEFAULT); -} - -__device__ void roc_shmem_fence(int pe) { - roc_shmem_ctx_fence(ROC_SHMEM_CTX_DEFAULT, pe); -} - -__device__ void roc_shmem_quiet() { - roc_shmem_ctx_quiet(ROC_SHMEM_CTX_DEFAULT); -} - -__device__ void roc_shmem_threadfence_system() { - roc_shmem_ctx_threadfence_system(ROC_SHMEM_CTX_DEFAULT); -} - -template -__device__ T roc_shmem_atomic_fetch_add(T *dest, T val, int pe) { - return roc_shmem_atomic_fetch_add(ROC_SHMEM_CTX_DEFAULT, dest, val, pe); -} - -template -__device__ T roc_shmem_atomic_compare_swap(T *dest, T cond, T val, int pe) { - return roc_shmem_atomic_compare_swap(ROC_SHMEM_CTX_DEFAULT, dest, cond, val, - pe); -} - -template -__device__ T roc_shmem_atomic_fetch_inc(T *dest, int pe) { - return roc_shmem_atomic_fetch_inc(ROC_SHMEM_CTX_DEFAULT, dest, pe); -} - -template -__device__ T roc_shmem_atomic_fetch(T *source, int pe) { - return roc_shmem_atomic_fetch(ROC_SHMEM_CTX_DEFAULT, source, pe); -} - -template -__device__ void roc_shmem_atomic_add(T *dest, T val, int pe) { - roc_shmem_atomic_add(ROC_SHMEM_CTX_DEFAULT, dest, val, pe); -} - -template -__device__ void roc_shmem_atomic_inc(T *dest, int pe) { - roc_shmem_atomic_inc(ROC_SHMEM_CTX_DEFAULT, dest, pe); -} - -template -__device__ void roc_shmem_atomic_set(T *dest, T value, int pe) { - roc_shmem_atomic_set(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ T roc_shmem_atomic_swap(T *dest, T value, int pe) { - return roc_shmem_atomic_swap(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ T roc_shmem_atomic_fetch_and(T *dest, T value, int pe) { - return 
roc_shmem_atomic_fetch_and(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ void roc_shmem_atomic_and(T *dest, T value, int pe) { - roc_shmem_atomic_and(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ T roc_shmem_atomic_fetch_or(T *dest, T value, int pe) { - return roc_shmem_atomic_fetch_or(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ void roc_shmem_atomic_or(T *dest, T value, int pe) { - roc_shmem_atomic_or(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ T roc_shmem_atomic_fetch_xor(T *dest, T value, int pe) { - return roc_shmem_atomic_fetch_xor(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -template -__device__ void roc_shmem_atomic_xor(T *dest, T value, int pe) { - roc_shmem_atomic_xor(ROC_SHMEM_CTX_DEFAULT, dest, value, pe); -} - -/****************************************************************************** - ************************* Private Context Interfaces ************************* - *****************************************************************************/ - -__device__ int translate_pe(roc_shmem_ctx_t ctx, int pe) { - if (ctx.team_opaque) { - TeamInfo *tinfo = reinterpret_cast(ctx.team_opaque); - return (tinfo->pe_start + tinfo->stride * pe); - } else { - return pe; - } -} - -__host__ void set_internal_ctx(roc_shmem_ctx_t *ctx) { - CHECK_HIP(hipMemcpyToSymbol(HIP_SYMBOL(ROC_SHMEM_CTX_DEFAULT), ctx, - sizeof(roc_shmem_ctx_t), 0, - hipMemcpyHostToDevice)); -} - -__device__ Context *get_internal_ctx(roc_shmem_ctx_t ctx) { - return reinterpret_cast(ctx.ctx_opaque); -} - -__device__ int roc_shmem_wg_ctx_create(long option, roc_shmem_ctx_t *ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_create\n"); - bool result{true}; - if (get_flat_block_id() == 0) { - ctx->team_opaque = reinterpret_cast(ROC_SHMEM_CTX_DEFAULT.team_opaque); - result = device_backend_proxy->create_ctx(option, ctx); - reinterpret_cast(ctx->ctx_opaque)->setFence(option); - } - __syncthreads(); - return result 
== true ? 0 : -1; -} - -__device__ int roc_shmem_wg_team_create_ctx(roc_shmem_team_t team, long options, - roc_shmem_ctx_t *ctx) { - GPU_DPRINTF("Function: roc_shmem_team_create_ctx\n"); - if (team == ROC_SHMEM_TEAM_INVALID) { - return -1; - } - - bool result{true}; - if (get_flat_block_id() == 0) { - Team *team_obj{get_internal_team(team)}; - TeamInfo *info_wrt_world = team_obj->tinfo_wrt_world; - ctx->team_opaque = info_wrt_world; - result = device_backend_proxy->create_ctx(options, ctx); - reinterpret_cast(ctx->ctx_opaque)->setFence(options); - } - __syncthreads(); - - return result == true ? 0 : -1; -} - -__device__ void roc_shmem_wg_ctx_destroy( - [[maybe_unused]] roc_shmem_ctx_t *ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_destroy\n"); - - if (get_flat_block_id() == 0) { - device_backend_proxy->destroy_ctx(ctx); - } -} - -__device__ void roc_shmem_ctx_threadfence_system(roc_shmem_ctx_t ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_threadfence_system\n"); - - get_internal_ctx(ctx)->threadfence_system(); -} - -__device__ void roc_shmem_ctx_putmem(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_putmem\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->putmem(dest, source, nelems, pe_in_world); -} - -template -__device__ void roc_shmem_put(roc_shmem_ctx_t ctx, T *dest, const T *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_put\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->put(dest, source, nelems, pe_in_world); -} - -template -__device__ void roc_shmem_p(roc_shmem_ctx_t ctx, T *dest, T value, int pe) { - GPU_DPRINTF("Function: roc_shmem_p\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->p(dest, value, pe_in_world); -} - -template -__device__ T roc_shmem_g(roc_shmem_ctx_t ctx, const T *source, int pe) { - GPU_DPRINTF("Function: roc_shmem_g\n"); - - int pe_in_world = translate_pe(ctx, 
pe); - - return get_internal_ctx(ctx)->g(source, pe_in_world); -} - -__device__ void roc_shmem_ctx_getmem(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_getmem\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->getmem(dest, source, nelems, pe_in_world); -} - -template -__device__ void roc_shmem_get(roc_shmem_ctx_t ctx, T *dest, const T *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_get\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->get(dest, source, nelems, pe_in_world); -} - -__device__ void roc_shmem_ctx_putmem_nbi(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_putmem_nbi\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->putmem_nbi(dest, source, nelems, pe_in_world); -} - -template -__device__ void roc_shmem_put_nbi(roc_shmem_ctx_t ctx, T *dest, const T *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_put_nbi\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->put_nbi(dest, source, nelems, pe_in_world); -} - -__device__ void roc_shmem_ctx_getmem_nbi(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_getmem_nbi\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->getmem_nbi(dest, source, nelems, pe_in_world); -} - -template -__device__ void roc_shmem_get_nbi(roc_shmem_ctx_t ctx, T *dest, const T *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_get_nbi\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->get_nbi(dest, source, nelems, pe_in_world); -} - -__device__ void roc_shmem_ctx_fence(roc_shmem_ctx_t ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_fence\n"); - - get_internal_ctx(ctx)->fence(); -} - -__device__ void 
roc_shmem_ctx_fence(roc_shmem_ctx_t ctx, int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_fence\n"); - - int pe_in_world = translate_pe(ctx, pe); - - get_internal_ctx(ctx)->fence(pe_in_world); -} - -__device__ void roc_shmem_ctx_quiet(roc_shmem_ctx_t ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_quiet\n"); - - get_internal_ctx(ctx)->quiet(); -} - -__device__ void *roc_shmem_ptr(const void *dest, int pe) { - GPU_DPRINTF("Function: roc_shmem_ptr\n"); - - return get_internal_ctx(ROC_SHMEM_CTX_DEFAULT)->shmem_ptr(dest, pe); -} - -template -__device__ int roc_shmem_wg_reduce(roc_shmem_ctx_t ctx, roc_shmem_team_t team, - T *dest, const T *source, int nreduce) { - GPU_DPRINTF("Function: roc_shmem_reduce\n"); - - return get_internal_ctx(ctx)->reduce(team, dest, source, nreduce); -} - -template -__device__ void roc_shmem_wg_broadcast(roc_shmem_ctx_t ctx, - roc_shmem_team_t team, T *dest, - const T *source, int nelem, - int pe_root) { - GPU_DPRINTF("Function: Team-based roc_shmem_broadcast\n"); - - get_internal_ctx(ctx)->broadcast(team, dest, source, nelem, pe_root); -} - -template -__device__ void roc_shmem_wg_alltoall(roc_shmem_ctx_t ctx, - roc_shmem_team_t team, T *dest, - const T *source, int nelem) { - GPU_DPRINTF("Function: roc_shmem_alltoall\n"); - - get_internal_ctx(ctx)->alltoall(team, dest, source, nelem); -} - -template -__device__ void roc_shmem_wg_fcollect(roc_shmem_ctx_t ctx, - roc_shmem_team_t team, T *dest, - const T *source, int nelem) { - GPU_DPRINTF("Function: roc_shmem_fcollect\n"); - - get_internal_ctx(ctx)->fcollect(team, dest, source, nelem); -} - -template -__device__ void roc_shmem_wait_until(T *ivars, int cmp, T val) { - GPU_DPRINTF("Function: roc_shmem_wait_until\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL); - ctx_internal->wait_until(ivars, cmp, val); -} - -template -__device__ void roc_shmem_wait_until_all(T *ivars, size_t nelems, const int* status, - int cmp, T val) { 
- GPU_DPRINTF("Function: roc_shmem_wait_until_all\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL); - ctx_internal->wait_until_all(ivars, nelems, status, cmp, val); -} - -template -__device__ size_t roc_shmem_wait_until_any(T *ivars, size_t nelems, const int* status, - int cmp, T val) { - GPU_DPRINTF("Function: roc_shmem_wait_until_any\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY); - return ctx_internal->wait_until_any(ivars, nelems, status, cmp, val); -} - -template -__device__ size_t roc_shmem_wait_until_some(T *ivars, size_t nelems, size_t* indices, - const int* status, int cmp, - T val) { - DPRINTF("Function: roc_shmem_wait_until_some\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME); - return ctx_internal->wait_until_some(ivars, nelems, indices, status, cmp, val); -} - -template -__device__ size_t roc_shmem_wait_until_any_vector(T *ivars, size_t nelems, const int* status, - int cmp, T* vals) { - DPRINTF("Function: roc_shmem_wait_until_any_vector\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY_VECTOR); - return ctx_internal->wait_until_any_vector(ivars, nelems, status, cmp, vals); -} - -template -__device__ void roc_shmem_wait_until_all_vector(T *ivars, size_t nelems, const int* status, - int cmp, T* vals) { - DPRINTF("Function: roc_shmem_wait_until_all_vector\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL_VECTOR); - ctx_internal->wait_until_all_vector(ivars, nelems, status, cmp, vals); -} - -template -__device__ size_t roc_shmem_wait_until_some_vector(T *ivars, size_t nelems, - size_t* indices, - const int* status, - int cmp, T* vals) { - DPRINTF("Function: 
roc_shmem_wait_until_some_vector\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME_VECTOR); - return ctx_internal->wait_until_some_vector(ivars, nelems, indices, status, cmp, vals); -} - -template -__device__ int roc_shmem_test(T *ivars, int cmp, T val) { - GPU_DPRINTF("Function: roc_shmem_testl\n"); - - Context *ctx_internal = get_internal_ctx(ROC_SHMEM_CTX_DEFAULT); - ctx_internal->ctxStats.incStat(NUM_TEST); - - return ctx_internal->test(ivars, cmp, val); -} - -__device__ void roc_shmem_ctx_wg_barrier_all(roc_shmem_ctx_t ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_barrier_all\n"); - - get_internal_ctx(ctx)->barrier_all(); -} - -__device__ void roc_shmem_wg_barrier_all() { - roc_shmem_ctx_wg_barrier_all(ROC_SHMEM_CTX_DEFAULT); -} - -__device__ void roc_shmem_ctx_wg_sync_all(roc_shmem_ctx_t ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_sync_all\n"); - - get_internal_ctx(ctx)->sync_all(); -} - -__device__ void roc_shmem_wg_sync_all() { - roc_shmem_ctx_wg_sync_all(ROC_SHMEM_CTX_DEFAULT); -} - -__device__ void roc_shmem_ctx_wg_team_sync(roc_shmem_ctx_t ctx, - roc_shmem_team_t team) { - GPU_DPRINTF("Function: roc_shmem_ctx_sync_all\n"); - - get_internal_ctx(ctx)->sync(team); -} - -__device__ void roc_shmem_wg_team_sync(roc_shmem_team_t team) { - roc_shmem_ctx_wg_team_sync(ROC_SHMEM_CTX_DEFAULT, team); -} - -__device__ int roc_shmem_ctx_n_pes(roc_shmem_ctx_t ctx) { - GPU_DPRINTF("Function: roc_shmem_n_pes\n"); - - return get_internal_ctx(ctx)->num_pes; -} - -__device__ int roc_shmem_n_pes() { - return get_internal_ctx(ROC_SHMEM_CTX_DEFAULT)->num_pes; -} - -__device__ int roc_shmem_ctx_my_pe(roc_shmem_ctx_t ctx) { - GPU_DPRINTF("Function: roc_shmem_ctx_my_pe\n"); - - return get_internal_ctx(ctx)->my_pe; -} - -__device__ int roc_shmem_my_pe() { - return get_internal_ctx(ROC_SHMEM_CTX_DEFAULT)->my_pe; -} - -__device__ uint64_t roc_shmem_timer() { - GPU_DPRINTF("Function: roc_shmem_timer\n"); - 
- return __read_clock(); -} - -template -__device__ T roc_shmem_atomic_fetch_add(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_fetch_add\n"); - - return get_internal_ctx(ctx)->amo_fetch_add(dest, val, pe); -} - -template -__device__ T roc_shmem_atomic_compare_swap(roc_shmem_ctx_t ctx, T *dest, T cond, - T val, int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_compare_swap\n"); - - return get_internal_ctx(ctx)->amo_fetch_cas(dest, val, cond, pe); -} - -template -__device__ T roc_shmem_atomic_fetch_inc(roc_shmem_ctx_t ctx, T *dest, int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_fetch_inc\n"); - - return get_internal_ctx(ctx)->amo_fetch_add(dest, 1, pe); -} - -template -__device__ T roc_shmem_atomic_fetch(roc_shmem_ctx_t ctx, T *source, int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_fetch\n"); - - return get_internal_ctx(ctx)->amo_fetch_add(source, 0, pe); -} - -template -__device__ void roc_shmem_atomic_add(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_add\n"); - - get_internal_ctx(ctx)->amo_add(dest, val, pe); -} - -template -__device__ void roc_shmem_atomic_inc(roc_shmem_ctx_t ctx, T *dest, int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_inc\n"); - - get_internal_ctx(ctx)->amo_add(dest, 1, pe); -} - -template -__device__ void roc_shmem_atomic_set(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_set\n"); - - get_internal_ctx(ctx)->amo_set(dest, val, pe); -} - -template -__device__ T roc_shmem_atomic_swap(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_swap\n"); - - return get_internal_ctx(ctx)->amo_swap(dest, val, pe); -} - -template -__device__ T roc_shmem_atomic_fetch_and(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_fetch_and\n"); - - return get_internal_ctx(ctx)->amo_fetch_and(dest, val, pe); -} - -template -__device__ void 
roc_shmem_atomic_and(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_and\n"); - - get_internal_ctx(ctx)->amo_and(dest, val, pe); -} - -template -__device__ T roc_shmem_atomic_fetch_or(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_fetch_or\n"); - - return get_internal_ctx(ctx)->amo_fetch_or(dest, val, pe); -} - -template -__device__ void roc_shmem_atomic_or(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_or\n"); - - get_internal_ctx(ctx)->amo_or(dest, val, pe); -} - -template -__device__ T roc_shmem_atomic_fetch_xor(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_fetch_xor\n"); - - return get_internal_ctx(ctx)->amo_fetch_xor(dest, val, pe); -} - -template -__device__ void roc_shmem_atomic_xor(roc_shmem_ctx_t ctx, T *dest, T val, - int pe) { - GPU_DPRINTF("Function: roc_shmem_atomic_xor\n"); - - get_internal_ctx(ctx)->amo_xor(dest, val, pe); -} - -/** - * SHMEM X RMA API for WG and Wave level - */ -__device__ void roc_shmem_ctx_putmem_wave(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_putmem_wave\n"); - - get_internal_ctx(ctx)->putmem_wave(dest, source, nelems, pe); -} - -__device__ void roc_shmem_ctx_putmem_wg(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_putmem_wg\n"); - - get_internal_ctx(ctx)->putmem_wg(dest, source, nelems, pe); -} - -__device__ void roc_shmem_ctx_putmem_nbi_wave(roc_shmem_ctx_t ctx, void *dest, - const void *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_putmem_nbi_wave\n"); - - get_internal_ctx(ctx)->putmem_nbi_wave(dest, source, nelems, pe); -} - -__device__ void roc_shmem_ctx_putmem_nbi_wg(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: 
roc_shmem_ctx_putmem_nbi_wg\n"); - - get_internal_ctx(ctx)->putmem_nbi_wg(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_put_wave(roc_shmem_ctx_t ctx, T *dest, - const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_put_wave\n"); - - get_internal_ctx(ctx)->put_wave(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_put_wg(roc_shmem_ctx_t ctx, T *dest, const T *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_put_wg\n"); - - get_internal_ctx(ctx)->put_wg(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_put_nbi_wave(roc_shmem_ctx_t ctx, T *dest, - const T *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_put_nbi_wave\n"); - - get_internal_ctx(ctx)->put_nbi_wave(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_put_nbi_wg(roc_shmem_ctx_t ctx, T *dest, - const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_put_nbi_wg\n"); - - get_internal_ctx(ctx)->put_nbi_wg(dest, source, nelems, pe); -} - -__device__ void roc_shmem_ctx_getmem_wg(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_getmem_wg\n"); - - get_internal_ctx(ctx)->getmem_wg(dest, source, nelems, pe); -} - -__device__ void roc_shmem_ctx_getmem_wave(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_getmem_wave\n"); - - get_internal_ctx(ctx)->getmem_wave(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_get_wg(roc_shmem_ctx_t ctx, T *dest, const T *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_get_wg\n"); - - get_internal_ctx(ctx)->get_wg(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_get_wave(roc_shmem_ctx_t ctx, T *dest, - const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_get_wave\n"); - - 
get_internal_ctx(ctx)->get_wave(dest, source, nelems, pe); -} - -__device__ void roc_shmem_ctx_getmem_nbi_wg(roc_shmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_getmem_nbi_wg\n"); - - get_internal_ctx(ctx)->getmem_nbi_wg(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_get_nbi_wg(roc_shmem_ctx_t ctx, T *dest, - const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_get_nbi_wg\n"); - - get_internal_ctx(ctx)->get_nbi_wg(dest, source, nelems, pe); -} - -__device__ void roc_shmem_ctx_getmem_nbi_wave(roc_shmem_ctx_t ctx, void *dest, - const void *source, - size_t nelems, int pe) { - GPU_DPRINTF("Function: roc_shmem_ctx_getmem_nbi_wave\n"); - - get_internal_ctx(ctx)->getmem_nbi_wave(dest, source, nelems, pe); -} - -template -__device__ void roc_shmem_get_nbi_wave(roc_shmem_ctx_t ctx, T *dest, - const T *source, size_t nelems, - int pe) { - GPU_DPRINTF("Function: roc_shmem_get_nbi_wave\n"); - - get_internal_ctx(ctx)->get_nbi_wave(dest, source, nelems, pe); -} - -/****************************************************************************** - ****************************** Teams Interface ******************************* - *****************************************************************************/ - -__device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, - int src_pe, - roc_shmem_team_t dst_team) { - return team_translate_pe(src_team, src_pe, dst_team); -} - -/****************************************************************************** - ************************* Template Generation Macros ************************* - *****************************************************************************/ - -/** - * Template generator for reductions - */ -#define REDUCTION_GEN(T, Op) \ - template __device__ int roc_shmem_wg_reduce( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ - int nreduce); - -/** - * Declare templates for 
the required datatypes (for the compiler) - */ -#define RMA_GEN(T) \ - template __device__ void roc_shmem_put( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_put_nbi( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_p(roc_shmem_ctx_t ctx, T * dest, \ - T value, int pe); \ - template __device__ void roc_shmem_get( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_get_nbi( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ T roc_shmem_g(roc_shmem_ctx_t ctx, const T *source, \ - int pe); \ - template __device__ void roc_shmem_put(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_put_nbi(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_p(T * dest, T value, int pe); \ - template __device__ void roc_shmem_get(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_get_nbi(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ T roc_shmem_g(const T *source, int pe); \ - template __device__ void roc_shmem_wg_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ - int nelem, int pe_root); \ - template __device__ void roc_shmem_wg_alltoall( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ - int nelem); \ - template __device__ void roc_shmem_wg_fcollect( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ - int nelem); \ - template __device__ void roc_shmem_put_wave( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_put_wg( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void 
roc_shmem_put_wave(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_put_wg(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_put_nbi_wave( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_put_nbi_wg( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_put_nbi_wave( \ - T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_put_nbi_wg(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_get_wave( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_get_wg( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_get_wave(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_get_wg(T * dest, const T *source, \ - size_t nelems, int pe); \ - template __device__ void roc_shmem_get_nbi_wave( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_get_nbi_wg( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_get_nbi_wave( \ - T * dest, const T *source, size_t nelems, int pe); \ - template __device__ void roc_shmem_get_nbi_wg(T * dest, const T *source, \ - size_t nelems, int pe); - -/** - * Declare templates for the standard amo types - */ -#define AMO_STANDARD_GEN(T) \ - template __device__ T roc_shmem_atomic_compare_swap( \ - roc_shmem_ctx_t ctx, T * dest, T cond, T value, int pe); \ - template __device__ T roc_shmem_atomic_compare_swap(T * dest, T cond, \ - T value, int pe); \ - template __device__ T roc_shmem_atomic_fetch_inc(roc_shmem_ctx_t ctx, \ - T * dest, int pe); \ - template __device__ T 
roc_shmem_atomic_fetch_inc(T * dest, int pe); \ - template __device__ void roc_shmem_atomic_inc(roc_shmem_ctx_t ctx, \ - T * dest, int pe); \ - template __device__ void roc_shmem_atomic_inc(T * dest, int pe); \ - template __device__ T roc_shmem_atomic_fetch_add( \ - roc_shmem_ctx_t ctx, T * dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_fetch_add(T * dest, T value, \ - int pe); \ - template __device__ void roc_shmem_atomic_add(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __device__ void roc_shmem_atomic_add(T * dest, T value, int pe); - -/** - * Declare templates for the extended amo types - */ -#define AMO_EXTENDED_GEN(T) \ - template __device__ T roc_shmem_atomic_fetch(roc_shmem_ctx_t ctx, \ - T * dest, int pe); \ - template __device__ T roc_shmem_atomic_fetch(T * dest, int pe); \ - template __device__ void roc_shmem_atomic_set(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __device__ void roc_shmem_atomic_set(T * dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_swap(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_swap(T * dest, T value, int pe); - -/** - * Declare templates for the bitwise amo types - */ -#define AMO_BITWISE_GEN(T) \ - template __device__ T roc_shmem_atomic_fetch_and( \ - roc_shmem_ctx_t ctx, T * dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_fetch_and(T * dest, T value, \ - int pe); \ - template __device__ void roc_shmem_atomic_and(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __device__ void roc_shmem_atomic_and(T * dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_fetch_or( \ - roc_shmem_ctx_t ctx, T * dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_fetch_or(T * dest, T value, \ - int pe); \ - template __device__ void roc_shmem_atomic_or(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __device__ void roc_shmem_atomic_or(T * 
dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_fetch_xor( \ - roc_shmem_ctx_t ctx, T * dest, T value, int pe); \ - template __device__ T roc_shmem_atomic_fetch_xor(T * dest, T value, \ - int pe); \ - template __device__ void roc_shmem_atomic_xor(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __device__ void roc_shmem_atomic_xor(T * dest, T value, int pe); - -/** - * Declare templates for the wait types - */ -#define WAIT_GEN(T) \ - template __device__ void roc_shmem_wait_until(T *ivars, \ - int cmp, T val); \ - template __device__ size_t roc_shmem_wait_until_any(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T val); \ - template __device__ void roc_shmem_wait_until_all(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T val); \ - template __device__ size_t roc_shmem_wait_until_some(T *ivars, \ - size_t nelems, size_t* indices, \ - const int* status, \ - int cmp, T val); \ - template __device__ size_t roc_shmem_wait_until_any_vector(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T* vals); \ - template __device__ void roc_shmem_wait_until_all_vector(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T* vals); \ - template __device__ size_t roc_shmem_wait_until_some_vector(T *ivars, \ - size_t nelems, size_t* indices, \ - const int* status, int cmp, \ - T* vals); \ - template __device__ int roc_shmem_test(T *ivars, int cmp, \ - T val); \ - template __device__ void Context::wait_until(T *ivars, int cmp, \ - T val); \ - template __device__ size_t Context::wait_until_any(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T val); \ - template __device__ void Context::wait_until_all(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T val); \ - template __device__ size_t Context::wait_until_some(T *ivars, \ - size_t nelems, \ - size_t* indices, const int* status, \ - int cmp, T val); \ - template __device__ size_t Context::wait_until_any_vector(T *ivars, \ - size_t 
nelems, const int* status, \ - int cmp, T* vals); \ - template __device__ void Context::wait_until_all_vector(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T* vals); \ - template __device__ size_t Context::wait_until_some_vector(T *ivars, \ - size_t nelems, size_t* indices, \ - const int* status, int cmp, \ - T* vals); \ - template __device__ int Context::test(T *ivars, int cmp, T val); - -#define ARITH_REDUCTION_GEN(T) \ - REDUCTION_GEN(T, ROC_SHMEM_SUM) \ - REDUCTION_GEN(T, ROC_SHMEM_MIN) \ - REDUCTION_GEN(T, ROC_SHMEM_MAX) \ - REDUCTION_GEN(T, ROC_SHMEM_PROD) - -#define BITWISE_REDUCTION_GEN(T) \ - REDUCTION_GEN(T, ROC_SHMEM_OR) \ - REDUCTION_GEN(T, ROC_SHMEM_AND) \ - REDUCTION_GEN(T, ROC_SHMEM_XOR) - -#define INT_REDUCTION_GEN(T) \ - ARITH_REDUCTION_GEN(T) \ - BITWISE_REDUCTION_GEN(T) - -#define FLOAT_REDUCTION_GEN(T) ARITH_REDUCTION_GEN(T) - -/** - * Define APIs to call the template functions - **/ - -#define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ - __device__ int roc_shmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ - int nreduce) { \ - return roc_shmem_wg_reduce(ctx, team, dest, source, nreduce); \ - } - -#define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ - REDUCTION_DEF_GEN(T, TNAME, sum, ROC_SHMEM_SUM) \ - REDUCTION_DEF_GEN(T, TNAME, min, ROC_SHMEM_MIN) \ - REDUCTION_DEF_GEN(T, TNAME, max, ROC_SHMEM_MAX) \ - REDUCTION_DEF_GEN(T, TNAME, prod, ROC_SHMEM_PROD) - -#define BITWISE_REDUCTION_DEF_GEN(T, TNAME) \ - REDUCTION_DEF_GEN(T, TNAME, or, ROC_SHMEM_OR) \ - REDUCTION_DEF_GEN(T, TNAME, and, ROC_SHMEM_AND) \ - REDUCTION_DEF_GEN(T, TNAME, xor, ROC_SHMEM_XOR) - -#define INT_REDUCTION_DEF_GEN(T, TNAME) \ - ARITH_REDUCTION_DEF_GEN(T, TNAME) \ - BITWISE_REDUCTION_DEF_GEN(T, TNAME) - -#define FLOAT_REDUCTION_DEF_GEN(T, TNAME) ARITH_REDUCTION_DEF_GEN(T, TNAME) - -#define RMA_DEF_GEN(T, TNAME) \ - __device__ void roc_shmem_ctx_##TNAME##_put( \ - roc_shmem_ctx_t ctx, T *dest, const T 
*source, size_t nelems, int pe) { \ - roc_shmem_put(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_put_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_put_nbi(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_p(roc_shmem_ctx_t ctx, T *dest, \ - T value, int pe) { \ - roc_shmem_p(ctx, dest, value, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_get( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_get(ctx, dest, source, nelems, pe); \ - } \ - __device__ T roc_shmem_ctx_##TNAME##_g(roc_shmem_ctx_t ctx, const T *source, \ - int pe) { \ - return roc_shmem_g(ctx, source, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_get_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_get_nbi(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_put(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_put(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_put_nbi(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_put_nbi(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_p(T *dest, T value, int pe) { \ - roc_shmem_p(dest, value, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_get(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_get(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_get_nbi(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_get_nbi(dest, source, nelems, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_g(const T *source, int pe) { \ - return roc_shmem_g(source, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_put_wave( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_put_wave(ctx, dest, source, nelems, pe); \ - } \ - __device__ void 
roc_shmem_ctx_##TNAME##_put_wg( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_put_wg(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_put_wave(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_put_wave(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_put_wg(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_put_wg(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_put_nbi_wave( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_put_nbi_wave(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_put_nbi_wg( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_put_nbi_wg(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_put_nbi_wave(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_put_nbi_wave(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_put_nbi_wg(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_put_nbi_wg(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_get_wave( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_get_wave(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_get_wg( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_get_wg(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_get_wave(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_get_wave(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_get_wg(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_get_wg(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_get_nbi_wave( \ - roc_shmem_ctx_t ctx, T *dest, 
const T *source, size_t nelems, int pe) { \ - roc_shmem_get_nbi_wave(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_get_nbi_wg( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_get_nbi_wg(ctx, dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_get_nbi_wave(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_get_nbi_wave(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_get_nbi_wg(T *dest, const T *source, \ - size_t nelems, int pe) { \ - roc_shmem_get_nbi_wg(dest, source, nelems, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_wg_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ - int nelem, int pe_root) { \ - roc_shmem_wg_broadcast(ctx, team, dest, source, nelem, pe_root); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_wg_alltoall( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ - int nelem) { \ - roc_shmem_wg_alltoall(ctx, team, dest, source, nelem); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_wg_fcollect( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ - int nelem) { \ - roc_shmem_wg_fcollect(ctx, team, dest, source, nelem); \ - } - -#define AMO_STANDARD_DEF_GEN(T, TNAME) \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_compare_swap( \ - roc_shmem_ctx_t ctx, T *dest, T cond, T value, int pe) { \ - return roc_shmem_atomic_compare_swap(ctx, dest, cond, value, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_compare_swap(T *dest, T cond, \ - T value, int pe) { \ - return roc_shmem_atomic_compare_swap(dest, cond, value, pe); \ - } \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_fetch_inc(roc_shmem_ctx_t ctx, \ - T *dest, int pe) { \ - return roc_shmem_atomic_fetch_inc(ctx, dest, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_fetch_inc(T *dest, int pe) { \ - return roc_shmem_atomic_fetch_inc(dest, pe); \ - } \ - 
__device__ void roc_shmem_ctx_##TNAME##_atomic_inc(roc_shmem_ctx_t ctx, \ - T *dest, int pe) { \ - roc_shmem_atomic_inc(ctx, dest, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_atomic_inc(T *dest, int pe) { \ - roc_shmem_atomic_inc(dest, pe); \ - } \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_fetch_add( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_add(ctx, dest, value, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_fetch_add(T *dest, T value, \ - int pe) { \ - return roc_shmem_atomic_fetch_add(dest, value, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_atomic_add( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - roc_shmem_atomic_add(ctx, dest, value, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_atomic_add(T *dest, T value, int pe) { \ - roc_shmem_atomic_add(dest, value, pe); \ - } - -#define AMO_EXTENDED_DEF_GEN(T, TNAME) \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_fetch(roc_shmem_ctx_t ctx, \ - T *source, int pe) { \ - return roc_shmem_atomic_fetch(ctx, source, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_fetch(T *source, int pe) { \ - return roc_shmem_atomic_fetch(source, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_atomic_set( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - roc_shmem_atomic_set(ctx, dest, value, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_atomic_set(T *dest, T value, int pe) { \ - roc_shmem_atomic_set(dest, value, pe); \ - } \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_swap(roc_shmem_ctx_t ctx, \ - T *dest, T value, int pe) { \ - return roc_shmem_atomic_swap(ctx, dest, value, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_swap(T *dest, T value, int pe) { \ - return roc_shmem_atomic_swap(dest, value, pe); \ - } - -#define AMO_BITWISE_DEF_GEN(T, TNAME) \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_fetch_and( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_and(ctx, dest, value, 
pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_fetch_and(T *dest, T value, \ - int pe) { \ - return roc_shmem_atomic_fetch_and(dest, value, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_atomic_and( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - roc_shmem_atomic_and(ctx, dest, value, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_atomic_and(T *dest, T value, int pe) { \ - roc_shmem_atomic_and(dest, value, pe); \ - } \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_fetch_or( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_or(ctx, dest, value, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_or(dest, value, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_atomic_or( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - roc_shmem_atomic_or(ctx, dest, value, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_atomic_or(T *dest, T value, int pe) { \ - roc_shmem_atomic_or(dest, value, pe); \ - } \ - __device__ T roc_shmem_ctx_##TNAME##_atomic_fetch_xor( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_xor(ctx, dest, value, pe); \ - } \ - __device__ T roc_shmem_##TNAME##_atomic_fetch_xor(T *dest, T value, \ - int pe) { \ - return roc_shmem_atomic_fetch_xor(dest, value, pe); \ - } \ - __device__ void roc_shmem_ctx_##TNAME##_atomic_xor( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - roc_shmem_atomic_xor(ctx, dest, value, pe); \ - } \ - __device__ void roc_shmem_##TNAME##_atomic_xor(T *dest, T value, int pe) { \ - roc_shmem_atomic_xor(dest, value, pe); \ - } - -#define WAIT_DEF_GEN(T, TNAME) \ - __device__ void roc_shmem_##TNAME##_wait_until(T *ivars, int cmp, \ - T val) { \ - roc_shmem_wait_until(ivars, cmp, val); \ - } \ - __device__ size_t roc_shmem_##TNAME##_wait_until_any(T *ivars, size_t nelems,\ - const int* status, \ - int cmp, \ - T val) { \ - return 
roc_shmem_wait_until_any(ivars, nelems, status, cmp, val); \ - } \ - __device__ void roc_shmem_##TNAME##_wait_until_all(T *ivars, size_t nelems,\ - const int* status, \ - int cmp, \ - T val) { \ - roc_shmem_wait_until_all(ivars, nelems, status, cmp, val); \ - } \ - __device__ size_t roc_shmem_##TNAME##_wait_until_some(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T val) { \ - return roc_shmem_wait_until_some(ivars, nelems, indices, status, cmp, \ - val); \ - } \ - __device__ size_t roc_shmem_##TNAME##_wait_until_any_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals) { \ - return roc_shmem_wait_until_any_vector(ivars, nelems, status, cmp, \ - vals); \ - } \ - __device__ void roc_shmem_##TNAME##_wait_until_all_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals) { \ - roc_shmem_wait_until_all_vector(ivars, nelems, status, cmp, vals); \ - } \ - __device__ size_t roc_shmem_##TNAME##_wait_until_some_vector(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status,\ - int cmp, \ - T* vals) { \ - return roc_shmem_wait_until_some_vector(ivars, nelems, indices, \ - status, cmp, vals); \ - } \ - __device__ int roc_shmem_##TNAME##_test(T *ivars, int cmp, T val) { \ - return roc_shmem_test(ivars, cmp, val); \ - } - -/****************************************************************************** - ************************* Macro Invocation Per Type ************************** - *****************************************************************************/ - -// clang-format off -INT_REDUCTION_GEN(int) -INT_REDUCTION_GEN(short) -INT_REDUCTION_GEN(long) -INT_REDUCTION_GEN(long long) -FLOAT_REDUCTION_GEN(float) -FLOAT_REDUCTION_GEN(double) -// long double reduction fails. hipcc/device may not support long double. -// so disable it for now. 
-// FLOAT_REDUCTION_GEN(long double) - -RMA_GEN(float) -RMA_GEN(double) -// RMA_GEN(long double) -RMA_GEN(char) -RMA_GEN(signed char) -RMA_GEN(short) -RMA_GEN(int) -RMA_GEN(long) -RMA_GEN(long long) -RMA_GEN(unsigned char) -RMA_GEN(unsigned short) -RMA_GEN(unsigned int) -RMA_GEN(unsigned long) -RMA_GEN(unsigned long long) - -AMO_STANDARD_GEN(int) -AMO_STANDARD_GEN(long) -AMO_STANDARD_GEN(long long) -AMO_STANDARD_GEN(unsigned int) -AMO_STANDARD_GEN(unsigned long) -AMO_STANDARD_GEN(unsigned long long) - -AMO_EXTENDED_GEN(float) -AMO_EXTENDED_GEN(double) -AMO_EXTENDED_GEN(int) -AMO_EXTENDED_GEN(long) -AMO_EXTENDED_GEN(long long) -AMO_EXTENDED_GEN(unsigned int) -AMO_EXTENDED_GEN(unsigned long) -AMO_EXTENDED_GEN(unsigned long long) - -AMO_BITWISE_GEN(unsigned int) -AMO_BITWISE_GEN(unsigned long) -AMO_BITWISE_GEN(unsigned long long) - -/* Supported synchronization types */ -WAIT_GEN(float) -WAIT_GEN(double) -// WAIT_GEN(long double) -WAIT_GEN(char) -WAIT_GEN(unsigned char) -WAIT_GEN(unsigned short) -WAIT_GEN(signed char) -WAIT_GEN(short) -WAIT_GEN(int) -WAIT_GEN(long) -WAIT_GEN(long long) -WAIT_GEN(unsigned int) -WAIT_GEN(unsigned long) -WAIT_GEN(unsigned long long) - -INT_REDUCTION_DEF_GEN(int, int) -INT_REDUCTION_DEF_GEN(short, short) -INT_REDUCTION_DEF_GEN(long, long) -INT_REDUCTION_DEF_GEN(long long, longlong) -FLOAT_REDUCTION_DEF_GEN(float, float) -FLOAT_REDUCTION_DEF_GEN(double, double) -// long double reduction fails. hipcc/device may not support long double. -// so disable it for now. 
-// FLOAT_REDUCTION_DEF_GEN(long double, longdouble) - -RMA_DEF_GEN(float, float) -RMA_DEF_GEN(double, double) -RMA_DEF_GEN(char, char) -// RMA_DEF_GEN(long double, longdouble) -RMA_DEF_GEN(signed char, schar) -RMA_DEF_GEN(short, short) -RMA_DEF_GEN(int, int) -RMA_DEF_GEN(long, long) -RMA_DEF_GEN(long long, longlong) -RMA_DEF_GEN(unsigned char, uchar) -RMA_DEF_GEN(unsigned short, ushort) -RMA_DEF_GEN(unsigned int, uint) -RMA_DEF_GEN(unsigned long, ulong) -RMA_DEF_GEN(unsigned long long, ulonglong) -RMA_DEF_GEN(int8_t, int8) -RMA_DEF_GEN(int16_t, int16) -RMA_DEF_GEN(int32_t, int32) -RMA_DEF_GEN(int64_t, int64) -RMA_DEF_GEN(uint8_t, uint8) -RMA_DEF_GEN(uint16_t, uint16) -RMA_DEF_GEN(uint32_t, uint32) -RMA_DEF_GEN(uint64_t, uint64) -RMA_DEF_GEN(size_t, size) -RMA_DEF_GEN(ptrdiff_t, ptrdiff) - -AMO_STANDARD_DEF_GEN(int, int) -AMO_STANDARD_DEF_GEN(long, long) -AMO_STANDARD_DEF_GEN(long long, longlong) -AMO_STANDARD_DEF_GEN(unsigned int, uint) -AMO_STANDARD_DEF_GEN(unsigned long, ulong) -AMO_STANDARD_DEF_GEN(unsigned long long, ulonglong) -AMO_STANDARD_DEF_GEN(int32_t, int32) -AMO_STANDARD_DEF_GEN(int64_t, int64) -AMO_STANDARD_DEF_GEN(uint32_t, uint32) -AMO_STANDARD_DEF_GEN(uint64_t, uint64) -AMO_STANDARD_DEF_GEN(size_t, size) -AMO_STANDARD_DEF_GEN(ptrdiff_t, ptrdiff) - -AMO_EXTENDED_DEF_GEN(float, float) -AMO_EXTENDED_DEF_GEN(double, double) -AMO_EXTENDED_DEF_GEN(int, int) -AMO_EXTENDED_DEF_GEN(long, long) -AMO_EXTENDED_DEF_GEN(long long, longlong) -AMO_EXTENDED_DEF_GEN(unsigned int, uint) -AMO_EXTENDED_DEF_GEN(unsigned long, ulong) -AMO_EXTENDED_DEF_GEN(unsigned long long, ulonglong) -AMO_EXTENDED_DEF_GEN(int32_t, int32) -AMO_EXTENDED_DEF_GEN(int64_t, int64) -AMO_EXTENDED_DEF_GEN(uint32_t, uint32) -AMO_EXTENDED_DEF_GEN(uint64_t, uint64) -AMO_EXTENDED_DEF_GEN(size_t, size) -AMO_EXTENDED_DEF_GEN(ptrdiff_t, ptrdiff) - -AMO_BITWISE_DEF_GEN(unsigned int, uint) -AMO_BITWISE_DEF_GEN(unsigned long, ulong) -AMO_BITWISE_DEF_GEN(unsigned long long, ulonglong) 
-AMO_BITWISE_DEF_GEN(int32_t, int32) -AMO_BITWISE_DEF_GEN(int64_t, int64) -AMO_BITWISE_DEF_GEN(uint32_t, uint32) -AMO_BITWISE_DEF_GEN(uint64_t, uint64) - -WAIT_DEF_GEN(float, float) -WAIT_DEF_GEN(double, double) -// WAIT_DEF_GEN(long double, longdouble) -WAIT_DEF_GEN(char, char) -WAIT_DEF_GEN(signed char, schar) -WAIT_DEF_GEN(short, short) -WAIT_DEF_GEN(int, int) -WAIT_DEF_GEN(long, long) -WAIT_DEF_GEN(long long, longlong) -WAIT_DEF_GEN(unsigned char, uchar) -WAIT_DEF_GEN(unsigned short, ushort) -WAIT_DEF_GEN(unsigned int, uint) -WAIT_DEF_GEN(unsigned long, ulong) -WAIT_DEF_GEN(unsigned long long, ulonglong) -// clang-format on - -} // namespace rocshmem diff --git a/src/roc_shmem.cpp b/src/rocshmem.cpp similarity index 51% rename from src/roc_shmem.cpp rename to src/rocshmem.cpp index e778bbeab5..93b331b4d2 100644 --- a/src/roc_shmem.cpp +++ b/src/rocshmem.cpp @@ -21,15 +21,15 @@ *****************************************************************************/ /** - * @file roc_shmem.cpp - * @brief Public header for ROC_SHMEM device and host libraries. + * @file rocshmem.cpp + * @brief Public header for rocSHMEM device and host libraries. * - * This is the implementation for the public roc_shmem.hpp header file. This + * This is the implementation for the public rocshmem.hpp header file. This * guy just extracts the transport from the opaque public handles and delegates * to the appropriate backend. 
*/ -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include #include @@ -53,18 +53,18 @@ namespace rocshmem { -#define VERIFY_BACKEND() \ - { \ - if (!backend) { \ - fprintf(stderr, "ROC_SHMEM_ERROR: %s in file '%s' in line %d\n", \ - "Call 'roc_shmem_init'", __FILE__, __LINE__); \ - abort(); \ - } \ +#define VERIFY_BACKEND() \ + { \ + if (!backend) { \ + fprintf(stderr, "ROCSHMEM_ERROR: %s in file '%s' in line %d\n", \ + "Call 'rocshmem_init'", __FILE__, __LINE__); \ + abort(); \ + } \ } Backend *backend = nullptr; -roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT; +rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; /** * Begin Host Code @@ -100,57 +100,57 @@ roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT; } } -[[maybe_unused]] __host__ void roc_shmem_init(MPI_Comm comm) { +[[maybe_unused]] __host__ void rocshmem_init(MPI_Comm comm) { library_init(comm); } -[[maybe_unused]] __host__ void roc_shmem_init_thread( +[[maybe_unused]] __host__ void rocshmem_init_thread( [[maybe_unused]] int required, int *provided, MPI_Comm comm) { library_init(comm); - roc_shmem_query_thread(provided); + rocshmem_query_thread(provided); } -[[maybe_unused]] __host__ int roc_shmem_my_pe() { +[[maybe_unused]] __host__ int rocshmem_my_pe() { MPIInitSingleton *s = s->GetInstance(); return s->get_rank(); } -[[maybe_unused]] __host__ int roc_shmem_n_pes() { +[[maybe_unused]] __host__ int rocshmem_n_pes() { MPIInitSingleton *s = s->GetInstance(); return s->get_nprocs(); } -[[maybe_unused]] __host__ void *roc_shmem_malloc(size_t size) { +[[maybe_unused]] __host__ void *rocshmem_malloc(size_t size) { VERIFY_BACKEND(); void *ptr; backend->heap.malloc(&ptr, size); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); return ptr; } -[[maybe_unused]] __host__ void roc_shmem_free(void *ptr) { +[[maybe_unused]] __host__ void rocshmem_free(void *ptr) { VERIFY_BACKEND(); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); backend->heap.free(ptr); } -[[maybe_unused]] __host__ void 
roc_shmem_reset_stats() { +[[maybe_unused]] __host__ void rocshmem_reset_stats() { VERIFY_BACKEND(); backend->reset_stats(); } -[[maybe_unused]] __host__ void roc_shmem_dump_stats() { +[[maybe_unused]] __host__ void rocshmem_dump_stats() { /** TODO: Many stats are backend independent! **/ VERIFY_BACKEND(); backend->dump_stats(); } -[[maybe_unused]] __host__ void roc_shmem_finalize() { +[[maybe_unused]] __host__ void rocshmem_finalize() { VERIFY_BACKEND(); /* @@ -173,15 +173,15 @@ roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT; delete MPIInitSingleton::GetInstance(); } -__host__ void roc_shmem_query_thread(int *provided) { +__host__ void rocshmem_query_thread(int *provided) { /* * Host-facing functions always support full * thread flexibility i.e. THREAD_MULTIPLE. */ - *provided = ROC_SHMEM_THREAD_MULTIPLE; + *provided = ROCSHMEM_THREAD_MULTIPLE; } -__host__ void roc_shmem_global_exit(int status) { +__host__ void rocshmem_global_exit(int status) { VERIFY_BACKEND(); backend->global_exit(status); } @@ -190,16 +190,16 @@ __host__ void roc_shmem_global_exit(int status) { ****************************** Teams Interface ******************************* *****************************************************************************/ -__host__ int roc_shmem_team_n_pes(roc_shmem_team_t team) { - if (team == ROC_SHMEM_TEAM_INVALID) { +__host__ int rocshmem_team_n_pes(rocshmem_team_t team) { + if (team == ROCSHMEM_TEAM_INVALID) { return -1; } else { return get_internal_team(team)->num_pes; } } -__host__ int roc_shmem_team_my_pe(roc_shmem_team_t team) { - if (team == ROC_SHMEM_TEAM_INVALID) { +__host__ int rocshmem_team_my_pe(rocshmem_team_t team) { + if (team == ROCSHMEM_TEAM_INVALID) { return -1; } else { return get_internal_team(team)->my_pe; @@ -218,13 +218,13 @@ __host__ inline int pe_in_active_set(int start, int stride, int size, int pe) { return translated_pe; } -__host__ int roc_shmem_team_split_strided( - roc_shmem_team_t parent_team, int start, int stride, int size, - 
[[maybe_unused]] const roc_shmem_team_config_t *config, - [[maybe_unused]] long config_mask, roc_shmem_team_t *new_team) { +__host__ int rocshmem_team_split_strided( + rocshmem_team_t parent_team, int start, int stride, int size, + [[maybe_unused]] const rocshmem_team_config_t *config, + [[maybe_unused]] long config_mask, rocshmem_team_t *new_team) { VERIFY_BACKEND(); - *new_team = ROC_SHMEM_TEAM_INVALID; + *new_team = ROCSHMEM_TEAM_INVALID; auto num_user_teams{backend->team_tracker.get_num_user_teams()}; auto max_num_teams{backend->team_tracker.get_max_num_teams()}; @@ -232,7 +232,7 @@ __host__ int roc_shmem_team_split_strided( abort(); } - if (parent_team == ROC_SHMEM_TEAM_INVALID) { + if (parent_team == ROCSHMEM_TEAM_INVALID) { return 0; // TODO(bpotter): is this the right return value? } @@ -287,7 +287,7 @@ __host__ int roc_shmem_team_split_strided( * TODO: are there any backend specific objects? */ if (my_pe_in_new_team < 0) { - *new_team = ROC_SHMEM_TEAM_INVALID; + *new_team = ROCSHMEM_TEAM_INVALID; } else { backend->create_new_team(parent_team_obj, team_info_wrt_parent, team_info_wrt_world, size, my_pe_in_new_team, @@ -301,8 +301,8 @@ __host__ int roc_shmem_team_split_strided( return 0; } -__host__ void roc_shmem_team_destroy(roc_shmem_team_t team) { - if (team == ROC_SHMEM_TEAM_INVALID || team == ROC_SHMEM_TEAM_WORLD) { +__host__ void rocshmem_team_destroy(rocshmem_team_t team) { + if (team == ROCSHMEM_TEAM_INVALID || team == ROCSHMEM_TEAM_WORLD) { /* Do nothing */ return; } @@ -312,8 +312,8 @@ __host__ void roc_shmem_team_destroy(roc_shmem_team_t team) { backend->team_destroy(team); } -__host__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, int src_pe, - roc_shmem_team_t dst_team) { +__host__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe, + rocshmem_team_t dst_team) { return team_translate_pe(src_team, src_pe, dst_team); } @@ -322,150 +322,150 @@ __host__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, int 
src_pe, *****************************************************************************/ template -__host__ void roc_shmem_put(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_put(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); +__host__ void rocshmem_put(T *dest, const T *source, size_t nelems, int pe) { + rocshmem_put(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } -__host__ void roc_shmem_putmem(void *dest, const void *source, size_t nelems, +__host__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_putmem(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_putmem(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } template -__host__ void roc_shmem_p(T *dest, T value, int pe) { - roc_shmem_p(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, pe); +__host__ void rocshmem_p(T *dest, T value, int pe) { + rocshmem_p(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } template -__host__ void roc_shmem_get(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_get(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); +__host__ void rocshmem_get(T *dest, const T *source, size_t nelems, int pe) { + rocshmem_get(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } -__host__ void roc_shmem_getmem(void *dest, const void *source, size_t nelems, +__host__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_getmem(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_getmem(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } template -__host__ T roc_shmem_g(const T *source, int pe) { - return roc_shmem_g(ROC_SHMEM_HOST_CTX_DEFAULT, source, pe); +__host__ T rocshmem_g(const T *source, int pe) { + return rocshmem_g(ROCSHMEM_HOST_CTX_DEFAULT, source, pe); } template -__host__ void roc_shmem_put_nbi(T *dest, const T *source, size_t nelems, +__host__ void rocshmem_put_nbi(T *dest, const T *source, size_t nelems, int pe) { - 
roc_shmem_put_nbi(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_put_nbi(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } -__host__ void roc_shmem_putmem_nbi(void *dest, const void *source, +__host__ void rocshmem_putmem_nbi(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_putmem_nbi(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, + rocshmem_ctx_putmem_nbi(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } template -__host__ void roc_shmem_get_nbi(T *dest, const T *source, size_t nelems, +__host__ void rocshmem_get_nbi(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_get_nbi(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_get_nbi(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } -__host__ void roc_shmem_getmem_nbi(void *dest, const void *source, +__host__ void rocshmem_getmem_nbi(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_getmem_nbi(ROC_SHMEM_HOST_CTX_DEFAULT, dest, source, nelems, + rocshmem_ctx_getmem_nbi(ROCSHMEM_HOST_CTX_DEFAULT, dest, source, nelems, pe); } template -__host__ T roc_shmem_atomic_fetch_add(T *dest, T val, int pe) { - return roc_shmem_atomic_fetch_add(ROC_SHMEM_HOST_CTX_DEFAULT, dest, val, pe); +__host__ T rocshmem_atomic_fetch_add(T *dest, T val, int pe) { + return rocshmem_atomic_fetch_add(ROCSHMEM_HOST_CTX_DEFAULT, dest, val, pe); } template -__host__ T roc_shmem_atomic_compare_swap(T *dest, T cond, T val, int pe) { - return roc_shmem_atomic_compare_swap(ROC_SHMEM_HOST_CTX_DEFAULT, dest, cond, +__host__ T rocshmem_atomic_compare_swap(T *dest, T cond, T val, int pe) { + return rocshmem_atomic_compare_swap(ROCSHMEM_HOST_CTX_DEFAULT, dest, cond, val, pe); } template -__host__ T roc_shmem_atomic_fetch_inc(T *dest, int pe) { - return roc_shmem_atomic_fetch_inc(ROC_SHMEM_HOST_CTX_DEFAULT, dest, pe); +__host__ T rocshmem_atomic_fetch_inc(T *dest, int pe) { + return rocshmem_atomic_fetch_inc(ROCSHMEM_HOST_CTX_DEFAULT, dest, pe); 
} template -__host__ T roc_shmem_atomic_fetch(T *source, int pe) { - return roc_shmem_atomic_fetch(ROC_SHMEM_HOST_CTX_DEFAULT, source, pe); +__host__ T rocshmem_atomic_fetch(T *source, int pe) { + return rocshmem_atomic_fetch(ROCSHMEM_HOST_CTX_DEFAULT, source, pe); } template -__host__ void roc_shmem_atomic_add(T *dest, T val, int pe) { - roc_shmem_atomic_add(ROC_SHMEM_HOST_CTX_DEFAULT, dest, val, pe); +__host__ void rocshmem_atomic_add(T *dest, T val, int pe) { + rocshmem_atomic_add(ROCSHMEM_HOST_CTX_DEFAULT, dest, val, pe); } template -__host__ void roc_shmem_atomic_inc(T *dest, int pe) { - roc_shmem_atomic_inc(ROC_SHMEM_HOST_CTX_DEFAULT, dest, pe); +__host__ void rocshmem_atomic_inc(T *dest, int pe) { + rocshmem_atomic_inc(ROCSHMEM_HOST_CTX_DEFAULT, dest, pe); } template -__host__ void roc_shmem_atomic_set(T *dest, T val, int pe) { - roc_shmem_atomic_set(ROC_SHMEM_HOST_CTX_DEFAULT, dest, val, pe); +__host__ void rocshmem_atomic_set(T *dest, T val, int pe) { + rocshmem_atomic_set(ROCSHMEM_HOST_CTX_DEFAULT, dest, val, pe); } template -__host__ T roc_shmem_atomic_swap(T *dest, T value, int pe) { - return roc_shmem_atomic_swap(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, pe); +__host__ T rocshmem_atomic_swap(T *dest, T value, int pe) { + return rocshmem_atomic_swap(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } template -__host__ T roc_shmem_atomic_fetch_and(T *dest, T value, int pe) { - return roc_shmem_atomic_fetch_and(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, +__host__ T rocshmem_atomic_fetch_and(T *dest, T value, int pe) { + return rocshmem_atomic_fetch_and(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } template -__host__ void roc_shmem_atomic_and(T *dest, T value, int pe) { - roc_shmem_atomic_and(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, pe); +__host__ void rocshmem_atomic_and(T *dest, T value, int pe) { + rocshmem_atomic_and(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } template -__host__ T roc_shmem_atomic_fetch_or(T *dest, T value, int pe) { - return 
roc_shmem_atomic_fetch_or(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, pe); +__host__ T rocshmem_atomic_fetch_or(T *dest, T value, int pe) { + return rocshmem_atomic_fetch_or(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } template -__host__ void roc_shmem_atomic_or(T *dest, T value, int pe) { - roc_shmem_atomic_or(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, pe); +__host__ void rocshmem_atomic_or(T *dest, T value, int pe) { + rocshmem_atomic_or(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } template -__host__ T roc_shmem_atomic_fetch_xor(T *dest, T value, int pe) { - return roc_shmem_atomic_fetch_xor(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, +__host__ T rocshmem_atomic_fetch_xor(T *dest, T value, int pe) { + return rocshmem_atomic_fetch_xor(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } template -__host__ void roc_shmem_atomic_xor(T *dest, T value, int pe) { - roc_shmem_atomic_xor(ROC_SHMEM_HOST_CTX_DEFAULT, dest, value, pe); +__host__ void rocshmem_atomic_xor(T *dest, T value, int pe) { + rocshmem_atomic_xor(ROCSHMEM_HOST_CTX_DEFAULT, dest, value, pe); } -__host__ void roc_shmem_fence() { - roc_shmem_ctx_fence(ROC_SHMEM_HOST_CTX_DEFAULT); +__host__ void rocshmem_fence() { + rocshmem_ctx_fence(ROCSHMEM_HOST_CTX_DEFAULT); } -__host__ void roc_shmem_quiet() { - roc_shmem_ctx_quiet(ROC_SHMEM_HOST_CTX_DEFAULT); +__host__ void rocshmem_quiet() { + rocshmem_ctx_quiet(ROCSHMEM_HOST_CTX_DEFAULT); } /****************************************************************************** ************************* Private Context Interfaces ************************* *****************************************************************************/ -__host__ Context *get_internal_ctx(roc_shmem_ctx_t ctx) { +__host__ Context *get_internal_ctx(rocshmem_ctx_t ctx) { return reinterpret_cast(ctx.ctx_opaque); } -__host__ int roc_shmem_ctx_create(int64_t options, roc_shmem_ctx_t *ctx) { - DPRINTF("Host function: roc_shmem_ctx_create\n"); +__host__ int rocshmem_ctx_create(int64_t options, rocshmem_ctx_t 
*ctx) { + DPRINTF("Host function: rocshmem_ctx_create\n"); void *phys_ctx; backend->ctx_create(options, &phys_ctx); @@ -480,8 +480,8 @@ __host__ int roc_shmem_ctx_create(int64_t options, roc_shmem_ctx_t *ctx) { return 0; } -__host__ void roc_shmem_ctx_destroy(roc_shmem_ctx_t ctx) { - DPRINTF("Host function: roc_shmem_ctx_destroy\n"); +__host__ void rocshmem_ctx_destroy(rocshmem_ctx_t ctx) { + DPRINTF("Host function: rocshmem_ctx_destroy\n"); /* TODO: Implicit quiet on this context */ @@ -493,348 +493,348 @@ __host__ void roc_shmem_ctx_destroy(roc_shmem_ctx_t ctx) { } template -__host__ void roc_shmem_put(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_put(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_put\n"); + DPRINTF("Host function: rocshmem_put\n"); get_internal_ctx(ctx)->put(dest, source, nelems, pe); } -__host__ void roc_shmem_ctx_putmem(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_ctx_putmem\n"); + DPRINTF("Host function: rocshmem_ctx_putmem\n"); get_internal_ctx(ctx)->putmem(dest, source, nelems, pe); } template -__host__ void roc_shmem_p(roc_shmem_ctx_t ctx, T *dest, T value, int pe) { - DPRINTF("Host function: roc_shmem_p\n"); +__host__ void rocshmem_p(rocshmem_ctx_t ctx, T *dest, T value, int pe) { + DPRINTF("Host function: rocshmem_p\n"); get_internal_ctx(ctx)->p(dest, value, pe); } template -__host__ void roc_shmem_get(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_get\n"); + DPRINTF("Host function: rocshmem_get\n"); get_internal_ctx(ctx)->get(dest, source, nelems, pe); } -__host__ void roc_shmem_ctx_getmem(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, 
const void *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_ctx_getmem\n"); + DPRINTF("Host function: rocshmem_ctx_getmem\n"); get_internal_ctx(ctx)->getmem(dest, source, nelems, pe); } template -__host__ T roc_shmem_g(roc_shmem_ctx_t ctx, const T *source, int pe) { - DPRINTF("Host function: roc_shmem_g\n"); +__host__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) { + DPRINTF("Host function: rocshmem_g\n"); return get_internal_ctx(ctx)->g(source, pe); } template -__host__ void roc_shmem_put_nbi(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_put_nbi\n"); + DPRINTF("Host function: rocshmem_put_nbi\n"); get_internal_ctx(ctx)->put_nbi(dest, source, nelems, pe); } -__host__ void roc_shmem_ctx_putmem_nbi(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_ctx_putmem_nbi\n"); + DPRINTF("Host function: rocshmem_ctx_putmem_nbi\n"); get_internal_ctx(ctx)->putmem_nbi(dest, source, nelems, pe); } template -__host__ void roc_shmem_get_nbi(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_get_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_get_nbi\n"); + DPRINTF("Host function: rocshmem_get_nbi\n"); get_internal_ctx(ctx)->get_nbi(dest, source, nelems, pe); } -__host__ void roc_shmem_ctx_getmem_nbi(roc_shmem_ctx_t ctx, void *dest, +__host__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - DPRINTF("Host function: roc_shmem_ctx_getmem_nbi\n"); + DPRINTF("Host function: rocshmem_ctx_getmem_nbi\n"); get_internal_ctx(ctx)->getmem_nbi(dest, source, nelems, pe); } template -__host__ T roc_shmem_atomic_fetch_add(roc_shmem_ctx_t ctx, T *dest, T val, 
+__host__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_fetch_add\n"); + DPRINTF("Host function: rocshmem_atomic_fetch_add\n"); return get_internal_ctx(ctx)->amo_fetch_add(dest, val, pe); } template -__host__ T roc_shmem_atomic_compare_swap(roc_shmem_ctx_t ctx, T *dest, T cond, +__host__ T rocshmem_atomic_compare_swap(rocshmem_ctx_t ctx, T *dest, T cond, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_compare_swap\n"); + DPRINTF("Host function: rocshmem_atomic_compare_swap\n"); return get_internal_ctx(ctx)->amo_fetch_cas(dest, val, cond, pe); } template -__host__ T roc_shmem_atomic_fetch_inc(roc_shmem_ctx_t ctx, T *dest, int pe) { - DPRINTF("Host function: roc_shmem_atomic_fetch_inc\n"); +__host__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, T *dest, int pe) { + DPRINTF("Host function: rocshmem_atomic_fetch_inc\n"); return get_internal_ctx(ctx)->amo_fetch_add(dest, 1, pe); } template -__host__ T roc_shmem_atomic_fetch(roc_shmem_ctx_t ctx, T *source, int pe) { - DPRINTF("Host function: roc_shmem_atomic_fetch\n"); +__host__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) { + DPRINTF("Host function: rocshmem_atomic_fetch\n"); return get_internal_ctx(ctx)->amo_fetch_add(source, 0, pe); } template -__host__ void roc_shmem_atomic_add(roc_shmem_ctx_t ctx, T *dest, T val, +__host__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_add\n"); + DPRINTF("Host function: rocshmem_atomic_add\n"); get_internal_ctx(ctx)->amo_add(dest, val, pe); } template -__host__ void roc_shmem_atomic_inc(roc_shmem_ctx_t ctx, T *dest, int pe) { - DPRINTF("Host function: roc_shmem_atomic_inc\n"); +__host__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) { + DPRINTF("Host function: rocshmem_atomic_inc\n"); get_internal_ctx(ctx)->amo_add(dest, 1, pe); } template -__host__ void roc_shmem_atomic_set(roc_shmem_ctx_t ctx, T 
*dest, T val, +__host__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_set\n"); + DPRINTF("Host function: rocshmem_atomic_set\n"); get_internal_ctx(ctx)->amo_set(dest, val, pe); } template -__host__ T roc_shmem_atomic_swap(roc_shmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_set\n"); +__host__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val, int pe) { + DPRINTF("Host function: rocshmem_atomic_set\n"); return get_internal_ctx(ctx)->amo_swap(dest, val, pe); } template -__host__ T roc_shmem_atomic_fetch_and(roc_shmem_ctx_t ctx, T *dest, T val, +__host__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_fetch_and\n"); + DPRINTF("Host function: rocshmem_atomic_fetch_and\n"); return get_internal_ctx(ctx)->amo_fetch_and(dest, val, pe); } template -__host__ void roc_shmem_atomic_and(roc_shmem_ctx_t ctx, T *dest, T val, +__host__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_and\n"); + DPRINTF("Host function: rocshmem_atomic_and\n"); get_internal_ctx(ctx)->amo_and(dest, val, pe); } template -__host__ T roc_shmem_atomic_fetch_or(roc_shmem_ctx_t ctx, T *dest, T val, +__host__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_fetch_or\n"); + DPRINTF("Host function: rocshmem_atomic_fetch_or\n"); return get_internal_ctx(ctx)->amo_fetch_or(dest, val, pe); } template -__host__ void roc_shmem_atomic_or(roc_shmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_or\n"); +__host__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val, int pe) { + DPRINTF("Host function: rocshmem_atomic_or\n"); get_internal_ctx(ctx)->amo_or(dest, val, pe); } template -__host__ T roc_shmem_atomic_fetch_xor(roc_shmem_ctx_t ctx, T *dest, T val, +__host__ T 
rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_fetch_xor\n"); + DPRINTF("Host function: rocshmem_atomic_fetch_xor\n"); return get_internal_ctx(ctx)->amo_fetch_xor(dest, val, pe); } template -__host__ void roc_shmem_atomic_xor(roc_shmem_ctx_t ctx, T *dest, T val, +__host__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - DPRINTF("Host function: roc_shmem_atomic_xor\n"); + DPRINTF("Host function: rocshmem_atomic_xor\n"); get_internal_ctx(ctx)->amo_xor(dest, val, pe); } -__host__ void roc_shmem_ctx_fence(roc_shmem_ctx_t ctx) { - DPRINTF("Host function: roc_shmem_ctx_fence\n"); +__host__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx) { + DPRINTF("Host function: rocshmem_ctx_fence\n"); get_internal_ctx(ctx)->fence(); } -__host__ void roc_shmem_ctx_quiet(roc_shmem_ctx_t ctx) { - DPRINTF("Host function: roc_shmem_ctx_quiet\n"); +__host__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) { + DPRINTF("Host function: rocshmem_ctx_quiet\n"); get_internal_ctx(ctx)->quiet(); } -__host__ void roc_shmem_barrier_all() { - DPRINTF("Host function: roc_shmem_barrier_all\n"); +__host__ void rocshmem_barrier_all() { + DPRINTF("Host function: rocshmem_barrier_all\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->barrier_all(); + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->barrier_all(); } -__host__ void roc_shmem_sync_all() { - DPRINTF("Host function: roc_shmem_sync_all\n"); +__host__ void rocshmem_sync_all() { + DPRINTF("Host function: rocshmem_sync_all\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->sync_all(); + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->sync_all(); } template -__host__ void roc_shmem_broadcast([[maybe_unused]] roc_shmem_ctx_t ctx, T *dest, +__host__ void rocshmem_broadcast([[maybe_unused]] rocshmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync) { - DPRINTF("Host function: roc_shmem_broadcast\n"); + 
DPRINTF("Host function: rocshmem_broadcast\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT) + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT) ->broadcast(dest, source, nelem, pe_root, pe_start, log_pe_stride, pe_size, p_sync); } template -__host__ void roc_shmem_broadcast([[maybe_unused]] roc_shmem_ctx_t ctx, - roc_shmem_team_t team, T *dest, +__host__ void rocshmem_broadcast([[maybe_unused]] rocshmem_ctx_t ctx, + rocshmem_team_t team, T *dest, const T *source, int nelem, int pe_root) { - DPRINTF("Host function: Team-based roc_shmem_broadcast\n"); + DPRINTF("Host function: Team-based rocshmem_broadcast\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT) + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT) ->broadcast(team, dest, source, nelem, pe_root); } -template -__host__ void roc_shmem_to_all([[maybe_unused]] roc_shmem_ctx_t ctx, T *dest, +template +__host__ void rocshmem_to_all([[maybe_unused]] rocshmem_ctx_t ctx, T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) { - DPRINTF("Host function: roc_shmem_to_all\n"); + DPRINTF("Host function: rocshmem_to_all\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT) + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT) ->to_all(dest, source, nreduce, PE_start, logPE_stride, PE_size, pWrk, pSync); } -template -__host__ int roc_shmem_reduce([[maybe_unused]] roc_shmem_ctx_t ctx, - roc_shmem_team_t team, T *dest, const T *source, +template +__host__ int rocshmem_reduce([[maybe_unused]] rocshmem_ctx_t ctx, + rocshmem_team_t team, T *dest, const T *source, int nreduce) { - DPRINTF("Host function: Team-based roc_shmem_reduce\n"); + DPRINTF("Host function: Team-based rocshmem_reduce\n"); - return get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT) + return get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT) ->reduce(team, dest, source, nreduce); } template -__host__ void roc_shmem_wait_until(T *ivars, int cmp, T val) { - DPRINTF("Host function: roc_shmem_wait_until\n"); +__host__ void 
rocshmem_wait_until(T *ivars, int cmp, T val) { + DPRINTF("Host function: rocshmem_wait_until\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->wait_until(ivars, cmp, val); + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->wait_until(ivars, cmp, val); } template -__host__ void roc_shmem_wait_until_all(T *ivars, size_t nelems, const int* status, +__host__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* status, int cmp, T val) { - DPRINTF("Host function: roc_shmem_wait_until_all\n"); + DPRINTF("Host function: rocshmem_wait_until_all\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->wait_until_all(ivars, + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->wait_until_all(ivars, nelems, status, cmp, val); } template -__host__ size_t roc_shmem_wait_until_any(T *ivars, size_t nelems, const int* status, +__host__ size_t rocshmem_wait_until_any(T *ivars, size_t nelems, const int* status, int cmp, T val) { - DPRINTF("Host function: roc_shmem_wait_until_any\n"); + DPRINTF("Host function: rocshmem_wait_until_any\n"); - return get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->wait_until_any(ivars, + return get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->wait_until_any(ivars, nelems, status, cmp, val); } template -__host__ size_t roc_shmem_wait_until_some(T *ivars, size_t nelems, size_t* indices, +__host__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indices, const int* status, int cmp, T val) { - DPRINTF("Host function: roc_shmem_wait_until_some\n"); + DPRINTF("Host function: rocshmem_wait_until_some\n"); - return get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->wait_until_some(ivars, nelems, + return get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->wait_until_some(ivars, nelems, indices, status, cmp, val); } template -__host__ size_t roc_shmem_wait_until_any_vector(T *ivars, size_t nelems, const int* status, +__host__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const int* status, int cmp, T* vals) { - DPRINTF("Host function: 
roc_shmem_wait_until_any_vector\n"); + DPRINTF("Host function: rocshmem_wait_until_any_vector\n"); - return get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->wait_until_any_vector(ivars, + return get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->wait_until_any_vector(ivars, nelems, status, cmp, vals); } template -__host__ void roc_shmem_wait_until_all_vector(T *ivars, size_t nelems, const int* status, +__host__ void rocshmem_wait_until_all_vector(T *ivars, size_t nelems, const int* status, int cmp, T* vals) { - DPRINTF("Host function: roc_shmem_wait_until_all_vector\n"); + DPRINTF("Host function: rocshmem_wait_until_all_vector\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->wait_until_all_vector(ivars, + get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->wait_until_all_vector(ivars, nelems, status, cmp, vals); } template -__host__ size_t roc_shmem_wait_until_some_vector(T *ivars, size_t nelems, +__host__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems, size_t* indices, const int* status, int cmp, T* vals) { - DPRINTF("Host function: roc_shmem_wait_until_some_vector\n"); + DPRINTF("Host function: rocshmem_wait_until_some_vector\n"); - return get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->wait_until_some_vector(ivars, + return get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->wait_until_some_vector(ivars, nelems, indices, status, cmp, vals); } template -__host__ int roc_shmem_test(T *ivars, int cmp, T val) { - DPRINTF("Host function: roc_shmem_testl\n"); +__host__ int rocshmem_test(T *ivars, int cmp, T val) { + DPRINTF("Host function: rocshmem_testl\n"); - return get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT)->test(ivars, cmp, val); + return get_internal_ctx(ROCSHMEM_HOST_CTX_DEFAULT)->test(ivars, cmp, val); } /** * Template generator for reductions **/ -#define REDUCTION_GEN(T, Op) \ - template __host__ void roc_shmem_to_all( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, int nreduce, \ - int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); \ - 
template __host__ int roc_shmem_reduce( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ +#define REDUCTION_GEN(T, Op) \ + template __host__ void rocshmem_to_all( \ + rocshmem_ctx_t ctx, T * dest, const T *source, int nreduce, \ + int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); \ + template __host__ int rocshmem_reduce( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ int nreduce); #define ARITH_REDUCTION_GEN(T) \ - REDUCTION_GEN(T, ROC_SHMEM_SUM) \ - REDUCTION_GEN(T, ROC_SHMEM_MIN) \ - REDUCTION_GEN(T, ROC_SHMEM_MAX) \ - REDUCTION_GEN(T, ROC_SHMEM_PROD) + REDUCTION_GEN(T, ROCSHMEM_SUM) \ + REDUCTION_GEN(T, ROCSHMEM_MIN) \ + REDUCTION_GEN(T, ROCSHMEM_MAX) \ + REDUCTION_GEN(T, ROCSHMEM_PROD) #define BITWISE_REDUCTION_GEN(T) \ - REDUCTION_GEN(T, ROC_SHMEM_OR) \ - REDUCTION_GEN(T, ROC_SHMEM_AND) \ - REDUCTION_GEN(T, ROC_SHMEM_XOR) + REDUCTION_GEN(T, ROCSHMEM_OR) \ + REDUCTION_GEN(T, ROCSHMEM_AND) \ + REDUCTION_GEN(T, ROCSHMEM_XOR) #define INT_REDUCTION_GEN(T) \ ARITH_REDUCTION_GEN(T) \ @@ -846,374 +846,374 @@ __host__ int roc_shmem_test(T *ivars, int cmp, T val) { * Declare templates for the required datatypes (for the compiler) **/ #define RMA_GEN(T) \ - template __host__ void roc_shmem_put( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __host__ void roc_shmem_put_nbi( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __host__ void roc_shmem_p(roc_shmem_ctx_t ctx, T * dest, \ + template __host__ void rocshmem_put( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __host__ void rocshmem_put_nbi( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __host__ void rocshmem_p(rocshmem_ctx_t ctx, T * dest, \ T value, int pe); \ - template __host__ void roc_shmem_get( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - 
template __host__ void roc_shmem_get_nbi( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ - template __host__ T roc_shmem_g(roc_shmem_ctx_t ctx, const T *source, \ + template __host__ void rocshmem_get( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __host__ void rocshmem_get_nbi( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __host__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, \ int pe); \ - template __host__ void roc_shmem_put(T * dest, const T *source, \ + template __host__ void rocshmem_put(T * dest, const T *source, \ size_t nelems, int pe); \ - template __host__ void roc_shmem_put_nbi(T * dest, const T *source, \ + template __host__ void rocshmem_put_nbi(T * dest, const T *source, \ size_t nelems, int pe); \ - template __host__ void roc_shmem_p(T * dest, T value, int pe); \ - template __host__ void roc_shmem_get(T * dest, const T *source, \ + template __host__ void rocshmem_p(T * dest, T value, int pe); \ + template __host__ void rocshmem_get(T * dest, const T *source, \ size_t nelems, int pe); \ - template __host__ void roc_shmem_get_nbi(T * dest, const T *source, \ + template __host__ void rocshmem_get_nbi(T * dest, const T *source, \ size_t nelems, int pe); \ - template __host__ T roc_shmem_g(const T *source, int pe); \ - template __host__ void roc_shmem_broadcast( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, int nelem, int pe_root, \ + template __host__ T rocshmem_g(const T *source, int pe); \ + template __host__ void rocshmem_broadcast( \ + rocshmem_ctx_t ctx, T * dest, const T *source, int nelem, int pe_root, \ int pe_start, int log_pe_stride, int pe_size, long *p_sync); \ - template __host__ void roc_shmem_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ + template __host__ void rocshmem_broadcast( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ int nelem, int 
pe_root); /** * Declare templates for the standard amo types */ -#define AMO_STANDARD_GEN(T) \ - template __host__ T roc_shmem_atomic_compare_swap( \ - roc_shmem_ctx_t ctx, T * dest, T cond, T value, int pe); \ - template __host__ T roc_shmem_atomic_compare_swap(T * dest, T cond, \ - T value, int pe); \ - template __host__ T roc_shmem_atomic_fetch_inc(roc_shmem_ctx_t ctx, \ - T * dest, int pe); \ - template __host__ T roc_shmem_atomic_fetch_inc(T * dest, int pe); \ - template __host__ void roc_shmem_atomic_inc(roc_shmem_ctx_t ctx, \ - T * dest, int pe); \ - template __host__ void roc_shmem_atomic_inc(T * dest, int pe); \ - template __host__ T roc_shmem_atomic_fetch_add( \ - roc_shmem_ctx_t ctx, T * dest, T value, int pe); \ - template __host__ T roc_shmem_atomic_fetch_add(T * dest, T value, \ - int pe); \ - template __host__ void roc_shmem_atomic_add(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __host__ void roc_shmem_atomic_add(T * dest, T value, int pe); +#define AMO_STANDARD_GEN(T) \ + template __host__ T rocshmem_atomic_compare_swap( \ + rocshmem_ctx_t ctx, T * dest, T cond, T value, int pe); \ + template __host__ T rocshmem_atomic_compare_swap(T * dest, T cond, \ + T value, int pe); \ + template __host__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, \ + T * dest, int pe); \ + template __host__ T rocshmem_atomic_fetch_inc(T * dest, int pe); \ + template __host__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, \ + T * dest, int pe); \ + template __host__ void rocshmem_atomic_inc(T * dest, int pe); \ + template __host__ T rocshmem_atomic_fetch_add( \ + rocshmem_ctx_t ctx, T * dest, T value, int pe); \ + template __host__ T rocshmem_atomic_fetch_add(T * dest, T value, \ + int pe); \ + template __host__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __host__ void rocshmem_atomic_add(T * dest, T value, int pe); /** * Declare templates for the extended amo types */ -#define AMO_EXTENDED_GEN(T) \ - 
template __host__ T roc_shmem_atomic_fetch(roc_shmem_ctx_t ctx, T * dest, \ - int pe); \ - template __host__ T roc_shmem_atomic_fetch(T * dest, int pe); \ - template __host__ void roc_shmem_atomic_set(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __host__ void roc_shmem_atomic_set(T * dest, T value, int pe); \ - template __host__ T roc_shmem_atomic_swap(roc_shmem_ctx_t ctx, T * dest, \ - T value, int pe); \ - template __host__ T roc_shmem_atomic_swap(T * dest, T value, int pe); +#define AMO_EXTENDED_GEN(T) \ + template __host__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T * dest, \ + int pe); \ + template __host__ T rocshmem_atomic_fetch(T * dest, int pe); \ + template __host__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __host__ void rocshmem_atomic_set(T * dest, T value, int pe); \ + template __host__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T * dest, \ + T value, int pe); \ + template __host__ T rocshmem_atomic_swap(T * dest, T value, int pe); /** * Declare templates for the bitwise amo types */ -#define AMO_BITWISE_GEN(T) \ - template __host__ T roc_shmem_atomic_fetch_and( \ - roc_shmem_ctx_t ctx, T * dest, T value, int pe); \ - template __host__ T roc_shmem_atomic_fetch_and(T * dest, T value, \ - int pe); \ - template __host__ void roc_shmem_atomic_and(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __host__ void roc_shmem_atomic_and(T * dest, T value, int pe); \ - template __host__ T roc_shmem_atomic_fetch_or(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __host__ T roc_shmem_atomic_fetch_or(T * dest, T value, int pe); \ - template __host__ void roc_shmem_atomic_or(roc_shmem_ctx_t ctx, T * dest, \ - T value, int pe); \ - template __host__ void roc_shmem_atomic_or(T * dest, T value, int pe); \ - template __host__ T roc_shmem_atomic_fetch_xor( \ - roc_shmem_ctx_t ctx, T * dest, T value, int pe); \ - template __host__ T roc_shmem_atomic_fetch_xor(T * dest, T 
value, \ - int pe); \ - template __host__ void roc_shmem_atomic_xor(roc_shmem_ctx_t ctx, \ - T * dest, T value, int pe); \ - template __host__ void roc_shmem_atomic_xor(T * dest, T value, int pe); +#define AMO_BITWISE_GEN(T) \ + template __host__ T rocshmem_atomic_fetch_and( \ + rocshmem_ctx_t ctx, T * dest, T value, int pe); \ + template __host__ T rocshmem_atomic_fetch_and(T * dest, T value, \ + int pe); \ + template __host__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __host__ void rocshmem_atomic_and(T * dest, T value, int pe); \ + template __host__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe);\ + template __host__ T rocshmem_atomic_fetch_or(T * dest, T value, int pe); \ + template __host__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T * dest, \ + T value, int pe); \ + template __host__ void rocshmem_atomic_or(T * dest, T value, int pe); \ + template __host__ T rocshmem_atomic_fetch_xor( \ + rocshmem_ctx_t ctx, T * dest, T value, int pe); \ + template __host__ T rocshmem_atomic_fetch_xor(T * dest, T value, \ + int pe); \ + template __host__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __host__ void rocshmem_atomic_xor(T * dest, T value, int pe); /** * Declare templates for the wait types */ -#define WAIT_GEN(T) \ - template __host__ void roc_shmem_wait_until(T *ivars, int cmp, \ - T val); \ - template __host__ int roc_shmem_test(T *ivars, int cmp, T val); \ - template __host__ void Context::wait_until(T *ivars, int cmp, \ - T val); \ - template __host__ size_t roc_shmem_wait_until_any(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T val); \ - template __host__ void roc_shmem_wait_until_all(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T val); \ - template __host__ size_t roc_shmem_wait_until_some(T *ivars, size_t nelems,\ - size_t* indices, const int* status, \ - int cmp, T val); \ - template __host__ size_t 
roc_shmem_wait_until_any_vector(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T* vals); \ - template __host__ void roc_shmem_wait_until_all_vector(T *ivars, \ - size_t nelems, const int* status, \ - int cmp, T* vals); \ - template __host__ size_t roc_shmem_wait_until_some_vector(T *ivars, \ - size_t nelems, size_t* indices, \ - const int* status, int cmp, \ - T* vals); \ +#define WAIT_GEN(T) \ + template __host__ void rocshmem_wait_until(T *ivars, int cmp, \ + T val); \ + template __host__ int rocshmem_test(T *ivars, int cmp, T val); \ + template __host__ void Context::wait_until(T *ivars, int cmp, \ + T val); \ + template __host__ size_t rocshmem_wait_until_any(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T val); \ + template __host__ void rocshmem_wait_until_all(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T val); \ + template __host__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems,\ + size_t* indices, const int* status, \ + int cmp, T val); \ + template __host__ size_t rocshmem_wait_until_any_vector(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T* vals); \ + template __host__ void rocshmem_wait_until_all_vector(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T* vals); \ + template __host__ size_t rocshmem_wait_until_some_vector(T *ivars, \ + size_t nelems, size_t* indices, \ + const int* status, int cmp, \ + T* vals); \ template __host__ int Context::test(T *ivars, int cmp, T val); /** * Define APIs to call the template functions **/ -#define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ - __host__ void roc_shmem_ctx_##TNAME##_##Op_API##_to_all( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nreduce, \ - int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) { \ - roc_shmem_to_all(ctx, dest, source, nreduce, PE_start, \ - logPE_stride, PE_size, pWrk, pSync); \ - } \ - __host__ int roc_shmem_ctx_##TNAME##_##Op_API##_reduce( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t 
team, T *dest, const T *source, \ - int nreduce) { \ - return roc_shmem_reduce(ctx, team, dest, source, nreduce); \ +#define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ + __host__ void rocshmem_ctx_##TNAME##_##Op_API##_to_all( \ + rocshmem_ctx_t ctx, T *dest, const T *source, int nreduce, \ + int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) { \ + rocshmem_to_all(ctx, dest, source, nreduce, PE_start, \ + logPE_stride, PE_size, pWrk, pSync); \ + } \ + __host__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ + int nreduce) { \ + return rocshmem_reduce(ctx, team, dest, source, nreduce); \ } -#define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ - REDUCTION_DEF_GEN(T, TNAME, sum, ROC_SHMEM_SUM) \ - REDUCTION_DEF_GEN(T, TNAME, min, ROC_SHMEM_MIN) \ - REDUCTION_DEF_GEN(T, TNAME, max, ROC_SHMEM_MAX) \ - REDUCTION_DEF_GEN(T, TNAME, prod, ROC_SHMEM_PROD) +#define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ + REDUCTION_DEF_GEN(T, TNAME, sum, ROCSHMEM_SUM) \ + REDUCTION_DEF_GEN(T, TNAME, min, ROCSHMEM_MIN) \ + REDUCTION_DEF_GEN(T, TNAME, max, ROCSHMEM_MAX) \ + REDUCTION_DEF_GEN(T, TNAME, prod, ROCSHMEM_PROD) -#define BITWISE_REDUCTION_DEF_GEN(T, TNAME) \ - REDUCTION_DEF_GEN(T, TNAME, or, ROC_SHMEM_OR) \ - REDUCTION_DEF_GEN(T, TNAME, and, ROC_SHMEM_AND) \ - REDUCTION_DEF_GEN(T, TNAME, xor, ROC_SHMEM_XOR) +#define BITWISE_REDUCTION_DEF_GEN(T, TNAME) \ + REDUCTION_DEF_GEN(T, TNAME, or, ROCSHMEM_OR) \ + REDUCTION_DEF_GEN(T, TNAME, and, ROCSHMEM_AND) \ + REDUCTION_DEF_GEN(T, TNAME, xor, ROCSHMEM_XOR) -#define INT_REDUCTION_DEF_GEN(T, TNAME) \ - ARITH_REDUCTION_DEF_GEN(T, TNAME) \ +#define INT_REDUCTION_DEF_GEN(T, TNAME) \ + ARITH_REDUCTION_DEF_GEN(T, TNAME) \ BITWISE_REDUCTION_DEF_GEN(T, TNAME) #define FLOAT_REDUCTION_DEF_GEN(T, TNAME) ARITH_REDUCTION_DEF_GEN(T, TNAME) #define RMA_DEF_GEN(T, TNAME) \ - __host__ void roc_shmem_ctx_##TNAME##_put( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int 
pe) { \ - roc_shmem_put(ctx, dest, source, nelems, pe); \ + __host__ void rocshmem_ctx_##TNAME##_put( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put(ctx, dest, source, nelems, pe); \ } \ - __host__ void roc_shmem_ctx_##TNAME##_put_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_put_nbi(ctx, dest, source, nelems, pe); \ + __host__ void rocshmem_ctx_##TNAME##_put_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put_nbi(ctx, dest, source, nelems, pe); \ } \ - __host__ void roc_shmem_ctx_##TNAME##_p(roc_shmem_ctx_t ctx, T *dest, \ + __host__ void rocshmem_ctx_##TNAME##_p(rocshmem_ctx_t ctx, T *dest, \ T value, int pe) { \ - roc_shmem_p(ctx, dest, value, pe); \ + rocshmem_p(ctx, dest, value, pe); \ } \ - __host__ void roc_shmem_ctx_##TNAME##_get( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_get(ctx, dest, source, nelems, pe); \ + __host__ void rocshmem_ctx_##TNAME##_get( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get(ctx, dest, source, nelems, pe); \ } \ - __host__ void roc_shmem_ctx_##TNAME##_get_nbi( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ - roc_shmem_get_nbi(ctx, dest, source, nelems, pe); \ + __host__ void rocshmem_ctx_##TNAME##_get_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get_nbi(ctx, dest, source, nelems, pe); \ } \ - __host__ T roc_shmem_ctx_##TNAME##_g(roc_shmem_ctx_t ctx, const T *source, \ + __host__ T rocshmem_ctx_##TNAME##_g(rocshmem_ctx_t ctx, const T *source, \ int pe) { \ - return roc_shmem_g(ctx, source, pe); \ + return rocshmem_g(ctx, source, pe); \ } \ - __host__ void roc_shmem_##TNAME##_put(T *dest, const T *source, \ + __host__ void rocshmem_##TNAME##_put(T *dest, const T *source, \ size_t nelems, int pe) { \ - roc_shmem_put(dest, source, 
nelems, pe); \ + rocshmem_put(dest, source, nelems, pe); \ } \ - __host__ void roc_shmem_##TNAME##_put_nbi(T *dest, const T *source, \ + __host__ void rocshmem_##TNAME##_put_nbi(T *dest, const T *source, \ size_t nelems, int pe) { \ - roc_shmem_put_nbi(dest, source, nelems, pe); \ + rocshmem_put_nbi(dest, source, nelems, pe); \ } \ - __host__ void roc_shmem_##TNAME##_p(T *dest, T value, int pe) { \ - roc_shmem_p(dest, value, pe); \ + __host__ void rocshmem_##TNAME##_p(T *dest, T value, int pe) { \ + rocshmem_p(dest, value, pe); \ } \ - __host__ void roc_shmem_##TNAME##_get(T *dest, const T *source, \ + __host__ void rocshmem_##TNAME##_get(T *dest, const T *source, \ size_t nelems, int pe) { \ - roc_shmem_get(dest, source, nelems, pe); \ + rocshmem_get(dest, source, nelems, pe); \ } \ - __host__ void roc_shmem_##TNAME##_get_nbi(T *dest, const T *source, \ + __host__ void rocshmem_##TNAME##_get_nbi(T *dest, const T *source, \ size_t nelems, int pe) { \ - roc_shmem_get_nbi(dest, source, nelems, pe); \ + rocshmem_get_nbi(dest, source, nelems, pe); \ } \ - __host__ T roc_shmem_##TNAME##_g(const T *source, int pe) { \ - return roc_shmem_g(source, pe); \ + __host__ T rocshmem_##TNAME##_g(const T *source, int pe) { \ + return rocshmem_g(source, pe); \ } \ - __host__ void roc_shmem_ctx_##TNAME##_broadcast( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ + __host__ void rocshmem_ctx_##TNAME##_broadcast( \ + rocshmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ int pe_start, int log_pe_stride, int pe_size, long *p_sync) { \ - roc_shmem_broadcast(ctx, dest, source, nelem, pe_root, pe_start, \ + rocshmem_broadcast(ctx, dest, source, nelem, pe_root, pe_start, \ log_pe_stride, pe_size, p_sync); \ } \ - __host__ void roc_shmem_ctx_##TNAME##_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + __host__ void rocshmem_ctx_##TNAME##_broadcast( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const 
T *source, \ int nelem, int pe_root) { \ - roc_shmem_broadcast(ctx, team, dest, source, nelem, pe_root); \ + rocshmem_broadcast(ctx, team, dest, source, nelem, pe_root); \ } -#define AMO_STANDARD_DEF_GEN(T, TNAME) \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_compare_swap( \ - roc_shmem_ctx_t ctx, T *dest, T cond, T value, int pe) { \ - return roc_shmem_atomic_compare_swap(ctx, dest, cond, value, pe); \ - } \ - __host__ T roc_shmem_##TNAME##_atomic_compare_swap(T *dest, T cond, T value, \ - int pe) { \ - return roc_shmem_atomic_compare_swap(dest, cond, value, pe); \ - } \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_inc(roc_shmem_ctx_t ctx, \ - T *dest, int pe) { \ - return roc_shmem_atomic_fetch_inc(ctx, dest, pe); \ - } \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_inc(T *dest, int pe) { \ - return roc_shmem_atomic_fetch_inc(dest, pe); \ - } \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_inc(roc_shmem_ctx_t ctx, \ - T *dest, int pe) { \ - roc_shmem_atomic_inc(ctx, dest, pe); \ - } \ - __host__ void roc_shmem_##TNAME##_atomic_inc(T *dest, int pe) { \ - roc_shmem_atomic_inc(dest, pe); \ - } \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_add( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_add(ctx, dest, value, pe); \ - } \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_add(T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_add(dest, value, pe); \ - } \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_add(roc_shmem_ctx_t ctx, \ +#define AMO_STANDARD_DEF_GEN(T, TNAME) \ + __host__ T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ + rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe) { \ + return rocshmem_atomic_compare_swap(ctx, dest, cond, value, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_compare_swap(T *dest, T cond, T value, \ + int pe) { \ + return rocshmem_atomic_compare_swap(dest, cond, value, pe); \ + } \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_inc(rocshmem_ctx_t ctx, \ + T *dest, 
int pe) { \ + return rocshmem_atomic_fetch_inc(ctx, dest, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, int pe) { \ + return rocshmem_atomic_fetch_inc(dest, pe); \ + } \ + __host__ void rocshmem_ctx_##TNAME##_atomic_inc(rocshmem_ctx_t ctx, \ + T *dest, int pe) { \ + rocshmem_atomic_inc(ctx, dest, pe); \ + } \ + __host__ void rocshmem_##TNAME##_atomic_inc(T *dest, int pe) { \ + rocshmem_atomic_inc(dest, pe); \ + } \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_add(ctx, dest, value, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_fetch_add(T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_add(dest, value, pe); \ + } \ + __host__ void rocshmem_ctx_##TNAME##_atomic_add(rocshmem_ctx_t ctx, \ T *dest, T value, int pe) { \ - roc_shmem_atomic_add(ctx, dest, value, pe); \ - } \ - __host__ void roc_shmem_##TNAME##_atomic_add(T *dest, T value, int pe) { \ - roc_shmem_atomic_add(dest, value, pe); \ + rocshmem_atomic_add(ctx, dest, value, pe); \ + } \ + __host__ void rocshmem_##TNAME##_atomic_add(T *dest, T value, int pe) { \ + rocshmem_atomic_add(dest, value, pe); \ } -#define AMO_EXTENDED_DEF_GEN(T, TNAME) \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch(roc_shmem_ctx_t ctx, \ - T *source, int pe) { \ - return roc_shmem_atomic_fetch(ctx, source, pe); \ - } \ - __host__ T roc_shmem_##TNAME##_atomic_fetch(T *source, int pe) { \ - return roc_shmem_atomic_fetch(source, pe); \ - } \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_set(roc_shmem_ctx_t ctx, \ - T *dest, T value, int pe) { \ - roc_shmem_atomic_set(ctx, dest, value, pe); \ - } \ - __host__ void roc_shmem_##TNAME##_atomic_set(T *dest, T value, int pe) { \ - roc_shmem_atomic_set(dest, value, pe); \ - } \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_swap(roc_shmem_ctx_t ctx, T *dest, \ - T value, int pe) { \ - return roc_shmem_atomic_swap(ctx, dest, value, pe); \ - } \ - __host__ T 
roc_shmem_##TNAME##_atomic_swap(T *dest, T value, int pe) { \ - return roc_shmem_atomic_swap(dest, value, pe); \ +#define AMO_EXTENDED_DEF_GEN(T, TNAME) \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch(rocshmem_ctx_t ctx, \ + T *source, int pe) { \ + return rocshmem_atomic_fetch(ctx, source, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_fetch(T *source, int pe) { \ + return rocshmem_atomic_fetch(source, pe); \ + } \ + __host__ void rocshmem_ctx_##TNAME##_atomic_set(rocshmem_ctx_t ctx, \ + T *dest, T value, int pe) {\ + rocshmem_atomic_set(ctx, dest, value, pe); \ + } \ + __host__ void rocshmem_##TNAME##_atomic_set(T *dest, T value, int pe) { \ + rocshmem_atomic_set(dest, value, pe); \ + } \ + __host__ T rocshmem_ctx_##TNAME##_atomic_swap(rocshmem_ctx_t ctx, T *dest, \ + T value, int pe) { \ + return rocshmem_atomic_swap(ctx, dest, value, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_swap(T *dest, T value, int pe) { \ + return rocshmem_atomic_swap(dest, value, pe); \ } -#define AMO_BITWISE_DEF_GEN(T, TNAME) \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_and( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_and(ctx, dest, value, pe); \ - } \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_and(T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_and(dest, value, pe); \ - } \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_and(roc_shmem_ctx_t ctx, \ - T *dest, T value, int pe) { \ - roc_shmem_atomic_and(ctx, dest, value, pe); \ - } \ - __host__ void roc_shmem_##TNAME##_atomic_and(T *dest, T value, int pe) { \ - roc_shmem_atomic_and(dest, value, pe); \ - } \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_or( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_or(ctx, dest, value, pe); \ - } \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_or(dest, value, pe); \ - } \ - __host__ void 
roc_shmem_ctx_##TNAME##_atomic_or(roc_shmem_ctx_t ctx, \ - T *dest, T value, int pe) { \ - roc_shmem_atomic_or(ctx, dest, value, pe); \ - } \ - __host__ void roc_shmem_##TNAME##_atomic_or(T *dest, T value, int pe) { \ - roc_shmem_atomic_or(dest, value, pe); \ - } \ - __host__ T roc_shmem_ctx_##TNAME##_atomic_fetch_xor( \ - roc_shmem_ctx_t ctx, T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_xor(ctx, dest, value, pe); \ - } \ - __host__ T roc_shmem_##TNAME##_atomic_fetch_xor(T *dest, T value, int pe) { \ - return roc_shmem_atomic_fetch_xor(dest, value, pe); \ - } \ - __host__ void roc_shmem_ctx_##TNAME##_atomic_xor(roc_shmem_ctx_t ctx, \ - T *dest, T value, int pe) { \ - roc_shmem_atomic_xor(ctx, dest, value, pe); \ - } \ - __host__ void roc_shmem_##TNAME##_atomic_xor(T *dest, T value, int pe) { \ - roc_shmem_atomic_xor(dest, value, pe); \ +#define AMO_BITWISE_DEF_GEN(T, TNAME) \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_and(ctx, dest, value, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_fetch_and(T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_and(dest, value, pe); \ + } \ + __host__ void rocshmem_ctx_##TNAME##_atomic_and(rocshmem_ctx_t ctx, \ + T *dest, T value, int pe) {\ + rocshmem_atomic_and(ctx, dest, value, pe); \ + } \ + __host__ void rocshmem_##TNAME##_atomic_and(T *dest, T value, int pe) { \ + rocshmem_atomic_and(dest, value, pe); \ + } \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_or( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_or(ctx, dest, value, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_or(dest, value, pe); \ + } \ + __host__ void rocshmem_ctx_##TNAME##_atomic_or(rocshmem_ctx_t ctx, \ + T *dest, T value, int pe) { \ + rocshmem_atomic_or(ctx, dest, value, pe); \ + } \ + __host__ void 
rocshmem_##TNAME##_atomic_or(T *dest, T value, int pe) { \ + rocshmem_atomic_or(dest, value, pe); \ + } \ + __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_xor(ctx, dest, value, pe); \ + } \ + __host__ T rocshmem_##TNAME##_atomic_fetch_xor(T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_xor(dest, value, pe); \ + } \ + __host__ void rocshmem_ctx_##TNAME##_atomic_xor(rocshmem_ctx_t ctx, \ + T *dest, T value, int pe) {\ + rocshmem_atomic_xor(ctx, dest, value, pe); \ + } \ + __host__ void rocshmem_##TNAME##_atomic_xor(T *dest, T value, int pe) { \ + rocshmem_atomic_xor(dest, value, pe); \ } -#define WAIT_DEF_GEN(T, TNAME) \ - __host__ void roc_shmem_##TNAME##_wait_until(T *ivars, int cmp, \ - T val) { \ - roc_shmem_wait_until(ivars, cmp, val); \ - } \ - __host__ size_t roc_shmem_##TNAME##_wait_until_any(T *ivars, size_t nelems,\ - const int* status, \ - int cmp, \ - T val) { \ - return roc_shmem_wait_until_any(ivars, nelems, status, cmp, val); \ - } \ - __host__ void roc_shmem_##TNAME##_wait_until_all(T *ivars, size_t nelems, \ - const int* status, \ - int cmp, \ - T val) { \ - roc_shmem_wait_until_all(ivars, nelems, status, cmp, val); \ - } \ - __host__ size_t roc_shmem_##TNAME##_wait_until_some(T *ivars, size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T val) { \ - return roc_shmem_wait_until_some(ivars, nelems, indices, status, cmp, val); \ - } \ - __host__ size_t roc_shmem_##TNAME##_wait_until_any_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals) { \ - return roc_shmem_wait_until_any_vector(ivars, nelems, status, cmp, \ - vals); \ - } \ - __host__ void roc_shmem_##TNAME##_wait_until_all_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals) { \ - roc_shmem_wait_until_all_vector(ivars, nelems, status, cmp, vals); \ - } \ - __host__ size_t 
roc_shmem_##TNAME##_wait_until_some_vector(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status,\ - int cmp, \ - T* vals) { \ - return roc_shmem_wait_until_some_vector(ivars, nelems, indices, \ - status, cmp, vals); \ - } \ - __host__ int roc_shmem_##TNAME##_test(T *ivars, int cmp, T val) { \ - return roc_shmem_test(ivars, cmp, val); \ +#define WAIT_DEF_GEN(T, TNAME) \ + __host__ void rocshmem_##TNAME##_wait_until(T *ivars, int cmp, \ + T val) { \ + rocshmem_wait_until(ivars, cmp, val); \ + } \ + __host__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, size_t nelems, \ + const int* status, \ + int cmp, \ + T val) { \ + return rocshmem_wait_until_any(ivars, nelems, status, cmp, val); \ + } \ + __host__ void rocshmem_##TNAME##_wait_until_all(T *ivars, size_t nelems, \ + const int* status, \ + int cmp, \ + T val) { \ + rocshmem_wait_until_all(ivars, nelems, status, cmp, val); \ + } \ + __host__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, size_t nelems, \ + size_t* indices, \ + const int* status, \ + int cmp, \ + T val) { \ + return rocshmem_wait_until_some(ivars, nelems, indices, status, cmp, val); \ + } \ + __host__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ + size_t nelems, \ + const int* status, \ + int cmp, \ + T* vals) { \ + return rocshmem_wait_until_any_vector(ivars, nelems, status, cmp, \ + vals); \ + } \ + __host__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ + size_t nelems, \ + const int* status, \ + int cmp, \ + T* vals) { \ + rocshmem_wait_until_all_vector(ivars, nelems, status, cmp, vals); \ + } \ + __host__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ + size_t nelems, \ + size_t* indices, \ + const int* status, \ + int cmp, \ + T* vals) { \ + return rocshmem_wait_until_some_vector(ivars, nelems, indices, \ + status, cmp, vals); \ + } \ + __host__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val) { \ + return rocshmem_test(ivars, cmp, val); \ } 
/****************************************************************************** diff --git a/src/roc_shmem_calc.hpp b/src/rocshmem_calc.hpp similarity index 87% rename from src/roc_shmem_calc.hpp rename to src/rocshmem_calc.hpp index 5420cde3ea..d8d99a3e06 100644 --- a/src/roc_shmem_calc.hpp +++ b/src/rocshmem_calc.hpp @@ -20,14 +20,14 @@ * IN THE SOFTWARE. *****************************************************************************/ -#ifndef LIBRARY_SRC_ROC_SHMEM_CALC_HPP_ -#define LIBRARY_SRC_ROC_SHMEM_CALC_HPP_ +#ifndef LIBRARY_SRC_ROCSHMEM_CALC_HPP_ +#define LIBRARY_SRC_ROCSHMEM_CALC_HPP_ namespace rocshmem { // clang-format off NOWARN(-Wunused-parameter, -template +template struct OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { @@ -41,7 +41,7 @@ struct OpWrap { ************************** TEMPLATE SPECIALIZATIONS ************************** *****************************************************************************/ template <> -struct OpWrap { +struct OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { dst[i] += src[i]; @@ -49,7 +49,7 @@ struct OpWrap { }; template <> -struct OpWrap { +struct OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { dst[i] = max(dst[i], src[i]); @@ -57,7 +57,7 @@ struct OpWrap { }; template <> -struct OpWrap { +struct OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { dst[i] = min(dst[i], src[i]); @@ -65,7 +65,7 @@ struct OpWrap { }; template <> -struct OpWrap { +struct OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { dst[i] *= src[i]; @@ -73,7 +73,7 @@ struct OpWrap { }; template <> -struct OpWrap { +struct OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { dst[i] &= src[i]; @@ -81,7 +81,7 @@ struct OpWrap { }; template <> -struct OpWrap { +struct OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { dst[i] |= src[i]; @@ -89,7 +89,7 @@ struct OpWrap { }; template <> -struct OpWrap { +struct 
OpWrap { template __device__ static void Calc(T *src, T *dst, int i) { dst[i] ^= src[i]; @@ -97,4 +97,4 @@ struct OpWrap { }; } -#endif // LIBRARY_SRC_ROC_SHMEM_CALC_HPP_ +#endif // LIBRARY_SRC_ROCSHMEM_CALC_HPP_ diff --git a/src/rocshmem_gpu.cpp b/src/rocshmem_gpu.cpp new file mode 100644 index 0000000000..57e62f5ea0 --- /dev/null +++ b/src/rocshmem_gpu.cpp @@ -0,0 +1,1540 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +/** + * @file rocshmem.cpp + * @brief Public header for rocSHMEM device and host libraries. + * + * This is the implementation for the public rocshmem.hpp header file. This + * guy just extracts the transport from the opaque public handles and delegates + * to the appropriate backend. 
+ * + * The device-side delegation is nasty because we can't use polymorphism with + * our current shader compiler stack. Maybe one day..... + * + * TODO: Could probably autogenerate many of these functions from macros. + * + * TODO: Support runtime backend detection. + * + */ + +#include + +#include + +#include "config.h" // NOLINT(build/include_subdir) +#include "rocshmem/rocshmem.hpp" +#include "backend_bc.hpp" +#include "context_incl.hpp" +#include "team.hpp" +#include "templates.hpp" +#include "util.hpp" + +#ifdef USE_GPU_IB +#include "gpu_ib/context_ib_tmpl_device.hpp" +#elif defined(USE_RO) +#include "reverse_offload/context_ro_tmpl_device.hpp" +#else +#include "ipc/context_ipc_tmpl_device.hpp" +#endif + +/****************************************************************************** + **************************** Device Vars And Init **************************** + *****************************************************************************/ + +namespace rocshmem { + +__device__ __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT{}; + +__constant__ Backend *device_backend_proxy; + +__device__ void rocshmem_wg_init() { + int provided; + + /* + * Non-threaded init is allowed to select any thread mode, so don't worry + * if provided is different. 
+ */ + rocshmem_wg_init_thread(ROCSHMEM_THREAD_WG_FUNNELED, &provided); +} + +__device__ void rocshmem_wg_init_thread([[maybe_unused]] int requested, + int *provided) { + rocshmem_query_thread(provided); +} + +__device__ void rocshmem_query_thread(int *provided) { +#ifdef USE_THREADS + *provided = ROCSHMEM_THREAD_MULTIPLE; +#else + *provided = ROCSHMEM_THREAD_WG_FUNNELED; +#endif +} + +__device__ void rocshmem_wg_finalize() {} + +/****************************************************************************** + ************************** Default Context Wrappers ************************** + *****************************************************************************/ + +__device__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, + int pe) { + rocshmem_ctx_putmem(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +template +__device__ void rocshmem_put(T *dest, const T *source, size_t nelems, int pe) { + rocshmem_put(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +template +__device__ void rocshmem_p(T *dest, T value, int pe) { + rocshmem_p(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ T rocshmem_g(const T *source, int pe) { + return rocshmem_g(ROCSHMEM_CTX_DEFAULT, source, pe); +} + +__device__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, + int pe) { + rocshmem_ctx_getmem(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +template +__device__ void rocshmem_get(T *dest, const T *source, size_t nelems, int pe) { + rocshmem_get(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +__device__ void rocshmem_putmem_nbi(void *dest, const void *source, + size_t nelems, int pe) { + rocshmem_ctx_putmem_nbi(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +template +__device__ void rocshmem_put_nbi(T *dest, const T *source, size_t nelems, + int pe) { + rocshmem_put_nbi(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +__device__ void rocshmem_getmem_nbi(void *dest, const void *source, + 
size_t nelems, int pe) { + rocshmem_ctx_getmem_nbi(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +template +__device__ void rocshmem_get_nbi(T *dest, const T *source, size_t nelems, + int pe) { + rocshmem_get_nbi(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); +} + +__device__ void rocshmem_fence() { + rocshmem_ctx_fence(ROCSHMEM_CTX_DEFAULT); +} + +__device__ void rocshmem_fence(int pe) { + rocshmem_ctx_fence(ROCSHMEM_CTX_DEFAULT, pe); +} + +__device__ void rocshmem_quiet() { + rocshmem_ctx_quiet(ROCSHMEM_CTX_DEFAULT); +} + +__device__ void rocshmem_threadfence_system() { + rocshmem_ctx_threadfence_system(ROCSHMEM_CTX_DEFAULT); +} + +template +__device__ T rocshmem_atomic_fetch_add(T *dest, T val, int pe) { + return rocshmem_atomic_fetch_add(ROCSHMEM_CTX_DEFAULT, dest, val, pe); +} + +template +__device__ T rocshmem_atomic_compare_swap(T *dest, T cond, T val, int pe) { + return rocshmem_atomic_compare_swap(ROCSHMEM_CTX_DEFAULT, dest, cond, val, + pe); +} + +template +__device__ T rocshmem_atomic_fetch_inc(T *dest, int pe) { + return rocshmem_atomic_fetch_inc(ROCSHMEM_CTX_DEFAULT, dest, pe); +} + +template +__device__ T rocshmem_atomic_fetch(T *source, int pe) { + return rocshmem_atomic_fetch(ROCSHMEM_CTX_DEFAULT, source, pe); +} + +template +__device__ void rocshmem_atomic_add(T *dest, T val, int pe) { + rocshmem_atomic_add(ROCSHMEM_CTX_DEFAULT, dest, val, pe); +} + +template +__device__ void rocshmem_atomic_inc(T *dest, int pe) { + rocshmem_atomic_inc(ROCSHMEM_CTX_DEFAULT, dest, pe); +} + +template +__device__ void rocshmem_atomic_set(T *dest, T value, int pe) { + rocshmem_atomic_set(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ T rocshmem_atomic_swap(T *dest, T value, int pe) { + return rocshmem_atomic_swap(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ T rocshmem_atomic_fetch_and(T *dest, T value, int pe) { + return rocshmem_atomic_fetch_and(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ void 
rocshmem_atomic_and(T *dest, T value, int pe) { + rocshmem_atomic_and(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ T rocshmem_atomic_fetch_or(T *dest, T value, int pe) { + return rocshmem_atomic_fetch_or(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ void rocshmem_atomic_or(T *dest, T value, int pe) { + rocshmem_atomic_or(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ T rocshmem_atomic_fetch_xor(T *dest, T value, int pe) { + return rocshmem_atomic_fetch_xor(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +template +__device__ void rocshmem_atomic_xor(T *dest, T value, int pe) { + rocshmem_atomic_xor(ROCSHMEM_CTX_DEFAULT, dest, value, pe); +} + +/****************************************************************************** + ************************* Private Context Interfaces ************************* + *****************************************************************************/ + +__device__ int translate_pe(rocshmem_ctx_t ctx, int pe) { + if (ctx.team_opaque) { + TeamInfo *tinfo = reinterpret_cast(ctx.team_opaque); + return (tinfo->pe_start + tinfo->stride * pe); + } else { + return pe; + } +} + +__host__ void set_internal_ctx(rocshmem_ctx_t *ctx) { + CHECK_HIP(hipMemcpyToSymbol(HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT), ctx, + sizeof(rocshmem_ctx_t), 0, + hipMemcpyHostToDevice)); +} + +__device__ Context *get_internal_ctx(rocshmem_ctx_t ctx) { + return reinterpret_cast(ctx.ctx_opaque); +} + +__device__ int rocshmem_wg_ctx_create(long option, rocshmem_ctx_t *ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_create\n"); + bool result{true}; + if (get_flat_block_id() == 0) { + ctx->team_opaque = reinterpret_cast(ROCSHMEM_CTX_DEFAULT.team_opaque); + result = device_backend_proxy->create_ctx(option, ctx); + reinterpret_cast(ctx->ctx_opaque)->setFence(option); + } + __syncthreads(); + return result == true ? 
0 : -1; +} + +__device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options, + rocshmem_ctx_t *ctx) { + GPU_DPRINTF("Function: rocshmem_team_create_ctx\n"); + if (team == ROCSHMEM_TEAM_INVALID) { + return -1; + } + + bool result{true}; + if (get_flat_block_id() == 0) { + Team *team_obj{get_internal_team(team)}; + TeamInfo *info_wrt_world = team_obj->tinfo_wrt_world; + ctx->team_opaque = info_wrt_world; + result = device_backend_proxy->create_ctx(options, ctx); + reinterpret_cast(ctx->ctx_opaque)->setFence(options); + } + __syncthreads(); + + return result == true ? 0 : -1; +} + +__device__ void rocshmem_wg_ctx_destroy( + [[maybe_unused]] rocshmem_ctx_t *ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_destroy\n"); + + if (get_flat_block_id() == 0) { + device_backend_proxy->destroy_ctx(ctx); + } +} + +__device__ void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_threadfence_system\n"); + + get_internal_ctx(ctx)->threadfence_system(); +} + +__device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_putmem\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->putmem(dest, source, nelems, pe_in_world); +} + +template +__device__ void rocshmem_put(rocshmem_ctx_t ctx, T *dest, const T *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_put\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->put(dest, source, nelems, pe_in_world); +} + +template +__device__ void rocshmem_p(rocshmem_ctx_t ctx, T *dest, T value, int pe) { + GPU_DPRINTF("Function: rocshmem_p\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->p(dest, value, pe_in_world); +} + +template +__device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) { + GPU_DPRINTF("Function: rocshmem_g\n"); + + int pe_in_world = translate_pe(ctx, pe); + + return 
get_internal_ctx(ctx)->g(source, pe_in_world); +} + +__device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_getmem\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->getmem(dest, source, nelems, pe_in_world); +} + +template +__device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_get\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->get(dest, source, nelems, pe_in_world); +} + +__device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->putmem_nbi(dest, source, nelems, pe_in_world); +} + +template +__device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_put_nbi\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->put_nbi(dest, source, nelems, pe_in_world); +} + +__device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->getmem_nbi(dest, source, nelems, pe_in_world); +} + +template +__device__ void rocshmem_get_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_get_nbi\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->get_nbi(dest, source, nelems, pe_in_world); +} + +__device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_fence\n"); + + get_internal_ctx(ctx)->fence(); +} + +__device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe) { + 
GPU_DPRINTF("Function: rocshmem_ctx_fence\n"); + + int pe_in_world = translate_pe(ctx, pe); + + get_internal_ctx(ctx)->fence(pe_in_world); +} + +__device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_quiet\n"); + + get_internal_ctx(ctx)->quiet(); +} + +__device__ void *rocshmem_ptr(const void *dest, int pe) { + GPU_DPRINTF("Function: rocshmem_ptr\n"); + + return get_internal_ctx(ROCSHMEM_CTX_DEFAULT)->shmem_ptr(dest, pe); +} + +template +__device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team, + T *dest, const T *source, int nreduce) { + GPU_DPRINTF("Function: rocshmem_reduce\n"); + + return get_internal_ctx(ctx)->reduce(team, dest, source, nreduce); +} + +template +__device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx, + rocshmem_team_t team, T *dest, + const T *source, int nelem, + int pe_root) { + GPU_DPRINTF("Function: Team-based rocshmem_broadcast\n"); + + get_internal_ctx(ctx)->broadcast(team, dest, source, nelem, pe_root); +} + +template +__device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx, + rocshmem_team_t team, T *dest, + const T *source, int nelem) { + GPU_DPRINTF("Function: rocshmem_alltoall\n"); + + get_internal_ctx(ctx)->alltoall(team, dest, source, nelem); +} + +template +__device__ void rocshmem_wg_fcollect(rocshmem_ctx_t ctx, + rocshmem_team_t team, T *dest, + const T *source, int nelem) { + GPU_DPRINTF("Function: rocshmem_fcollect\n"); + + get_internal_ctx(ctx)->fcollect(team, dest, source, nelem); +} + +template +__device__ void rocshmem_wait_until(T *ivars, int cmp, T val) { + GPU_DPRINTF("Function: rocshmem_wait_until\n"); + + Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL); + ctx_internal->wait_until(ivars, cmp, val); +} + +template +__device__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* status, + int cmp, T val) { + GPU_DPRINTF("Function: rocshmem_wait_until_all\n"); + + Context *ctx_internal 
= get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL); + ctx_internal->wait_until_all(ivars, nelems, status, cmp, val); +} + +template +__device__ size_t rocshmem_wait_until_any(T *ivars, size_t nelems, const int* status, + int cmp, T val) { + GPU_DPRINTF("Function: rocshmem_wait_until_any\n"); + + Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY); + return ctx_internal->wait_until_any(ivars, nelems, status, cmp, val); +} + +template +__device__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indices, + const int* status, int cmp, + T val) { + DPRINTF("Function: rocshmem_wait_until_some\n"); + + Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME); + return ctx_internal->wait_until_some(ivars, nelems, indices, status, cmp, val); +} + +template +__device__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const int* status, + int cmp, T* vals) { + DPRINTF("Function: rocshmem_wait_until_any_vector\n"); + + Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY_VECTOR); + return ctx_internal->wait_until_any_vector(ivars, nelems, status, cmp, vals); +} + +template +__device__ void rocshmem_wait_until_all_vector(T *ivars, size_t nelems, const int* status, + int cmp, T* vals) { + DPRINTF("Function: rocshmem_wait_until_all_vector\n"); + + Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL_VECTOR); + ctx_internal->wait_until_all_vector(ivars, nelems, status, cmp, vals); +} + +template +__device__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems, + size_t* indices, + const int* status, + int cmp, T* vals) { + DPRINTF("Function: rocshmem_wait_until_some_vector\n"); + + Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + 
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME_VECTOR); + return ctx_internal->wait_until_some_vector(ivars, nelems, indices, status, cmp, vals); +} + +template +__device__ int rocshmem_test(T *ivars, int cmp, T val) { + GPU_DPRINTF("Function: rocshmem_testl\n"); + + Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); + ctx_internal->ctxStats.incStat(NUM_TEST); + + return ctx_internal->test(ivars, cmp, val); +} + +__device__ void rocshmem_ctx_wg_barrier_all(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_barrier_all\n"); + + get_internal_ctx(ctx)->barrier_all(); +} + +__device__ void rocshmem_wg_barrier_all() { + rocshmem_ctx_wg_barrier_all(ROCSHMEM_CTX_DEFAULT); +} + +__device__ void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n"); + + get_internal_ctx(ctx)->sync_all(); +} + +__device__ void rocshmem_wg_sync_all() { + rocshmem_ctx_wg_sync_all(ROCSHMEM_CTX_DEFAULT); +} + +__device__ void rocshmem_ctx_wg_team_sync(rocshmem_ctx_t ctx, + rocshmem_team_t team) { + GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n"); + + get_internal_ctx(ctx)->sync(team); +} + +__device__ void rocshmem_wg_team_sync(rocshmem_team_t team) { + rocshmem_ctx_wg_team_sync(ROCSHMEM_CTX_DEFAULT, team); +} + +__device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_n_pes\n"); + + return get_internal_ctx(ctx)->num_pes; +} + +__device__ int rocshmem_n_pes() { + return get_internal_ctx(ROCSHMEM_CTX_DEFAULT)->num_pes; +} + +__device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_my_pe\n"); + + return get_internal_ctx(ctx)->my_pe; +} + +__device__ int rocshmem_my_pe() { + return get_internal_ctx(ROCSHMEM_CTX_DEFAULT)->my_pe; +} + +__device__ uint64_t rocshmem_timer() { + GPU_DPRINTF("Function: rocshmem_timer\n"); + + return __read_clock(); +} + +template +__device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + 
GPU_DPRINTF("Function: rocshmem_atomic_fetch_add\n"); + + return get_internal_ctx(ctx)->amo_fetch_add(dest, val, pe); +} + +template +__device__ T rocshmem_atomic_compare_swap(rocshmem_ctx_t ctx, T *dest, T cond, + T val, int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_compare_swap\n"); + + return get_internal_ctx(ctx)->amo_fetch_cas(dest, val, cond, pe); +} + +template +__device__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, T *dest, int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_fetch_inc\n"); + + return get_internal_ctx(ctx)->amo_fetch_add(dest, 1, pe); +} + +template +__device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_fetch\n"); + + return get_internal_ctx(ctx)->amo_fetch_add(source, 0, pe); +} + +template +__device__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_add\n"); + + get_internal_ctx(ctx)->amo_add(dest, val, pe); +} + +template +__device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_inc\n"); + + get_internal_ctx(ctx)->amo_add(dest, 1, pe); +} + +template +__device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_set\n"); + + get_internal_ctx(ctx)->amo_set(dest, val, pe); +} + +template +__device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_swap\n"); + + return get_internal_ctx(ctx)->amo_swap(dest, val, pe); +} + +template +__device__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_fetch_and\n"); + + return get_internal_ctx(ctx)->amo_fetch_and(dest, val, pe); +} + +template +__device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_and\n"); + + get_internal_ctx(ctx)->amo_and(dest, val, 
pe); +} + +template +__device__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_fetch_or\n"); + + return get_internal_ctx(ctx)->amo_fetch_or(dest, val, pe); +} + +template +__device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_or\n"); + + get_internal_ctx(ctx)->amo_or(dest, val, pe); +} + +template +__device__ T rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_fetch_xor\n"); + + return get_internal_ctx(ctx)->amo_fetch_xor(dest, val, pe); +} + +template +__device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val, + int pe) { + GPU_DPRINTF("Function: rocshmem_atomic_xor\n"); + + get_internal_ctx(ctx)->amo_xor(dest, val, pe); +} + +/** + * SHMEM X RMA API for WG and Wave level + */ +__device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_putmem_wave\n"); + + get_internal_ctx(ctx)->putmem_wave(dest, source, nelems, pe); +} + +__device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_putmem_wg\n"); + + get_internal_ctx(ctx)->putmem_wg(dest, source, nelems, pe); +} + +__device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest, + const void *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wave\n"); + + get_internal_ctx(ctx)->putmem_nbi_wave(dest, source, nelems, pe); +} + +__device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wg\n"); + + get_internal_ctx(ctx)->putmem_nbi_wg(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest, + const T 
*source, size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_put_wave\n"); + + get_internal_ctx(ctx)->put_wave(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_put_wg(rocshmem_ctx_t ctx, T *dest, const T *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_put_wg\n"); + + get_internal_ctx(ctx)->put_wg(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest, + const T *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_put_nbi_wave\n"); + + get_internal_ctx(ctx)->put_nbi_wave(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest, + const T *source, size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_put_nbi_wg\n"); + + get_internal_ctx(ctx)->put_nbi_wg(dest, source, nelems, pe); +} + +__device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_getmem_wg\n"); + + get_internal_ctx(ctx)->getmem_wg(dest, source, nelems, pe); +} + +__device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_getmem_wave\n"); + + get_internal_ctx(ctx)->getmem_wave(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_get_wg\n"); + + get_internal_ctx(ctx)->get_wg(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest, + const T *source, size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_get_wave\n"); + + get_internal_ctx(ctx)->get_wave(dest, source, nelems, pe); +} + +__device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: 
rocshmem_ctx_getmem_nbi_wg\n"); + + get_internal_ctx(ctx)->getmem_nbi_wg(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest, + const T *source, size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_get_nbi_wg\n"); + + get_internal_ctx(ctx)->get_nbi_wg(dest, source, nelems, pe); +} + +__device__ void rocshmem_ctx_getmem_nbi_wave(rocshmem_ctx_t ctx, void *dest, + const void *source, + size_t nelems, int pe) { + GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wave\n"); + + get_internal_ctx(ctx)->getmem_nbi_wave(dest, source, nelems, pe); +} + +template +__device__ void rocshmem_get_nbi_wave(rocshmem_ctx_t ctx, T *dest, + const T *source, size_t nelems, + int pe) { + GPU_DPRINTF("Function: rocshmem_get_nbi_wave\n"); + + get_internal_ctx(ctx)->get_nbi_wave(dest, source, nelems, pe); +} + +/****************************************************************************** + ****************************** Teams Interface ******************************* + *****************************************************************************/ + +__device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, + int src_pe, + rocshmem_team_t dst_team) { + return team_translate_pe(src_team, src_pe, dst_team); +} + +/****************************************************************************** + ************************* Template Generation Macros ************************* + *****************************************************************************/ + +/** + * Template generator for reductions + */ +#define REDUCTION_GEN(T, Op) \ + template __device__ int rocshmem_wg_reduce( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ + int nreduce); + +/** + * Declare templates for the required datatypes (for the compiler) + */ +#define RMA_GEN(T) \ + template __device__ void rocshmem_put( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_put_nbi( 
\ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_p(rocshmem_ctx_t ctx, T * dest, \ + T value, int pe); \ + template __device__ void rocshmem_get( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_get_nbi( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, \ + int pe); \ + template __device__ void rocshmem_put(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_put_nbi(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_p(T * dest, T value, int pe); \ + template __device__ void rocshmem_get(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_get_nbi(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ T rocshmem_g(const T *source, int pe); \ + template __device__ void rocshmem_wg_broadcast( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ + int nelem, int pe_root); \ + template __device__ void rocshmem_wg_alltoall( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ + int nelem); \ + template __device__ void rocshmem_wg_fcollect( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ + int nelem); \ + template __device__ void rocshmem_put_wave( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_put_wg( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_put_wave(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_put_wg(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_put_nbi_wave( \ + rocshmem_ctx_t ctx, T * dest, const T 
*source, size_t nelems, int pe); \ + template __device__ void rocshmem_put_nbi_wg( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_put_nbi_wave( \ + T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_put_nbi_wg(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_get_wave( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_get_wg( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_get_wave(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_get_wg(T * dest, const T *source, \ + size_t nelems, int pe); \ + template __device__ void rocshmem_get_nbi_wave( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_get_nbi_wg( \ + rocshmem_ctx_t ctx, T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_get_nbi_wave( \ + T * dest, const T *source, size_t nelems, int pe); \ + template __device__ void rocshmem_get_nbi_wg(T * dest, const T *source, \ + size_t nelems, int pe); + +/** + * Declare templates for the standard amo types + */ +#define AMO_STANDARD_GEN(T) \ + template __device__ T rocshmem_atomic_compare_swap( \ + rocshmem_ctx_t ctx, T * dest, T cond, T value, int pe); \ + template __device__ T rocshmem_atomic_compare_swap(T * dest, T cond, \ + T value, int pe); \ + template __device__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, \ + T * dest, int pe); \ + template __device__ T rocshmem_atomic_fetch_inc(T * dest, int pe); \ + template __device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, \ + T * dest, int pe); \ + template __device__ void rocshmem_atomic_inc(T * dest, int pe); \ + template __device__ T rocshmem_atomic_fetch_add( \ + rocshmem_ctx_t ctx, T * dest, T 
value, int pe); \ + template __device__ T rocshmem_atomic_fetch_add(T * dest, T value, \ + int pe); \ + template __device__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __device__ void rocshmem_atomic_add(T * dest, T value, int pe); + +/** + * Declare templates for the extended amo types + */ +#define AMO_EXTENDED_GEN(T) \ + template __device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, \ + T * dest, int pe); \ + template __device__ T rocshmem_atomic_fetch(T * dest, int pe); \ + template __device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __device__ void rocshmem_atomic_set(T * dest, T value, int pe); \ + template __device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __device__ T rocshmem_atomic_swap(T * dest, T value, int pe); + +/** + * Declare templates for the bitwise amo types + */ +#define AMO_BITWISE_GEN(T) \ + template __device__ T rocshmem_atomic_fetch_and( \ + rocshmem_ctx_t ctx, T * dest, T value, int pe); \ + template __device__ T rocshmem_atomic_fetch_and(T * dest, T value, \ + int pe); \ + template __device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __device__ void rocshmem_atomic_and(T * dest, T value, int pe); \ + template __device__ T rocshmem_atomic_fetch_or( \ + rocshmem_ctx_t ctx, T * dest, T value, int pe); \ + template __device__ T rocshmem_atomic_fetch_or(T * dest, T value, \ + int pe); \ + template __device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template __device__ void rocshmem_atomic_or(T * dest, T value, int pe); \ + template __device__ T rocshmem_atomic_fetch_xor( \ + rocshmem_ctx_t ctx, T * dest, T value, int pe); \ + template __device__ T rocshmem_atomic_fetch_xor(T * dest, T value, \ + int pe); \ + template __device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, \ + T * dest, T value, int pe); \ + template 
__device__ void rocshmem_atomic_xor(T * dest, T value, int pe); + +/** + * Declare templates for the wait types + */ +#define WAIT_GEN(T) \ + template __device__ void rocshmem_wait_until(T *ivars, \ + int cmp, T val); \ + template __device__ size_t rocshmem_wait_until_any(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T val); \ + template __device__ void rocshmem_wait_until_all(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T val); \ + template __device__ size_t rocshmem_wait_until_some(T *ivars, \ + size_t nelems, size_t* indices, \ + const int* status, \ + int cmp, T val); \ + template __device__ size_t rocshmem_wait_until_any_vector(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T* vals); \ + template __device__ void rocshmem_wait_until_all_vector(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T* vals); \ + template __device__ size_t rocshmem_wait_until_some_vector(T *ivars, \ + size_t nelems, size_t* indices, \ + const int* status, int cmp, \ + T* vals); \ + template __device__ int rocshmem_test(T *ivars, int cmp, \ + T val); \ + template __device__ void Context::wait_until(T *ivars, int cmp, \ + T val); \ + template __device__ size_t Context::wait_until_any(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T val); \ + template __device__ void Context::wait_until_all(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T val); \ + template __device__ size_t Context::wait_until_some(T *ivars, \ + size_t nelems, \ + size_t* indices, const int* status, \ + int cmp, T val); \ + template __device__ size_t Context::wait_until_any_vector(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T* vals); \ + template __device__ void Context::wait_until_all_vector(T *ivars, \ + size_t nelems, const int* status, \ + int cmp, T* vals); \ + template __device__ size_t Context::wait_until_some_vector(T *ivars, \ + size_t nelems, size_t* indices, \ + const int* status, int cmp, \ + T* vals); \ + 
template __device__ int Context::test(T *ivars, int cmp, T val); + +#define ARITH_REDUCTION_GEN(T) \ + REDUCTION_GEN(T, ROCSHMEM_SUM) \ + REDUCTION_GEN(T, ROCSHMEM_MIN) \ + REDUCTION_GEN(T, ROCSHMEM_MAX) \ + REDUCTION_GEN(T, ROCSHMEM_PROD) + +#define BITWISE_REDUCTION_GEN(T) \ + REDUCTION_GEN(T, ROCSHMEM_OR) \ + REDUCTION_GEN(T, ROCSHMEM_AND) \ + REDUCTION_GEN(T, ROCSHMEM_XOR) + +#define INT_REDUCTION_GEN(T) \ + ARITH_REDUCTION_GEN(T) \ + BITWISE_REDUCTION_GEN(T) + +#define FLOAT_REDUCTION_GEN(T) ARITH_REDUCTION_GEN(T) + +/** + * Define APIs to call the template functions + **/ + +#define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ + __device__ int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ + int nreduce) { \ + return rocshmem_wg_reduce(ctx, team, dest, source, nreduce); \ + } + +#define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ + REDUCTION_DEF_GEN(T, TNAME, sum, ROCSHMEM_SUM) \ + REDUCTION_DEF_GEN(T, TNAME, min, ROCSHMEM_MIN) \ + REDUCTION_DEF_GEN(T, TNAME, max, ROCSHMEM_MAX) \ + REDUCTION_DEF_GEN(T, TNAME, prod, ROCSHMEM_PROD) + +#define BITWISE_REDUCTION_DEF_GEN(T, TNAME) \ + REDUCTION_DEF_GEN(T, TNAME, or, ROCSHMEM_OR) \ + REDUCTION_DEF_GEN(T, TNAME, and, ROCSHMEM_AND) \ + REDUCTION_DEF_GEN(T, TNAME, xor, ROCSHMEM_XOR) + +#define INT_REDUCTION_DEF_GEN(T, TNAME) \ + ARITH_REDUCTION_DEF_GEN(T, TNAME) \ + BITWISE_REDUCTION_DEF_GEN(T, TNAME) + +#define FLOAT_REDUCTION_DEF_GEN(T, TNAME) ARITH_REDUCTION_DEF_GEN(T, TNAME) + +#define RMA_DEF_GEN(T, TNAME) \ + __device__ void rocshmem_ctx_##TNAME##_put( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_put_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put_nbi(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_p(rocshmem_ctx_t ctx, T *dest, \ + T value, int 
pe) { \ + rocshmem_p(ctx, dest, value, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_get( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get(ctx, dest, source, nelems, pe); \ + } \ + __device__ T rocshmem_ctx_##TNAME##_g(rocshmem_ctx_t ctx, const T *source, \ + int pe) { \ + return rocshmem_g(ctx, source, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_get_nbi( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get_nbi(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_put(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_put(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_put_nbi(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_put_nbi(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_p(T *dest, T value, int pe) { \ + rocshmem_p(dest, value, pe); \ + } \ + __device__ void rocshmem_##TNAME##_get(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_get(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_get_nbi(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_get_nbi(dest, source, nelems, pe); \ + } \ + __device__ T rocshmem_##TNAME##_g(const T *source, int pe) { \ + return rocshmem_g(source, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_put_wave( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put_wave(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_put_wg( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put_wg(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_put_wave(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_put_wave(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_put_wg(T *dest, const T *source, \ + size_t nelems, int 
pe) { \ + rocshmem_put_wg(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_put_nbi_wave( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put_nbi_wave(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_put_nbi_wg( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_put_nbi_wg(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_put_nbi_wave(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_put_nbi_wave(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_put_nbi_wg(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_put_nbi_wg(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_get_wave( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get_wave(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_get_wg( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get_wg(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_get_wave(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_get_wave(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_get_wg(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_get_wg(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_get_nbi_wave( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get_nbi_wave(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_get_nbi_wg( \ + rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { \ + rocshmem_get_nbi_wg(ctx, dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_get_nbi_wave(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_get_nbi_wave(dest, source, 
nelems, pe); \ + } \ + __device__ void rocshmem_##TNAME##_get_nbi_wg(T *dest, const T *source, \ + size_t nelems, int pe) { \ + rocshmem_get_nbi_wg(dest, source, nelems, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_wg_broadcast( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ + int nelem, int pe_root) { \ + rocshmem_wg_broadcast(ctx, team, dest, source, nelem, pe_root); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_wg_alltoall( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ + int nelem) { \ + rocshmem_wg_alltoall(ctx, team, dest, source, nelem); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_wg_fcollect( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ + int nelem) { \ + rocshmem_wg_fcollect(ctx, team, dest, source, nelem); \ + } + +#define AMO_STANDARD_DEF_GEN(T, TNAME) \ + __device__ T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ + rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe) { \ + return rocshmem_atomic_compare_swap(ctx, dest, cond, value, pe); \ + } \ + __device__ T rocshmem_##TNAME##_atomic_compare_swap(T *dest, T cond, \ + T value, int pe) { \ + return rocshmem_atomic_compare_swap(dest, cond, value, pe); \ + } \ + __device__ T rocshmem_ctx_##TNAME##_atomic_fetch_inc(rocshmem_ctx_t ctx, \ + T *dest, int pe) { \ + return rocshmem_atomic_fetch_inc(ctx, dest, pe); \ + } \ + __device__ T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, int pe) { \ + return rocshmem_atomic_fetch_inc(dest, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_atomic_inc(rocshmem_ctx_t ctx, \ + T *dest, int pe) { \ + rocshmem_atomic_inc(ctx, dest, pe); \ + } \ + __device__ void rocshmem_##TNAME##_atomic_inc(T *dest, int pe) { \ + rocshmem_atomic_inc(dest, pe); \ + } \ + __device__ T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_add(ctx, dest, value, pe); \ + } \ + __device__ T 
rocshmem_##TNAME##_atomic_fetch_add(T *dest, T value, \ + int pe) { \ + return rocshmem_atomic_fetch_add(dest, value, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_atomic_add( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + rocshmem_atomic_add(ctx, dest, value, pe); \ + } \ + __device__ void rocshmem_##TNAME##_atomic_add(T *dest, T value, int pe) { \ + rocshmem_atomic_add(dest, value, pe); \ + } + +#define AMO_EXTENDED_DEF_GEN(T, TNAME) \ + __device__ T rocshmem_ctx_##TNAME##_atomic_fetch(rocshmem_ctx_t ctx, \ + T *source, int pe) { \ + return rocshmem_atomic_fetch(ctx, source, pe); \ + } \ + __device__ T rocshmem_##TNAME##_atomic_fetch(T *source, int pe) { \ + return rocshmem_atomic_fetch(source, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_atomic_set( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + rocshmem_atomic_set(ctx, dest, value, pe); \ + } \ + __device__ void rocshmem_##TNAME##_atomic_set(T *dest, T value, int pe) { \ + rocshmem_atomic_set(dest, value, pe); \ + } \ + __device__ T rocshmem_ctx_##TNAME##_atomic_swap(rocshmem_ctx_t ctx, \ + T *dest, T value, int pe) {\ + return rocshmem_atomic_swap(ctx, dest, value, pe); \ + } \ + __device__ T rocshmem_##TNAME##_atomic_swap(T *dest, T value, int pe) { \ + return rocshmem_atomic_swap(dest, value, pe); \ + } + +#define AMO_BITWISE_DEF_GEN(T, TNAME) \ + __device__ T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_and(ctx, dest, value, pe); \ + } \ + __device__ T rocshmem_##TNAME##_atomic_fetch_and(T *dest, T value, \ + int pe) { \ + return rocshmem_atomic_fetch_and(dest, value, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_atomic_and( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + rocshmem_atomic_and(ctx, dest, value, pe); \ + } \ + __device__ void rocshmem_##TNAME##_atomic_and(T *dest, T value, int pe) { \ + rocshmem_atomic_and(dest, value, pe); \ + } \ + __device__ T 
rocshmem_ctx_##TNAME##_atomic_fetch_or( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_or(ctx, dest, value, pe); \ + } \ + __device__ T rocshmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_or(dest, value, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_atomic_or( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + rocshmem_atomic_or(ctx, dest, value, pe); \ + } \ + __device__ void rocshmem_##TNAME##_atomic_or(T *dest, T value, int pe) { \ + rocshmem_atomic_or(dest, value, pe); \ + } \ + __device__ T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + return rocshmem_atomic_fetch_xor(ctx, dest, value, pe); \ + } \ + __device__ T rocshmem_##TNAME##_atomic_fetch_xor(T *dest, T value, \ + int pe) { \ + return rocshmem_atomic_fetch_xor(dest, value, pe); \ + } \ + __device__ void rocshmem_ctx_##TNAME##_atomic_xor( \ + rocshmem_ctx_t ctx, T *dest, T value, int pe) { \ + rocshmem_atomic_xor(ctx, dest, value, pe); \ + } \ + __device__ void rocshmem_##TNAME##_atomic_xor(T *dest, T value, int pe) { \ + rocshmem_atomic_xor(dest, value, pe); \ + } + +#define WAIT_DEF_GEN(T, TNAME) \ + __device__ void rocshmem_##TNAME##_wait_until(T *ivars, int cmp, \ + T val) { \ + rocshmem_wait_until(ivars, cmp, val); \ + } \ + __device__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, size_t nelems,\ + const int* status, \ + int cmp, \ + T val) { \ + return rocshmem_wait_until_any(ivars, nelems, status, cmp, val); \ + } \ + __device__ void rocshmem_##TNAME##_wait_until_all(T *ivars, size_t nelems, \ + const int* status, \ + int cmp, \ + T val) { \ + rocshmem_wait_until_all(ivars, nelems, status, cmp, val); \ + } \ + __device__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, \ + size_t nelems, \ + size_t* indices, \ + const int* status, \ + int cmp, \ + T val) { \ + return rocshmem_wait_until_some(ivars, nelems, indices, status, cmp, \ + val); \ + } \ + 
__device__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ + size_t nelems, \ + const int* status, \ + int cmp, \ + T* vals) { \ + return rocshmem_wait_until_any_vector(ivars, nelems, status, cmp, \ + vals); \ + } \ + __device__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ + size_t nelems, \ + const int* status, \ + int cmp, \ + T* vals) { \ + rocshmem_wait_until_all_vector(ivars, nelems, status, cmp, vals); \ + } \ + __device__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ + size_t nelems, \ + size_t* indices, \ + const int* status, \ + int cmp, \ + T* vals) { \ + return rocshmem_wait_until_some_vector(ivars, nelems, indices, \ + status, cmp, vals); \ + } \ + __device__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val) { \ + return rocshmem_test(ivars, cmp, val); \ + } + +/****************************************************************************** + ************************* Macro Invocation Per Type ************************** + *****************************************************************************/ + +// clang-format off +INT_REDUCTION_GEN(int) +INT_REDUCTION_GEN(short) +INT_REDUCTION_GEN(long) +INT_REDUCTION_GEN(long long) +FLOAT_REDUCTION_GEN(float) +FLOAT_REDUCTION_GEN(double) +// long double reduction fails. hipcc/device may not support long double. +// so disable it for now. 
+// FLOAT_REDUCTION_GEN(long double) + +RMA_GEN(float) +RMA_GEN(double) +// RMA_GEN(long double) +RMA_GEN(char) +RMA_GEN(signed char) +RMA_GEN(short) +RMA_GEN(int) +RMA_GEN(long) +RMA_GEN(long long) +RMA_GEN(unsigned char) +RMA_GEN(unsigned short) +RMA_GEN(unsigned int) +RMA_GEN(unsigned long) +RMA_GEN(unsigned long long) + +AMO_STANDARD_GEN(int) +AMO_STANDARD_GEN(long) +AMO_STANDARD_GEN(long long) +AMO_STANDARD_GEN(unsigned int) +AMO_STANDARD_GEN(unsigned long) +AMO_STANDARD_GEN(unsigned long long) + +AMO_EXTENDED_GEN(float) +AMO_EXTENDED_GEN(double) +AMO_EXTENDED_GEN(int) +AMO_EXTENDED_GEN(long) +AMO_EXTENDED_GEN(long long) +AMO_EXTENDED_GEN(unsigned int) +AMO_EXTENDED_GEN(unsigned long) +AMO_EXTENDED_GEN(unsigned long long) + +AMO_BITWISE_GEN(unsigned int) +AMO_BITWISE_GEN(unsigned long) +AMO_BITWISE_GEN(unsigned long long) + +/* Supported synchronization types */ +WAIT_GEN(float) +WAIT_GEN(double) +// WAIT_GEN(long double) +WAIT_GEN(char) +WAIT_GEN(unsigned char) +WAIT_GEN(unsigned short) +WAIT_GEN(signed char) +WAIT_GEN(short) +WAIT_GEN(int) +WAIT_GEN(long) +WAIT_GEN(long long) +WAIT_GEN(unsigned int) +WAIT_GEN(unsigned long) +WAIT_GEN(unsigned long long) + +INT_REDUCTION_DEF_GEN(int, int) +INT_REDUCTION_DEF_GEN(short, short) +INT_REDUCTION_DEF_GEN(long, long) +INT_REDUCTION_DEF_GEN(long long, longlong) +FLOAT_REDUCTION_DEF_GEN(float, float) +FLOAT_REDUCTION_DEF_GEN(double, double) +// long double reduction fails. hipcc/device may not support long double. +// so disable it for now. 
+// FLOAT_REDUCTION_DEF_GEN(long double, longdouble) + +RMA_DEF_GEN(float, float) +RMA_DEF_GEN(double, double) +RMA_DEF_GEN(char, char) +// RMA_DEF_GEN(long double, longdouble) +RMA_DEF_GEN(signed char, schar) +RMA_DEF_GEN(short, short) +RMA_DEF_GEN(int, int) +RMA_DEF_GEN(long, long) +RMA_DEF_GEN(long long, longlong) +RMA_DEF_GEN(unsigned char, uchar) +RMA_DEF_GEN(unsigned short, ushort) +RMA_DEF_GEN(unsigned int, uint) +RMA_DEF_GEN(unsigned long, ulong) +RMA_DEF_GEN(unsigned long long, ulonglong) +RMA_DEF_GEN(int8_t, int8) +RMA_DEF_GEN(int16_t, int16) +RMA_DEF_GEN(int32_t, int32) +RMA_DEF_GEN(int64_t, int64) +RMA_DEF_GEN(uint8_t, uint8) +RMA_DEF_GEN(uint16_t, uint16) +RMA_DEF_GEN(uint32_t, uint32) +RMA_DEF_GEN(uint64_t, uint64) +RMA_DEF_GEN(size_t, size) +RMA_DEF_GEN(ptrdiff_t, ptrdiff) + +AMO_STANDARD_DEF_GEN(int, int) +AMO_STANDARD_DEF_GEN(long, long) +AMO_STANDARD_DEF_GEN(long long, longlong) +AMO_STANDARD_DEF_GEN(unsigned int, uint) +AMO_STANDARD_DEF_GEN(unsigned long, ulong) +AMO_STANDARD_DEF_GEN(unsigned long long, ulonglong) +AMO_STANDARD_DEF_GEN(int32_t, int32) +AMO_STANDARD_DEF_GEN(int64_t, int64) +AMO_STANDARD_DEF_GEN(uint32_t, uint32) +AMO_STANDARD_DEF_GEN(uint64_t, uint64) +AMO_STANDARD_DEF_GEN(size_t, size) +AMO_STANDARD_DEF_GEN(ptrdiff_t, ptrdiff) + +AMO_EXTENDED_DEF_GEN(float, float) +AMO_EXTENDED_DEF_GEN(double, double) +AMO_EXTENDED_DEF_GEN(int, int) +AMO_EXTENDED_DEF_GEN(long, long) +AMO_EXTENDED_DEF_GEN(long long, longlong) +AMO_EXTENDED_DEF_GEN(unsigned int, uint) +AMO_EXTENDED_DEF_GEN(unsigned long, ulong) +AMO_EXTENDED_DEF_GEN(unsigned long long, ulonglong) +AMO_EXTENDED_DEF_GEN(int32_t, int32) +AMO_EXTENDED_DEF_GEN(int64_t, int64) +AMO_EXTENDED_DEF_GEN(uint32_t, uint32) +AMO_EXTENDED_DEF_GEN(uint64_t, uint64) +AMO_EXTENDED_DEF_GEN(size_t, size) +AMO_EXTENDED_DEF_GEN(ptrdiff_t, ptrdiff) + +AMO_BITWISE_DEF_GEN(unsigned int, uint) +AMO_BITWISE_DEF_GEN(unsigned long, ulong) +AMO_BITWISE_DEF_GEN(unsigned long long, ulonglong) 
+AMO_BITWISE_DEF_GEN(int32_t, int32) +AMO_BITWISE_DEF_GEN(int64_t, int64) +AMO_BITWISE_DEF_GEN(uint32_t, uint32) +AMO_BITWISE_DEF_GEN(uint64_t, uint64) + +WAIT_DEF_GEN(float, float) +WAIT_DEF_GEN(double, double) +// WAIT_DEF_GEN(long double, longdouble) +WAIT_DEF_GEN(char, char) +WAIT_DEF_GEN(signed char, schar) +WAIT_DEF_GEN(short, short) +WAIT_DEF_GEN(int, int) +WAIT_DEF_GEN(long, long) +WAIT_DEF_GEN(long long, longlong) +WAIT_DEF_GEN(unsigned char, uchar) +WAIT_DEF_GEN(unsigned short, ushort) +WAIT_DEF_GEN(unsigned int, uint) +WAIT_DEF_GEN(unsigned long, ulong) +WAIT_DEF_GEN(unsigned long long, ulonglong) +// clang-format on + +} // namespace rocshmem diff --git a/src/stats.hpp b/src/stats.hpp index 704313f102..d138c7fdbf 100644 --- a/src/stats.hpp +++ b/src/stats.hpp @@ -27,12 +27,12 @@ #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "util.hpp" namespace rocshmem { -enum roc_shmem_stats { +enum rocshmem_stats { NUM_PUT = 0, NUM_PUT_NBI, NUM_P, @@ -85,7 +85,7 @@ enum roc_shmem_stats { NUM_STATS }; -enum roc_shmem_host_stats { +enum rocshmem_host_stats { NUM_HOST_PUT = 0, NUM_HOST_PUT_NBI, NUM_HOST_P, @@ -135,10 +135,10 @@ class Stats { StatType stats[I] = {0}; public: - __device__ uint64_t startTimer() const { return roc_shmem_timer(); } + __device__ uint64_t startTimer() const { return rocshmem_timer(); } __device__ void endTimer(uint64_t start, int index) { - incStat(index, roc_shmem_timer() - start); + incStat(index, rocshmem_timer() - start); } __device__ void incStat(int index, int value = 1) { diff --git a/src/team.cpp b/src/team.cpp index 21ed00037c..b162df67e4 100644 --- a/src/team.cpp +++ b/src/team.cpp @@ -24,34 +24,34 @@ #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "backend_bc.hpp" #include "util.hpp" namespace rocshmem { -roc_shmem_team_t ROC_SHMEM_TEAM_WORLD; +rocshmem_team_t ROCSHMEM_TEAM_WORLD; -__host__ __device__ Team* get_internal_team(roc_shmem_team_t team) { 
+__host__ __device__ Team* get_internal_team(rocshmem_team_t team) { return reinterpret_cast(team); } -GPUIBTeam* get_internal_gpu_ib_team(roc_shmem_team_t team) { +GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team) { return reinterpret_cast(team); } -ROTeam* get_internal_ro_team(roc_shmem_team_t team) { +ROTeam* get_internal_ro_team(rocshmem_team_t team) { return reinterpret_cast(team); } -IPCTeam* get_internal_ipc_team(roc_shmem_team_t team) { +IPCTeam* get_internal_ipc_team(rocshmem_team_t team) { return reinterpret_cast(team); } -__host__ __device__ int team_translate_pe(roc_shmem_team_t src_team, int src_pe, - roc_shmem_team_t dst_team) { - if (src_team == ROC_SHMEM_TEAM_INVALID || - dst_team == ROC_SHMEM_TEAM_INVALID) { +__host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe, + rocshmem_team_t dst_team) { + if (src_team == ROCSHMEM_TEAM_INVALID || + dst_team == ROCSHMEM_TEAM_INVALID) { return -1; } diff --git a/src/team.hpp b/src/team.hpp index 0dfaac6ae3..249c1b123b 100644 --- a/src/team.hpp +++ b/src/team.hpp @@ -25,7 +25,7 @@ #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" #include "backend_type.hpp" namespace rocshmem { @@ -157,16 +157,16 @@ class Team { BackendType type{BackendType::GPU_IB_BACKEND}; }; -__host__ __device__ Team* get_internal_team(roc_shmem_team_t team); +__host__ __device__ Team* get_internal_team(rocshmem_team_t team); -GPUIBTeam* get_internal_gpu_ib_team(roc_shmem_team_t team); +GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team); -ROTeam* get_internal_ro_team(roc_shmem_team_t team); +ROTeam* get_internal_ro_team(rocshmem_team_t team); -IPCTeam* get_internal_ipc_team(roc_shmem_team_t team); +IPCTeam* get_internal_ipc_team(rocshmem_team_t team); -__host__ __device__ int team_translate_pe(roc_shmem_team_t src_team, int src_pe, - roc_shmem_team_t dst_team); +__host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe, + rocshmem_team_t dst_team); } // 
namespace rocshmem diff --git a/src/team_tracker.cpp b/src/team_tracker.cpp index c7cdd194c3..0f7c94c01a 100644 --- a/src/team_tracker.cpp +++ b/src/team_tracker.cpp @@ -28,19 +28,19 @@ namespace rocshmem { TeamTracker::TeamTracker() { char* value{nullptr}; - if ((value = getenv("ROC_SHMEM_MAX_NUM_TEAMS"))) { + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { max_num_teams_ = atoi(value); } } -void TeamTracker::track(roc_shmem_team_t team) { - if (team == ROC_SHMEM_TEAM_INVALID) { +void TeamTracker::track(rocshmem_team_t team) { + if (team == ROCSHMEM_TEAM_INVALID) { return; } teams_.push_back(team); } -void TeamTracker::untrack(roc_shmem_team_t team) { +void TeamTracker::untrack(rocshmem_team_t team) { auto it{std::find(teams_.begin(), teams_.end(), team)}; assert(it != teams_.end()); teams_.erase(it); diff --git a/src/team_tracker.hpp b/src/team_tracker.hpp index 41fe1d0ba8..dc9ba8b97f 100644 --- a/src/team_tracker.hpp +++ b/src/team_tracker.hpp @@ -30,7 +30,7 @@ #include -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" namespace rocshmem { @@ -55,7 +55,7 @@ class TeamTracker { * * @param void */ - void track(roc_shmem_team_t team); + void track(rocshmem_team_t team); /** * @brief Remove team from the list of user-created teams @@ -64,7 +64,7 @@ class TeamTracker { * * @return void */ - void untrack(roc_shmem_team_t team); + void untrack(rocshmem_team_t team); /** * @brief Remove all teams from the list of user-created teams @@ -113,7 +113,7 @@ class TeamTracker { /** * @brief List of teams created by the user. */ - std::vector teams_{}; + std::vector teams_{}; /** * @brief The maximum number of teams the user can create. 
@@ -125,7 +125,7 @@ class TeamTracker { int max_num_teams_{40}; /** - * @brief Pointer to implementation of ROC_SHMEM_TEAM_WORLD + * @brief Pointer to implementation of ROCSHMEM_TEAM_WORLD */ Team* team_world_{nullptr}; }; diff --git a/src/templates.hpp b/src/templates.hpp index 1b482a219b..af2b581aa2 100644 --- a/src/templates.hpp +++ b/src/templates.hpp @@ -23,11 +23,11 @@ #ifndef LIBRARY_SRC_TEMPLATES_HPP_ #define LIBRARY_SRC_TEMPLATES_HPP_ -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" /** * @file templates.hpp - * @brief Internal header that declares templates for ROC_SHMEM's implentation + * @brief Internal header that declares templates for rocSHMEM's implementation * of the user-facing device APIs. * * This file contains templates for the OpenSHMEM APIs that take have @@ -35,8 +35,7 @@ */ /****************************************************************************** - **************************** DEVICE FUNCTIONS - *********************************** + **************************** DEVICE FUNCTIONS ******************************** *****************************************************************************/ namespace rocshmem { @@ -45,12 +44,12 @@ namespace rocshmem { * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address.
Must be an address on the symmetric @@ -63,21 +62,21 @@ namespace rocshmem { * */ template -__device__ void roc_shmem_put(roc_shmem_ctx_t ctx, T *dest, const T *source, +__device__ void rocshmem_put(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_put(T *dest, const T *source, size_t nelems, int pe); +__device__ void rocshmem_put(T *dest, const T *source, size_t nelems, int pe); /** * @brief Writes a single value to \p dest at \p pe PE to \p dst at \p pe. - * The caller must call into roc_shmem_quiet() if remote completion is + * The caller must call into rocshmem_quiet() if remote completion is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -89,10 +88,10 @@ __device__ void roc_shmem_put(T *dest, const T *source, size_t nelems, int pe); * */ template -__device__ void roc_shmem_p(roc_shmem_ctx_t ctx, T *dest, T value, int pe); +__device__ void rocshmem_p(rocshmem_ctx_t ctx, T *dest, T value, int pe); template -__device__ void roc_shmem_p(T *dest, T value, int pe); +__device__ void rocshmem_p(T *dest, T value, int pe); /** * @brief Reads contiguous data of \p nelems elements from \p source on \p pe @@ -102,7 +101,7 @@ __device__ void roc_shmem_p(T *dest, T value, int pe); * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. 
Must be an address on the symmetric @@ -115,11 +114,11 @@ __device__ void roc_shmem_p(T *dest, T value, int pe); * */ template -__device__ void roc_shmem_get(roc_shmem_ctx_t ctx, T *dest, const T *source, +__device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_get(T *dest, const T *source, size_t nelems, int pe); +__device__ void rocshmem_get(T *dest, const T *source, size_t nelems, int pe); /** * @brief reads and returns single value from \p source at \p pe. @@ -128,7 +127,7 @@ __device__ void roc_shmem_get(T *dest, const T *source, size_t nelems, int pe); * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] source sourcen address. Must be an address on the symmetric @@ -139,22 +138,22 @@ __device__ void roc_shmem_get(T *dest, const T *source, size_t nelems, int pe); * */ template -__device__ T roc_shmem_g(roc_shmem_ctx_t ctx, const T *source, int pe); +__device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe); template -__device__ T roc_shmem_g(const T *source, int pe); +__device__ T rocshmem_g(const T *source, int pe); /** * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-thread * granularity. 
However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. Must be an address on the symmetric @@ -167,23 +166,23 @@ __device__ T roc_shmem_g(const T *source, int pe); * */ template -__device__ void roc_shmem_put_nbi(roc_shmem_ctx_t ctx, T *dest, const T *src, +__device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *src, size_t nelems, int pe); template -__device__ void roc_shmem_put_nbi(T *dest, const T *src, size_t nelems, int pe); +__device__ void rocshmem_put_nbi(T *dest, const T *src, size_t nelems, int pe); /** * @brief Reads contiguous data of \p nelems elements from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller will * return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ctx Context with which to perform this operation. * @param[in] dest Destination address. 
Must be an address on the symmetric @@ -196,11 +195,11 @@ __device__ void roc_shmem_put_nbi(T *dest, const T *src, size_t nelems, int pe); * */ template -__device__ void roc_shmem_get_nbi(roc_shmem_ctx_t ctx, T *dest, const T *source, +__device__ void rocshmem_get_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_get_nbi(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_nbi(T *dest, const T *source, size_t nelems, int pe); /** @@ -222,11 +221,11 @@ __device__ void roc_shmem_get_nbi(T *dest, const T *source, size_t nelems, * */ template -__device__ T roc_shmem_atomic_fetch_add(roc_shmem_ctx_t ctx, T *dest, T val, +__device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val, int pe); template -__device__ T roc_shmem_atomic_fetch_add(T *dest, T val, int pe); +__device__ T rocshmem_atomic_fetch_add(T *dest, T val, int pe); /** * @brief Atomically compares if the value in \p dest with \p cond is equal @@ -249,11 +248,11 @@ __device__ T roc_shmem_atomic_fetch_add(T *dest, T val, int pe); * */ template -__device__ T roc_shmem_atomic_compare_swap(roc_shmem_ctx_t ctx, T *dest, T cond, +__device__ T rocshmem_atomic_compare_swap(rocshmem_ctx_t ctx, T *dest, T cond, T val, int pe); template -__device__ T roc_shmem_atomic_compare_swap(T *dest, T cond, T val, int pe); +__device__ T rocshmem_atomic_compare_swap(T *dest, T cond, T val, int pe); /** * @brief Atomically add 1 to \p dest on \p pe. The operation @@ -273,10 +272,10 @@ __device__ T roc_shmem_atomic_compare_swap(T *dest, T cond, T val, int pe); * */ template -__device__ T roc_shmem_atomic_fetch_inc(roc_shmem_ctx_t ctx, T *dest, int pe); +__device__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, T *dest, int pe); template -__device__ T roc_shmem_atomic_fetch_inc(T *dest, int pe); +__device__ T rocshmem_atomic_fetch_inc(T *dest, int pe); /** * @brief Atomically return the value of \p dest to the calling PE. 
@@ -296,10 +295,10 @@ __device__ T roc_shmem_atomic_fetch_inc(T *dest, int pe); * */ template -__device__ T roc_shmem_atomic_fetch(roc_shmem_ctx_t ctx, T *source, int pe); +__device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe); template -__device__ T roc_shmem_atomic_fetch(T *source, int pe); +__device__ T rocshmem_atomic_fetch(T *source, int pe); /** * @brief Atomically add the value \p val to \p dest on \p pe. @@ -319,11 +318,11 @@ __device__ T roc_shmem_atomic_fetch(T *source, int pe); * */ template -__device__ void roc_shmem_atomic_add(roc_shmem_ctx_t ctx, T *dest, T val, +__device__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, T *dest, T val, int pe); template -__device__ void roc_shmem_atomic_add(T *dest, T val, int pe); +__device__ void rocshmem_atomic_add(T *dest, T val, int pe); /** * @brief Atomically add 1 to \p dest on \p pe. @@ -342,10 +341,10 @@ __device__ void roc_shmem_atomic_add(T *dest, T val, int pe); * */ template -__device__ void roc_shmem_atomic_inc(roc_shmem_ctx_t ctx, T *dest, int pe); +__device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe); template -__device__ void roc_shmem_atomic_inc(T *dest, int pe); +__device__ void rocshmem_atomic_inc(T *dest, int pe); /** * @brief Atomically set value for \p dest on \p pe. @@ -365,11 +364,11 @@ __device__ void roc_shmem_atomic_inc(T *dest, int pe); * */ template -__device__ void roc_shmem_atomic_set(roc_shmem_ctx_t ctx, T *dest, T value, +__device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T value, int pe); template -__device__ void roc_shmem_atomic_set(T *dest, T value, int pe); +__device__ void rocshmem_atomic_set(T *dest, T value, int pe); /** * @brief Block the caller until the condition (* \p ptr \p cmps \p val) is @@ -378,7 +377,7 @@ __device__ void roc_shmem_atomic_set(T *dest, T value, int pe); * This function can be called from divergent control paths at per-thread * granularity. 
However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ivars Pointer to memory on the symmetric heap to wait for. * @param[in] cmp Operation for the comparison. @@ -388,7 +387,7 @@ __device__ void roc_shmem_atomic_set(T *dest, T value, int pe); * */ template -__device__ void roc_shmem_wait_until(T *ivars, int cmp, T val); +__device__ void rocshmem_wait_until(T *ivars, int cmp, T val); /** * @brief test if the condition (* \p ptr \p cmps \p val) is @@ -397,7 +396,7 @@ __device__ void roc_shmem_wait_until(T *ivars, int cmp, T val); * This function can be called from divergent control paths at per-thread * granularity. However, performance may be improved if the caller can * coalesce contiguous messages and elect a leader thread to call into the - * ROC_SHMEM function. + * rocSHMEM function. * * @param[in] ivars Pointer to memory on the symmetric heap to wait for. * @param[in] cmp Operation for the comparison. @@ -407,7 +406,7 @@ __device__ void roc_shmem_wait_until(T *ivars, int cmp, T val); * */ template -__device__ int roc_shmem_test(T *ivars, int cmp, T val); +__device__ int rocshmem_test(T *ivars, int cmp, T val); /** * @brief Perform a broadcast between PEs in the active set. The caller @@ -425,14 +424,14 @@ __device__ int roc_shmem_test(T *ivars, int cmp, T val); * @param[in] PE_start PE to start the reduction. * @param[in] logPE_stride Stride of PEs participating in the reduction. * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pSync Temporary sync buffer provided to ROC_SHMEM. Must - be of size at least ROC_SHMEM_REDUCE_SYNC_SIZE. + * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must + be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. 
* * @return void * */ template -__device__ void roc_shmem_wg_broadcast(roc_shmem_ctx_t ctx, T *dest, +__device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx, T *dest, const T *source, int nelement, int PE_root, int PE_start, int logPE_stride, int PE_size, @@ -452,18 +451,18 @@ __device__ void roc_shmem_wg_broadcast(roc_shmem_ctx_t ctx, T *dest, * @param[in] PE_start PE to start the reduction. * @param[in] logPE_stride Stride of PEs participating in the reduction. * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pWrk Temporary work buffer provided to ROC_SHMEM. Must + * @param[in] pWrk Temporary work buffer provided to rocSHMEM. Must * be of size at least max(size/2 + 1, - ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE). - * @param[in] pSync Temporary sync buffer provided to ROC_SHMEM. Must - be of size at least ROC_SHMEM_REDUCE_SYNC_SIZE. + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE). + * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must + be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. * @param[in] handle GPU side handle. * * @return void * */ -template -__device__ void roc_shmem_wg_to_all(roc_shmem_ctx_t ctx, T *dest, +template +__device__ void rocshmem_wg_to_all(rocshmem_ctx_t ctx, T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); @@ -472,7 +471,7 @@ __device__ void roc_shmem_wg_to_all(roc_shmem_ctx_t ctx, T *dest, * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-wave * granularity. 
However, all threads in a wave must collectivlily participate in @@ -489,18 +488,18 @@ __device__ void roc_shmem_wg_to_all(roc_shmem_ctx_t ctx, T *dest, * */ template -__device__ void roc_shmem_put_wave(roc_shmem_ctx_t ctx, T *dest, +__device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_put_wave(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_put_wave(T *dest, const T *source, size_t nelems, int pe); /** * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest at \p pe. The caller will block until the operation * completes locally (it is safe to reuse \p source). The caller must - * call into roc_shmem_quiet() if remote completion is required. + * call into rocshmem_quiet() if remote completion is required. * * This function can be called from divergent control paths at per-workgroub * (WG) granularity. However, All threads in a WG must collectivelly participate @@ -517,11 +516,11 @@ __device__ void roc_shmem_put_wave(T *dest, const T *source, size_t nelems, * */ template -__device__ void roc_shmem_put_wg(roc_shmem_ctx_t ctx, T *dest, const T *source, +__device__ void rocshmem_put_wg(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_put_wg(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_put_wg(T *dest, const T *source, size_t nelems, int pe); /** @@ -544,11 +543,11 @@ __device__ void roc_shmem_put_wg(T *dest, const T *source, size_t nelems, * */ template -__device__ void roc_shmem_get_wave(roc_shmem_ctx_t ctx, T *dest, +__device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_get_wave(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_wave(T *dest, const T *source, size_t nelems, int pe); /** @@ -571,18 +570,18 @@ __device__ void 
roc_shmem_get_wave(T *dest, const T *source, size_t nelems, * */ template -__device__ void roc_shmem_get_wg(roc_shmem_ctx_t ctx, T *dest, const T *source, +__device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_get_wg(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_wg(T *dest, const T *source, size_t nelems, int pe); /** * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-wave @@ -599,18 +598,18 @@ __device__ void roc_shmem_get_wg(T *dest, const T *source, size_t nelems, * */ template -__device__ void roc_shmem_put_nbi_wave(roc_shmem_ctx_t ctx, T *dest, +__device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest, const T *src, size_t nelems, int pe); template -__device__ void roc_shmem_put_nbi_wave(T *dest, const T *src, size_t nelems, +__device__ void rocshmem_put_nbi_wave(T *dest, const T *src, size_t nelems, int pe); /** * @brief Writes contiguous data of \p nelems elements from \p source on the * calling PE to \p dest on \p pe. The operation is not blocking. The caller * will return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. 
* * This function can be called from divergent control paths at per-workgroup @@ -627,18 +626,18 @@ __device__ void roc_shmem_put_nbi_wave(T *dest, const T *src, size_t nelems, * */ template -__device__ void roc_shmem_put_nbi_wg(roc_shmem_ctx_t ctx, T *dest, +__device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest, const T *src, size_t nelems, int pe); template -__device__ void roc_shmem_put_nbi_wg(T *dest, const T *src, size_t nelems, +__device__ void rocshmem_put_nbi_wg(T *dest, const T *src, size_t nelems, int pe); /** * @brief Reads contiguous data of \p nelems elements from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller will * return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. * * This function can be called from divergent control paths at per-wave @@ -655,18 +654,18 @@ __device__ void roc_shmem_put_nbi_wg(T *dest, const T *src, size_t nelems, * */ template -__device__ void roc_shmem_get_nbi_wave(roc_shmem_ctx_t ctx, T *dest, +__device__ void rocshmem_get_nbi_wave(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_get_nbi_wave(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_nbi_wave(T *dest, const T *source, size_t nelems, int pe); /** * @brief Reads contiguous data of \p nelems elements from \p source on \p pe * to \p dest on the calling PE. The operation is not blocking. The caller will * return as soon as the request is posted. The caller must call - * roc_shmem_quiet() on the same context if completion notification is + * rocshmem_quiet() on the same context if completion notification is * required. 
* * This function can be called from divergent control paths at per-workgroup @@ -683,101 +682,101 @@ __device__ void roc_shmem_get_nbi_wave(T *dest, const T *source, size_t nelems, * */ template -__device__ void roc_shmem_get_nbi_wg(roc_shmem_ctx_t ctx, T *dest, +__device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__device__ void roc_shmem_get_nbi_wg(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_nbi_wg(T *dest, const T *source, size_t nelems, int pe); -__device__ void roc_shmem_putmem_wave(void *dest, const void *source, +__device__ void rocshmem_putmem_wave(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_putmem_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_putmem_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } -__device__ void roc_shmem_putmem_wg(void *dest, const void *source, +__device__ void rocshmem_putmem_wg(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_putmem_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_putmem_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_put_wave(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_put_wave(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_put_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_put_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_put_wg(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_put_wg(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_put_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_put_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } -__device__ void roc_shmem_getmem_wg(void *dest, const void *source, +__device__ void rocshmem_getmem_wg(void *dest, const void *source, size_t nelems, int pe) { - 
roc_shmem_ctx_getmem_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_getmem_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_get_wg(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_wg(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_get_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_get_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } -__device__ void roc_shmem_getmem_wave(void *dest, const void *source, +__device__ void rocshmem_getmem_wave(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_getmem_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_getmem_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_get_wave(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_wave(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_get_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_get_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } -__device__ void roc_shmem_putmem_nbi_wg(void *dest, const void *source, +__device__ void rocshmem_putmem_nbi_wg(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_putmem_nbi_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_putmem_nbi_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_put_nbi_wg(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_put_nbi_wg(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_put_nbi_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_put_nbi_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } -__device__ void roc_shmem_putmem_nbi_wave(void *dest, const void *source, +__device__ void rocshmem_putmem_nbi_wave(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_putmem_nbi_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, + 
rocshmem_ctx_putmem_nbi_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_put_nbi_wave(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_put_nbi_wave(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_put_nbi_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_put_nbi_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } -__device__ void roc_shmem_getmem_nbi_wg(void *dest, const void *source, +__device__ void rocshmem_getmem_nbi_wg(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_getmem_nbi_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_ctx_getmem_nbi_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_get_nbi_wg(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_nbi_wg(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_get_nbi_wg(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_get_nbi_wg(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } -__device__ void roc_shmem_getmem_nbi_wave(void *dest, const void *source, +__device__ void rocshmem_getmem_nbi_wave(void *dest, const void *source, size_t nelems, int pe) { - roc_shmem_ctx_getmem_nbi_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, + rocshmem_ctx_getmem_nbi_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } template -__device__ void roc_shmem_get_nbi_wave(T *dest, const T *source, size_t nelems, +__device__ void rocshmem_get_nbi_wave(T *dest, const T *source, size_t nelems, int pe) { - roc_shmem_get_nbi_wave(ROC_SHMEM_CTX_DEFAULT, dest, source, nelems, pe); + rocshmem_get_nbi_wave(ROCSHMEM_CTX_DEFAULT, dest, source, nelems, pe); } } // namespace rocshmem diff --git a/src/templates_host.hpp b/src/templates_host.hpp index dabd6a3748..0c09b0c1b9 100644 --- a/src/templates_host.hpp +++ b/src/templates_host.hpp @@ -23,11 +23,11 @@ #ifndef LIBRARY_SRC_TEMPLATES_HOST_HPP_ #define 
LIBRARY_SRC_TEMPLATES_HOST_HPP_ -#include "roc_shmem/roc_shmem.hpp" +#include "rocshmem/rocshmem.hpp" /** * @file templates_host.hpp - * @brief Internal header that declares templates for ROC_SHMEM's implentation + * @brief Internal header that declares templates for rocSHMEM's implementation * of the user-facing host APIs. * * This file contains templates for the OpenSHMEM APIs that take have @@ -41,102 +41,102 @@ namespace rocshmem { template -__host__ void roc_shmem_put(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_put(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__host__ void roc_shmem_put(T *dest, const T *source, size_t nelems, int pe); +__host__ void rocshmem_put(T *dest, const T *source, size_t nelems, int pe); template -__host__ void roc_shmem_p(roc_shmem_ctx_t ctx, T *dest, T value, int pe); +__host__ void rocshmem_p(rocshmem_ctx_t ctx, T *dest, T value, int pe); template -__host__ void roc_shmem_p(T *dest, T value, int pe); +__host__ void rocshmem_p(T *dest, T value, int pe); template -__host__ void roc_shmem_get(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__host__ void roc_shmem_get(T *dest, const T *source, size_t nelems, int pe); +__host__ void rocshmem_get(T *dest, const T *source, size_t nelems, int pe); template -__host__ T roc_shmem_g(roc_shmem_ctx_t ctx, const T *source, int pe); +__host__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe); template -__host__ T roc_shmem_g(const T *source, int pe); +__host__ T rocshmem_g(const T *source, int pe); template -__host__ void roc_shmem_put_nbi(roc_shmem_ctx_t ctx, T *dest, const T *src, +__host__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *src, size_t nelems, int pe); template -__host__ void roc_shmem_put_nbi(T *dest, const T *src, size_t nelems, int pe); +__host__ void rocshmem_put_nbi(T *dest, const T *src, 
size_t nelems, int pe); template -__host__ void roc_shmem_get_nbi(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_get_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); template -__host__ void roc_shmem_get_nbi(T *dest, const T *source, size_t nelems, +__host__ void rocshmem_get_nbi(T *dest, const T *source, size_t nelems, int pe); template -__host__ T roc_shmem_atomic_fetch_add(roc_shmem_ctx_t ctx, T *dest, T val, +__host__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val, int pe); template -__host__ T roc_shmem_atomic_fetch_add(T *dest, T val, int pe); +__host__ T rocshmem_atomic_fetch_add(T *dest, T val, int pe); template -__host__ T roc_shmem_atomic_compare_swap(roc_shmem_ctx_t ctx, T *dest, T cond, +__host__ T rocshmem_atomic_compare_swap(rocshmem_ctx_t ctx, T *dest, T cond, T val, int pe); template -__host__ T roc_shmem_atomic_compare_swap(T *dest, T cond, T val, int pe); +__host__ T rocshmem_atomic_compare_swap(T *dest, T cond, T val, int pe); template -__host__ T roc_shmem_atomic_fetch_inc(roc_shmem_ctx_t ctx, T *dest, int pe); +__host__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, T *dest, int pe); template -__host__ T roc_shmem_atomic_fetch_inc(T *dest, int pe); +__host__ T rocshmem_atomic_fetch_inc(T *dest, int pe); template -__host__ T roc_shmem_atomic_fetch(roc_shmem_ctx_t ctx, T *source, int pe); +__host__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe); template -__host__ T roc_shmem_atomic_fetch(T *source, int pe); +__host__ T rocshmem_atomic_fetch(T *source, int pe); template -__host__ void roc_shmem_atomic_add(roc_shmem_ctx_t ctx, T *dest, T val, int pe); +__host__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, T *dest, T val, int pe); template -__host__ void roc_shmem_atomic_add(T *dest, T val, int pe); +__host__ void rocshmem_atomic_add(T *dest, T val, int pe); template -__host__ void roc_shmem_atomic_inc(roc_shmem_ctx_t ctx, T *dest, int pe); +__host__ void 
rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe); template -__host__ void roc_shmem_atomic_inc(T *dest, int pe); +__host__ void rocshmem_atomic_inc(T *dest, int pe); template -__host__ void roc_shmem_atomic_set(T *dest, T val, int pe); +__host__ void rocshmem_atomic_set(T *dest, T val, int pe); template -__host__ void roc_shmem_atomic_set(roc_shmem_ctx_t ctx, T *dest, T val, int pe); +__host__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val, int pe); template -__host__ void roc_shmem_broadcast(roc_shmem_ctx_t ctx, T *dest, const T *source, +__host__ void rocshmem_broadcast(rocshmem_ctx_t ctx, T *dest, const T *source, int nelement, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); -template -__host__ void roc_shmem_to_all(roc_shmem_ctx_t ctx, T *dest, const T *source, +template +__host__ void rocshmem_to_all(rocshmem_ctx_t ctx, T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); template -__host__ void roc_shmem_wait_until(T *ivars, int cmp, T val); +__host__ void rocshmem_wait_until(T *ivars, int cmp, T val); template __host__ void wait_until_all(T* ivars, size_t nelems, const int *status, @@ -164,7 +164,7 @@ __host__ size_t wait_until_some_vector(T* ivars, size_t nelems, int cmp, T* vals); template -__host__ int roc_shmem_test(T *ivars, int cmp, T val); +__host__ int rocshmem_test(T *ivars, int cmp, T val); } // namespace rocshmem diff --git a/tests/functional_tests/alltoall_tester.cpp b/tests/functional_tests/alltoall_tester.cpp index bfdd115878..570a05d2e1 100644 --- a/tests/functional_tests/alltoall_tester.cpp +++ b/tests/functional_tests/alltoall_tester.cpp @@ -24,17 +24,17 @@ using namespace rocshmem; /* Declare the template with a generic implementation */ template -__device__ void wg_alltoall(roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, +__device__ void wg_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem) { return; 
} -/* Define templates to call ROC_SHMEM */ +/* Define templates to call rocSHMEM */ #define ALLTOALL_DEF_GEN(T, TNAME) \ template <> \ - __device__ void wg_alltoall(roc_shmem_ctx_t ctx, roc_shmem_team_t team, \ + __device__ void wg_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team, \ T * dest, const T *source, int nelem) { \ - roc_shmem_ctx_##TNAME##_wg_alltoall(ctx, team, dest, source, nelem); \ + rocshmem_ctx_##TNAME##_wg_alltoall(ctx, team, dest, source, nelem); \ } ALLTOALL_DEF_GEN(float, float) @@ -52,7 +52,7 @@ ALLTOALL_DEF_GEN(unsigned int, uint) ALLTOALL_DEF_GEN(unsigned long, ulong) ALLTOALL_DEF_GEN(unsigned long long, ulonglong) -roc_shmem_team_t team_alltoall_world_dup; +rocshmem_team_t team_alltoall_world_dup; /****************************************************************************** * DEVICE TEST KERNEL @@ -60,20 +60,20 @@ roc_shmem_team_t team_alltoall_world_dup; template __global__ void AlltoallTest(int loop, int skip, uint64_t *timer, T1 *source_buf, T1 *dest_buf, int size, - ShmemContextType ctx_type, roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; + ShmemContextType ctx_type, rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - int n_pes = roc_shmem_ctx_n_pes(ctx); + int n_pes = rocshmem_ctx_n_pes(ctx); __syncthreads(); uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } wg_alltoall(ctx, team, dest_buf, // T* dest @@ -84,11 +84,11 @@ __global__ void AlltoallTest(int loop, int skip, uint64_t *timer, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } 
/****************************************************************************** @@ -99,24 +99,24 @@ AlltoallTester::AlltoallTester( TesterArguments args, std::function f1, std::function(const T1 &, T1)> f2) : Tester(args), init_buf{f1}, verify_buf{f2} { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); - source_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); - dest_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + source_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); + dest_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); } template AlltoallTester::~AlltoallTester() { - roc_shmem_free(source_buf); - roc_shmem_free(dest_buf); + rocshmem_free(source_buf); + rocshmem_free(dest_buf); } template void AlltoallTester::preLaunchKernel() { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); bw_factor = sizeof(T1) * n_pes; - team_alltoall_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + team_alltoall_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_alltoall_world_dup); } @@ -135,12 +135,12 @@ void AlltoallTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, template void AlltoallTester::postLaunchKernel() { - roc_shmem_team_destroy(team_alltoall_world_dup); + rocshmem_team_destroy(team_alltoall_world_dup); } template void AlltoallTester::resetBuffers(uint64_t size) { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); for (int i = 0; i < n_pes; i++) { for (int j = 0; j < size; j++) { init_buf(source_buf[i * size + j], dest_buf[i * size + j], (T1)i); @@ -150,7 +150,7 @@ void AlltoallTester::resetBuffers(uint64_t size) { template void 
AlltoallTester::verifyResults(uint64_t size) { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); for (int i = 0; i < n_pes; i++) { for (int j = 0; j < size; j++) { auto r = verify_buf(dest_buf[i * size + j], i); diff --git a/tests/functional_tests/amo_bitwise_tester.cpp b/tests/functional_tests/amo_bitwise_tester.cpp index 625fd9cd7b..426fa2247a 100644 --- a/tests/functional_tests/amo_bitwise_tester.cpp +++ b/tests/functional_tests/amo_bitwise_tester.cpp @@ -23,7 +23,7 @@ #include "amo_bitwise_tester.hpp" #include -#include +#include using namespace rocshmem; @@ -41,13 +41,13 @@ __global__ void AMOBitwiseTest(int loop, int skip, uint64_t *timer, char *r_buf, template AMOBitwiseTester::AMOBitwiseTester(TesterArguments args) : Tester(args) { CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)roc_shmem_malloc(args.max_msg_size); - _s_buf = (T *)roc_shmem_malloc(args.max_msg_size * args.num_wgs); + _r_buf = (char *)rocshmem_malloc(args.max_msg_size); + _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); } template AMOBitwiseTester::~AMOBitwiseTester() { - roc_shmem_free(_r_buf); + rocshmem_free(_r_buf); CHECK_HIP(hipFree(_ret_val)); } @@ -125,48 +125,48 @@ void AMOBitwiseTester::verifyResults(uint64_t size) { __global__ void AMOBitwiseTest( \ int loop, int skip, uint64_t *timer, char *r_buf, T *s_buf, T *ret_val, \ TestType type, ShmemContextType ctx_type) { \ - __shared__ roc_shmem_ctx_t ctx; \ - roc_shmem_wg_init(); \ - roc_shmem_wg_ctx_create(ctx_type, &ctx); \ + __shared__ rocshmem_ctx_t ctx; \ + rocshmem_wg_init(); \ + rocshmem_wg_ctx_create(ctx_type, &ctx); \ if (hipThreadIdx_x == 0) { \ uint64_t start; \ T ret = 0; \ T cond = 0; \ for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) start = roc_shmem_timer(); \ + if (i == skip) start = rocshmem_timer(); \ switch (type) { \ case AMO_FetchAndTestType: \ - ret = 
roc_shmem_ctx_##TNAME##_atomic_fetch_and(ctx, (T *)r_buf, \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_and(ctx, (T *)r_buf, \ 0xFFFF, 1); \ break; \ case AMO_AndTestType: \ - roc_shmem_ctx_##TNAME##_atomic_and(ctx, (T *)r_buf, 0xFFFF, 1); \ + rocshmem_ctx_##TNAME##_atomic_and(ctx, (T *)r_buf, 0xFFFF, 1); \ break; \ case AMO_FetchOrTestType: \ - ret = roc_shmem_ctx_##TNAME##_atomic_fetch_or(ctx, (T *)r_buf, \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_or(ctx, (T *)r_buf, \ 0xFFFF, 1); \ break; \ case AMO_OrTestType: \ - roc_shmem_ctx_##TNAME##_atomic_or(ctx, (T *)r_buf, 0xFFFF, 1); \ + rocshmem_ctx_##TNAME##_atomic_or(ctx, (T *)r_buf, 0xFFFF, 1); \ break; \ case AMO_FetchXorTestType: \ - ret = roc_shmem_ctx_##TNAME##_atomic_fetch_xor(ctx, (T *)r_buf, \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_xor(ctx, (T *)r_buf, \ 0xFFFF, 1); \ break; \ case AMO_XorTestType: \ - roc_shmem_ctx_##TNAME##_atomic_xor(ctx, (T *)r_buf, 0xFFFF, 1); \ + rocshmem_ctx_##TNAME##_atomic_xor(ctx, (T *)r_buf, 0xFFFF, 1); \ break; \ default: \ break; \ } \ } \ - roc_shmem_ctx_quiet(ctx); \ - timer[hipBlockIdx_x] = roc_shmem_timer() - start; \ + rocshmem_ctx_quiet(ctx); \ + timer[hipBlockIdx_x] = rocshmem_timer() - start; \ ret_val[hipBlockIdx_x] = ret; \ - roc_shmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ + rocshmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ } \ - roc_shmem_wg_ctx_destroy(&ctx); \ - roc_shmem_wg_finalize(); \ + rocshmem_wg_ctx_destroy(&ctx); \ + rocshmem_wg_finalize(); \ } \ template class AMOBitwiseTester; diff --git a/tests/functional_tests/amo_extended_tester.cpp b/tests/functional_tests/amo_extended_tester.cpp index 565e09f117..44b3f0865c 100644 --- a/tests/functional_tests/amo_extended_tester.cpp +++ b/tests/functional_tests/amo_extended_tester.cpp @@ -23,7 +23,7 @@ #include "amo_extended_tester.hpp" #include -#include +#include using namespace rocshmem; @@ -41,13 +41,13 @@ __global__ void AMOExtendedTest(int loop, int skip, 
uint64_t *timer, template AMOExtendedTester::AMOExtendedTester(TesterArguments args) : Tester(args) { CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)roc_shmem_malloc(args.max_msg_size); - _s_buf = (T *)roc_shmem_malloc(args.max_msg_size * args.num_wgs); + _r_buf = (char *)rocshmem_malloc(args.max_msg_size); + _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); } template AMOExtendedTester::~AMOExtendedTester() { - roc_shmem_free(_r_buf); + rocshmem_free(_r_buf); CHECK_HIP(hipFree(_ret_val)); } @@ -113,37 +113,37 @@ void AMOExtendedTester::verifyResults(uint64_t size) { __global__ void AMOExtendedTest( \ int loop, int skip, uint64_t *timer, char *r_buf, T *s_buf, T *ret_val, \ TestType type, ShmemContextType ctx_type) { \ - __shared__ roc_shmem_ctx_t ctx; \ - roc_shmem_wg_init(); \ - roc_shmem_wg_ctx_create(ctx_type, &ctx); \ + __shared__ rocshmem_ctx_t ctx; \ + rocshmem_wg_init(); \ + rocshmem_wg_ctx_create(ctx_type, &ctx); \ if (hipThreadIdx_x == 0) { \ uint64_t start; \ T ret = 0; \ T cond = 0; \ for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) start = roc_shmem_timer(); \ + if (i == skip) start = rocshmem_timer(); \ switch (type) { \ case AMO_FetchTestType: \ - ret = roc_shmem_ctx_##TNAME##_atomic_fetch(ctx, (T *)r_buf, 1); \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch(ctx, (T *)r_buf, 1); \ break; \ case AMO_SetTestType: \ - roc_shmem_ctx_##TNAME##_atomic_set(ctx, (T *)r_buf, 44, 1); \ + rocshmem_ctx_##TNAME##_atomic_set(ctx, (T *)r_buf, 44, 1); \ break; \ case AMO_SwapTestType: \ - ret = roc_shmem_ctx_##TNAME##_atomic_swap(ctx, (T *)r_buf, \ + ret = rocshmem_ctx_##TNAME##_atomic_swap(ctx, (T *)r_buf, \ ret + 1, 1); \ break; \ default: \ break; \ } \ } \ - roc_shmem_ctx_quiet(ctx); \ - timer[hipBlockIdx_x] = roc_shmem_timer() - start; \ + rocshmem_ctx_quiet(ctx); \ + timer[hipBlockIdx_x] = rocshmem_timer() - start; \ ret_val[hipBlockIdx_x] = ret; \ - roc_shmem_ctx_getmem(ctx, 
&s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ + rocshmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ } \ - roc_shmem_wg_ctx_destroy(&ctx); \ - roc_shmem_wg_finalize(); \ + rocshmem_wg_ctx_destroy(&ctx); \ + rocshmem_wg_finalize(); \ } \ template class AMOExtendedTester; diff --git a/tests/functional_tests/amo_standard_tester.cpp b/tests/functional_tests/amo_standard_tester.cpp index 17a69f9120..93009b08de 100644 --- a/tests/functional_tests/amo_standard_tester.cpp +++ b/tests/functional_tests/amo_standard_tester.cpp @@ -23,7 +23,7 @@ #include "amo_standard_tester.hpp" #include -#include +#include using namespace rocshmem; @@ -41,13 +41,13 @@ __global__ void AMOStandardTest(int loop, int skip, uint64_t *timer, template AMOStandardTester::AMOStandardTester(TesterArguments args) : Tester(args) { CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)roc_shmem_malloc(args.max_msg_size); - _s_buf = (T *)roc_shmem_malloc(args.max_msg_size * args.num_wgs); + _r_buf = (char *)rocshmem_malloc(args.max_msg_size); + _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); } template AMOStandardTester::~AMOStandardTester() { - roc_shmem_free(_r_buf); + rocshmem_free(_r_buf); CHECK_HIP(hipFree(_ret_val)); } @@ -121,46 +121,46 @@ void AMOStandardTester::verifyResults(uint64_t size) { __global__ void AMOStandardTest( \ int loop, int skip, uint64_t *timer, char *r_buf, T *s_buf, T *ret_val, \ TestType type, ShmemContextType ctx_type) { \ - __shared__ roc_shmem_ctx_t ctx; \ - roc_shmem_wg_init(); \ - roc_shmem_wg_ctx_create(ctx_type, &ctx); \ + __shared__ rocshmem_ctx_t ctx; \ + rocshmem_wg_init(); \ + rocshmem_wg_ctx_create(ctx_type, &ctx); \ if (hipThreadIdx_x == 0) { \ uint64_t start; \ T ret = 0; \ T cond = 0; \ for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) start = roc_shmem_timer(); \ + if (i == skip) start = rocshmem_timer(); \ switch (type) { \ case AMO_FAddTestType: \ - ret = 
roc_shmem_ctx_##TNAME##_atomic_fetch_add(ctx, (T *)r_buf, 2, \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_add(ctx, (T *)r_buf, 2, \ 1); \ break; \ case AMO_FIncTestType: \ ret = \ - roc_shmem_ctx_##TNAME##_atomic_fetch_inc(ctx, (T *)r_buf, 1); \ + rocshmem_ctx_##TNAME##_atomic_fetch_inc(ctx, (T *)r_buf, 1); \ break; \ case AMO_FCswapTestType: \ - ret = roc_shmem_ctx_##TNAME##_atomic_compare_swap(ctx, (T *)r_buf, \ + ret = rocshmem_ctx_##TNAME##_atomic_compare_swap(ctx, (T *)r_buf, \ cond, (T)i, 1); \ cond = i; \ break; \ case AMO_AddTestType: \ - roc_shmem_ctx_##TNAME##_atomic_add(ctx, (T *)r_buf, 2, 1); \ + rocshmem_ctx_##TNAME##_atomic_add(ctx, (T *)r_buf, 2, 1); \ break; \ case AMO_IncTestType: \ - roc_shmem_ctx_##TNAME##_atomic_inc(ctx, (T *)r_buf, 1); \ + rocshmem_ctx_##TNAME##_atomic_inc(ctx, (T *)r_buf, 1); \ break; \ default: \ break; \ } \ } \ - roc_shmem_ctx_quiet(ctx); \ - timer[hipBlockIdx_x] = roc_shmem_timer() - start; \ + rocshmem_ctx_quiet(ctx); \ + timer[hipBlockIdx_x] = rocshmem_timer() - start; \ ret_val[hipBlockIdx_x] = ret; \ - roc_shmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ + rocshmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ } \ - roc_shmem_wg_ctx_destroy(&ctx); \ - roc_shmem_wg_finalize(); \ + rocshmem_wg_ctx_destroy(&ctx); \ + rocshmem_wg_finalize(); \ } \ template class AMOStandardTester; diff --git a/tests/functional_tests/barrier_all_tester.cpp b/tests/functional_tests/barrier_all_tester.cpp index f88f7d52e8..54faaa9ffd 100644 --- a/tests/functional_tests/barrier_all_tester.cpp +++ b/tests/functional_tests/barrier_all_tester.cpp @@ -22,7 +22,7 @@ #include "barrier_all_tester.hpp" -#include +#include using namespace rocshmem; @@ -30,29 +30,29 @@ using namespace rocshmem; * DEVICE TEST KERNEL *****************************************************************************/ __global__ void BarrierAllTest(int loop, int skip, uint64_t *timer) { - __shared__ roc_shmem_ctx_t ctx; + __shared__ 
rocshmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ROC_SHMEM_CTX_WG_PRIVATE, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx); uint64_t start; for (int i = 0; i < loop + skip; i++) { if (hipThreadIdx_x == 0 && i == skip) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } __syncthreads(); - roc_shmem_ctx_wg_barrier_all(ctx); + rocshmem_ctx_wg_barrier_all(ctx); } __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** diff --git a/tests/functional_tests/empty_tester.cpp b/tests/functional_tests/empty_tester.cpp index 1be65a32c0..5bc9facd77 100644 --- a/tests/functional_tests/empty_tester.cpp +++ b/tests/functional_tests/empty_tester.cpp @@ -22,7 +22,7 @@ #include "empty_tester.hpp" -#include +#include using namespace rocshmem; @@ -31,12 +31,12 @@ using namespace rocshmem; *****************************************************************************/ __global__ void EmptyTest(int loop, int skip, uint64_t *timer, int size, TestType type, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** diff --git a/tests/functional_tests/extended_primitives.cpp b/tests/functional_tests/extended_primitives.cpp index 8b540a565a..d3a4a7ba4f 100644 --- a/tests/functional_tests/extended_primitives.cpp +++ b/tests/functional_tests/extended_primitives.cpp @@ -22,7 
+22,7 @@ #include "extended_primitives.hpp" -#include +#include #include @@ -35,9 +35,9 @@ __global__ void ExtendedPrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, char *r_buf, int size, TestType type, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); /** * Calculate start index for each work group for tiled version @@ -50,34 +50,34 @@ __global__ void ExtendedPrimitiveTest(int loop, int skip, uint64_t *timer, r_buf += idx; for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = roc_shmem_timer(); + if (i == skip) start = rocshmem_timer(); switch (type) { case WGGetTestType: - roc_shmem_ctx_getmem_wg(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem_wg(ctx, r_buf, s_buf, size, 1); break; case WGGetNBITestType: - roc_shmem_ctx_getmem_nbi_wg(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem_nbi_wg(ctx, r_buf, s_buf, size, 1); break; case WGPutTestType: - roc_shmem_ctx_putmem_wg(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem_wg(ctx, r_buf, s_buf, size, 1); break; case WGPutNBITestType: - roc_shmem_ctx_putmem_nbi_wg(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem_nbi_wg(ctx, r_buf, s_buf, size, 1); break; default: break; } } - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -85,13 +85,13 @@ __global__ void ExtendedPrimitiveTest(int loop, int skip, uint64_t *timer, *****************************************************************************/ ExtendedPrimitiveTester::ExtendedPrimitiveTester(TesterArguments args) : Tester(args) { - 
s_buf = static_cast(roc_shmem_malloc(args.max_msg_size * args.num_wgs)); - r_buf = static_cast(roc_shmem_malloc(args.max_msg_size * args.num_wgs)); + s_buf = static_cast(rocshmem_malloc(args.max_msg_size * args.num_wgs)); + r_buf = static_cast(rocshmem_malloc(args.max_msg_size * args.num_wgs)); } ExtendedPrimitiveTester::~ExtendedPrimitiveTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); + rocshmem_free(r_buf); } void ExtendedPrimitiveTester::resetBuffers(uint64_t size) { diff --git a/tests/functional_tests/fcollect_tester.cpp b/tests/functional_tests/fcollect_tester.cpp index fecc6d625f..d842569b0a 100644 --- a/tests/functional_tests/fcollect_tester.cpp +++ b/tests/functional_tests/fcollect_tester.cpp @@ -22,21 +22,21 @@ using namespace rocshmem; -roc_shmem_team_t team_fcollect_world_dup; +rocshmem_team_t team_fcollect_world_dup; /* Declare the template with a generic implementation */ template -__device__ void wg_fcollect(roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, +__device__ void wg_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelems) { return; } -/* Define templates to call ROC_SHMEM */ -#define FCOLLECT_DEF_GEN(T, TNAME) \ - template <> \ - __device__ void wg_fcollect(roc_shmem_ctx_t ctx, roc_shmem_team_t team, \ - T * dest, const T *source, int nelem) { \ - roc_shmem_ctx_##TNAME##_wg_fcollect(ctx, team, dest, source, nelem); \ +/* Define templates to call rocSHMEM */ +#define FCOLLECT_DEF_GEN(T, TNAME) \ + template <> \ + __device__ void wg_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team, \ + T * dest, const T *source, int nelem) { \ + rocshmem_ctx_##TNAME##_wg_fcollect(ctx, team, dest, source, nelem); \ } FCOLLECT_DEF_GEN(float, float) @@ -60,19 +60,19 @@ FCOLLECT_DEF_GEN(unsigned long long, ulonglong) template __global__ void FcollectTest(int loop, int skip, uint64_t *timer, T1 *source_buf, T1 *dest_buf, int size, - ShmemContextType ctx_type, roc_shmem_team_t team) { - 
__shared__ roc_shmem_ctx_t ctx; + ShmemContextType ctx_type, rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - int n_pes = roc_shmem_ctx_n_pes(ctx); + int n_pes = rocshmem_ctx_n_pes(ctx); __syncthreads(); uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } wg_fcollect(ctx, team, dest_buf, // T* dest @@ -83,11 +83,11 @@ __global__ void FcollectTest(int loop, int skip, uint64_t *timer, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -98,24 +98,24 @@ FcollectTester::FcollectTester( TesterArguments args, std::function f1, std::function(const T1 &, T1)> f2) : Tester(args), init_buf{f1}, verify_buf{f2} { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); - source_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); - dest_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + source_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); + dest_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); } template FcollectTester::~FcollectTester() { - roc_shmem_free(source_buf); - roc_shmem_free(dest_buf); + rocshmem_free(source_buf); + rocshmem_free(dest_buf); } template void FcollectTester::preLaunchKernel() { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); bw_factor = sizeof(T1) * n_pes; - team_fcollect_world_dup = ROC_SHMEM_TEAM_INVALID; - 
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + team_fcollect_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_fcollect_world_dup); } @@ -134,12 +134,12 @@ void FcollectTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, template void FcollectTester::postLaunchKernel() { - roc_shmem_team_destroy(team_fcollect_world_dup); + rocshmem_team_destroy(team_fcollect_world_dup); } template void FcollectTester::resetBuffers(uint64_t size) { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); for (int i = 0; i < n_pes; i++) { for (int j = 0; j < size; j++) { // Note: This is redundant work, @@ -151,7 +151,7 @@ void FcollectTester::resetBuffers(uint64_t size) { template void FcollectTester::verifyResults(uint64_t size) { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); for (int i = 0; i < n_pes; i++) { for (int j = 0; j < size; j++) { auto r = verify_buf(dest_buf[i * size + j], i); diff --git a/tests/functional_tests/ping_all_tester.cpp b/tests/functional_tests/ping_all_tester.cpp index 36ffc4ed98..858882c7c7 100644 --- a/tests/functional_tests/ping_all_tester.cpp +++ b/tests/functional_tests/ping_all_tester.cpp @@ -22,7 +22,7 @@ #include "ping_all_tester.hpp" -#include +#include using namespace rocshmem; @@ -31,13 +31,13 @@ using namespace rocshmem; *****************************************************************************/ __global__ void PingAllTest(int loop, int skip, uint64_t *timer, int *r_buf, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; + __shared__ rocshmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - int pe = roc_shmem_ctx_my_pe(ctx); - int num_pe = roc_shmem_ctx_n_pes(ctx); + int pe = rocshmem_ctx_my_pe(ctx); + int 
num_pe = rocshmem_ctx_n_pes(ctx); int status[1024]; for (int j{0}; j < num_pe; j++) { status[j] = 0; @@ -49,32 +49,32 @@ __global__ void PingAllTest(int loop, int skip, uint64_t *timer, int *r_buf, for (int i = 0; i < loop + skip; i++) { if (i == skip) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } for (int j{0}; j < num_pe; j++) { - roc_shmem_ctx_int_p(ctx, &r_buf[blk_pe_off + pe], 1, j); + rocshmem_ctx_int_p(ctx, &r_buf[blk_pe_off + pe], 1, j); } - roc_shmem_int_wait_until_all(&r_buf[blk_pe_off], num_pe, status, ROC_SHMEM_CMP_EQ, 1); + rocshmem_int_wait_until_all(&r_buf[blk_pe_off], num_pe, status, ROCSHMEM_CMP_EQ, 1); } - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ PingAllTester::PingAllTester(TesterArguments args) : Tester(args) { - int num_pes {roc_shmem_n_pes()}; - r_buf = (int *)roc_shmem_malloc(sizeof(int) * args.wg_size * num_pes); + int num_pes {rocshmem_n_pes()}; + r_buf = (int *)rocshmem_malloc(sizeof(int) * args.wg_size * num_pes); } -PingAllTester::~PingAllTester() { roc_shmem_free(r_buf); } +PingAllTester::~PingAllTester() { rocshmem_free(r_buf); } void PingAllTester::resetBuffers(uint64_t size) { - int num_pes {roc_shmem_n_pes()}; + int num_pes {rocshmem_n_pes()}; memset(r_buf, 0, sizeof(int) * args.wg_size * num_pes); } diff --git a/tests/functional_tests/ping_pong_tester.cpp b/tests/functional_tests/ping_pong_tester.cpp index 064f8976f9..f0e6ebb1ea 100644 --- a/tests/functional_tests/ping_pong_tester.cpp +++ b/tests/functional_tests/ping_pong_tester.cpp @@ -22,7 +22,7 @@ #include "ping_pong_tester.hpp" -#include +#include using namespace rocshmem; @@ -31,45 +31,45 @@ using 
namespace rocshmem; *****************************************************************************/ __global__ void PingPongTest(int loop, int skip, uint64_t *timer, int *r_buf, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; + __shared__ rocshmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - int pe = roc_shmem_ctx_my_pe(ctx); + int pe = rocshmem_ctx_my_pe(ctx); if (hipThreadIdx_x == 0) { uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } if (pe == 0) { - roc_shmem_ctx_int_p(ctx, &r_buf[hipBlockIdx_x], i + 1, 1); - roc_shmem_int_wait_until(&r_buf[hipBlockIdx_x], ROC_SHMEM_CMP_EQ, + rocshmem_ctx_int_p(ctx, &r_buf[hipBlockIdx_x], i + 1, 1); + rocshmem_int_wait_until(&r_buf[hipBlockIdx_x], ROCSHMEM_CMP_EQ, i + 1); } else { - roc_shmem_int_wait_until(&r_buf[hipBlockIdx_x], ROC_SHMEM_CMP_EQ, + rocshmem_int_wait_until(&r_buf[hipBlockIdx_x], ROCSHMEM_CMP_EQ, i + 1); - roc_shmem_ctx_int_p(ctx, &r_buf[hipBlockIdx_x], i + 1, 0); + rocshmem_ctx_int_p(ctx, &r_buf[hipBlockIdx_x], i + 1, 0); } } - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ PingPongTester::PingPongTester(TesterArguments args) : Tester(args) { - r_buf = (int *)roc_shmem_malloc(sizeof(int) * args.wg_size); + r_buf = (int *)rocshmem_malloc(sizeof(int) * args.wg_size); } -PingPongTester::~PingPongTester() { roc_shmem_free(r_buf); } +PingPongTester::~PingPongTester() { rocshmem_free(r_buf); } void PingPongTester::resetBuffers(uint64_t size) { memset(r_buf, 0, 
sizeof(int) * args.wg_size); diff --git a/tests/functional_tests/primitive_mr_tester.cpp b/tests/functional_tests/primitive_mr_tester.cpp index 4152b5e10f..0563be2471 100644 --- a/tests/functional_tests/primitive_mr_tester.cpp +++ b/tests/functional_tests/primitive_mr_tester.cpp @@ -22,7 +22,7 @@ #include "primitive_mr_tester.hpp" -#include +#include using namespace rocshmem; @@ -32,42 +32,42 @@ using namespace rocshmem; __global__ void PrimitiveMRTest(int loop, uint64_t *timer, char *s_buf, char *r_buf, int size, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); if (hipThreadIdx_x == 0) { uint64_t start; - start = roc_shmem_timer(); + start = rocshmem_timer(); for (int win_i = 0; win_i < 64 * loop; win_i++) { for (int i = 0; i < 64; i++) { - roc_shmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); } - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); } - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } __syncthreads(); - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ PrimitiveMRTester::PrimitiveMRTester(TesterArguments args) : Tester(args) { - s_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); - r_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); + s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); + r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); } PrimitiveMRTester::~PrimitiveMRTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); 
+ rocshmem_free(r_buf); } void PrimitiveMRTester::resetBuffers(size_t size) { diff --git a/tests/functional_tests/primitive_tester.cpp b/tests/functional_tests/primitive_tester.cpp index 5f97583601..b6598411e9 100644 --- a/tests/functional_tests/primitive_tester.cpp +++ b/tests/functional_tests/primitive_tester.cpp @@ -22,7 +22,7 @@ #include "primitive_tester.hpp" -#include +#include using namespace rocshmem; @@ -32,40 +32,40 @@ using namespace rocshmem; __global__ void PrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, char *r_buf, int size, TestType type, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip) { __syncthreads(); - start = roc_shmem_timer(); + start = rocshmem_timer(); } switch (type) { case GetTestType: - roc_shmem_ctx_getmem(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem(ctx, r_buf, s_buf, size, 1); break; case GetNBITestType: - roc_shmem_ctx_getmem_nbi(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem_nbi(ctx, r_buf, s_buf, size, 1); break; case PutTestType: - roc_shmem_ctx_putmem(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem(ctx, r_buf, s_buf, size, 1); break; case PutNBITestType: - roc_shmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); break; case PTestType: for (int s = 0; s < size; s++) { char val = s_buf[s]; - roc_shmem_ctx_char_p(ctx, &r_buf[s], val, 1); + rocshmem_ctx_char_p(ctx, &r_buf[s], val, 1); } break; case GTestType: for (int s = 0; s < size; s++) { - char ret = roc_shmem_ctx_char_g(ctx, &s_buf[s], 1); + char ret = rocshmem_ctx_char_g(ctx, &s_buf[s], 1); r_buf[s] = ret; } break; @@ -74,29 +74,29 @@ __global__ void PrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, } } - roc_shmem_ctx_quiet(ctx); + 
rocshmem_ctx_quiet(ctx); __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ PrimitiveTester::PrimitiveTester(TesterArguments args) : Tester(args) { - s_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); - r_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); + s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); + r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); } PrimitiveTester::~PrimitiveTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); + rocshmem_free(r_buf); } void PrimitiveTester::resetBuffers(uint64_t size) { diff --git a/tests/functional_tests/random_access_tester.cpp b/tests/functional_tests/random_access_tester.cpp index 7f2891b268..f25d834d6f 100644 --- a/tests/functional_tests/random_access_tester.cpp +++ b/tests/functional_tests/random_access_tester.cpp @@ -21,7 +21,7 @@ *****************************************************************************/ #include "random_access_tester.hpp" -#include +#include using namespace rocshmem; @@ -55,11 +55,11 @@ __global__ void RandomAccessTest(int loop, int skip, uint64_t *timer, uint32_t *threads_bins, uint32_t *off_bins, uint32_t *PE_bins, ShmemContextType ctx_type) { uint64_t start; - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - int pe = roc_shmem_ctx_my_pe(ctx); + int pe = rocshmem_ctx_my_pe(ctx); int offset; int PE; @@ -69,26 +69,26 @@ __global__ void 
RandomAccessTest(int loop, int skip, uint64_t *timer, r_buf = r_buf + offset; for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = roc_shmem_timer(); + if (i == skip) start = rocshmem_timer(); switch (type) { case GetType: - roc_shmem_ctx_getmem(ctx, r_buf, s_buf, size, PE); + rocshmem_ctx_getmem(ctx, r_buf, s_buf, size, PE); break; case PutType: - roc_shmem_ctx_putmem(ctx, (char *)r_buf, (char *)s_buf, size, PE); + rocshmem_ctx_putmem(ctx, (char *)r_buf, (char *)s_buf, size, PE); break; default: break; } } - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); atomicAdd((unsigned long long *)&timer[hipBlockIdx_x], - roc_shmem_timer() - start); + rocshmem_timer() - start); } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -131,8 +131,8 @@ RandomAccessTester::RandomAccessTester(TesterArguments args) : Tester(args) { _num_bins = args.thread_access / args.coal_coef; assert((args.wg_size / 64) <= 1); - s_buf = (int *)roc_shmem_malloc(max_size * wg_size * space); - r_buf = (int *)roc_shmem_malloc(max_size * wg_size * space); + s_buf = (int *)rocshmem_malloc(max_size * wg_size * space); + r_buf = (int *)rocshmem_malloc(max_size * wg_size * space); h_buf = (int *)malloc(max_size * wg_size * space); h_dev_buf = (int *)malloc(max_size * wg_size * space); CHECK_HIP(hipMalloc((void **)&_threads_bins, sizeof(uint32_t) * _num_waves * _num_bins)); @@ -144,8 +144,8 @@ RandomAccessTester::RandomAccessTester(TesterArguments args) : Tester(args) { } RandomAccessTester::~RandomAccessTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); + rocshmem_free(r_buf); free(h_buf); free(h_dev_buf); CHECK_HIP(hipFree(_threads_bins)); diff --git a/tests/functional_tests/shmem_ptr_tester.cpp b/tests/functional_tests/shmem_ptr_tester.cpp index e715714ab9..564354825e 100644 --- 
a/tests/functional_tests/shmem_ptr_tester.cpp +++ b/tests/functional_tests/shmem_ptr_tester.cpp @@ -21,7 +21,7 @@ *****************************************************************************/ #include "shmem_ptr_tester.hpp" -#include +#include using namespace rocshmem; @@ -29,18 +29,18 @@ using namespace rocshmem; * DEVICE TEST KERNEL *****************************************************************************/ __global__ void ShmemPtrTest(char *r_buf, int *available) { - roc_shmem_wg_init(); + rocshmem_wg_init(); if (hipThreadIdx_x == 0) { char *local_addr = r_buf + 4; - void *remote_addr = roc_shmem_ptr((void *)local_addr, 1); + void *remote_addr = rocshmem_ptr((void *)local_addr, 1); if (remote_addr != NULL) { *available = 1; ((char *)remote_addr)[0] = '1'; } } - roc_shmem_wg_finalize(); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -48,12 +48,12 @@ __global__ void ShmemPtrTest(char *r_buf, int *available) { *****************************************************************************/ ShmemPtrTester::ShmemPtrTester(TesterArguments args) : Tester(args) { CHECK_HIP(hipMalloc((void **)&_available, sizeof(int))); - r_buf = (char *)roc_shmem_malloc(args.max_msg_size); + r_buf = (char *)rocshmem_malloc(args.max_msg_size); } ShmemPtrTester::~ShmemPtrTester() { CHECK_HIP(hipFree(_available)); - roc_shmem_free(r_buf); + rocshmem_free(r_buf); } void ShmemPtrTester::resetBuffers(uint64_t size) { diff --git a/tests/functional_tests/swarm_tester.cpp b/tests/functional_tests/swarm_tester.cpp index b6c9f3357a..8f7cd16c66 100644 --- a/tests/functional_tests/swarm_tester.cpp +++ b/tests/functional_tests/swarm_tester.cpp @@ -22,7 +22,7 @@ #include "swarm_tester.hpp" -#include +#include using namespace rocshmem; @@ -31,13 +31,13 @@ using namespace rocshmem; *****************************************************************************/ __global__ void GetSwarmTest(int loop, int skip, uint64_t *timer, char *s_buf, 
char *r_buf, int size, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; + __shared__ rocshmem_ctx_t ctx; int provided; - roc_shmem_wg_init_thread(ROC_SHMEM_THREAD_MULTIPLE, &provided); - assert(provided == ROC_SHMEM_THREAD_MULTIPLE); + rocshmem_wg_init_thread(ROCSHMEM_THREAD_MULTIPLE, &provided); + assert(provided == ROCSHMEM_THREAD_MULTIPLE); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_ctx_create(ctx_type, &ctx); __syncthreads(); @@ -45,18 +45,18 @@ __global__ void GetSwarmTest(int loop, int skip, uint64_t *timer, char *s_buf, uint64_t start = 0; for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = roc_shmem_timer(); + if (i == skip) start = rocshmem_timer(); - roc_shmem_ctx_getmem(ctx, &r_buf[index], &s_buf[index], size, 1); + rocshmem_ctx_getmem(ctx, &r_buf[index], &s_buf[index], size, 1); __syncthreads(); } atomicAdd((unsigned long long *)&timer[hipBlockIdx_x], - roc_shmem_timer() - start); + rocshmem_timer() - start); - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** diff --git a/tests/functional_tests/sync_tester.cpp b/tests/functional_tests/sync_tester.cpp index 54d4a99f99..d481862eaf 100644 --- a/tests/functional_tests/sync_tester.cpp +++ b/tests/functional_tests/sync_tester.cpp @@ -22,33 +22,33 @@ #include "sync_tester.hpp" -#include +#include using namespace rocshmem; -roc_shmem_team_t team_sync_world_dup; +rocshmem_team_t team_sync_world_dup; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ __global__ void SyncTest(int loop, int skip, uint64_t *timer, TestType type, - ShmemContextType ctx_type, roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + ShmemContextType 
ctx_type, rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); uint64_t start; for (int i = 0; i < loop + skip; i++) { if (hipThreadIdx_x == 0 && i == skip) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } __syncthreads(); switch (type) { case SyncAllTestType: - roc_shmem_ctx_wg_sync_all(ctx); + rocshmem_ctx_wg_sync_all(ctx); break; case SyncTestType: - roc_shmem_ctx_wg_team_sync(ctx, team); + rocshmem_ctx_wg_team_sync(ctx, team); break; default: break; @@ -57,11 +57,11 @@ __global__ void SyncTest(int loop, int skip, uint64_t *timer, TestType type, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -77,10 +77,10 @@ void SyncTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_sync_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + team_sync_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_sync_world_dup); hipLaunchKernelGGL(SyncTest, gridSize, blockSize, shared_bytes, stream, loop, diff --git a/tests/functional_tests/team_broadcast_tester.cpp b/tests/functional_tests/team_broadcast_tester.cpp index b81fa057e3..b744936bf1 100644 --- a/tests/functional_tests/team_broadcast_tester.cpp +++ b/tests/functional_tests/team_broadcast_tester.cpp @@ -24,20 +24,20 @@ using namespace rocshmem; /* Declare the template with a generic implementation */ template -__device__ void 
wg_team_broadcast(roc_shmem_ctx_t ctx, roc_shmem_team_t team, +__device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem, int pe_root) { return; } -/* Define templates to call ROC_SHMEM */ -#define TEAM_BROADCAST_DEF_GEN(T, TNAME) \ - template <> \ - __device__ void wg_team_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ - int nelem, int pe_root) { \ - roc_shmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem, \ - pe_root); \ +/* Define templates to call ROCSHMEM */ +#define TEAM_BROADCAST_DEF_GEN(T, TNAME) \ + template <> \ + __device__ void wg_team_broadcast( \ + rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ + int nelem, int pe_root) { \ + rocshmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem, \ + pe_root); \ } TEAM_BROADCAST_DEF_GEN(float, float) @@ -55,7 +55,7 @@ TEAM_BROADCAST_DEF_GEN(unsigned int, uint) TEAM_BROADCAST_DEF_GEN(unsigned long, ulong) TEAM_BROADCAST_DEF_GEN(unsigned long long, ulonglong) -roc_shmem_team_t team_bcast_world_dup; +rocshmem_team_t team_bcast_world_dup; /****************************************************************************** * DEVICE TEST KERNEL @@ -64,20 +64,20 @@ template __global__ void TeamBroadcastTest(int loop, int skip, uint64_t *timer, T1 *source_buf, T1 *dest_buf, int size, ShmemContextType ctx_type, - roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; + rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - int n_pes = roc_shmem_ctx_n_pes(ctx); + int n_pes = rocshmem_ctx_n_pes(ctx); __syncthreads(); uint64_t start; for (int i = 0; i < loop; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } wg_team_broadcast(ctx, team, @@ -85,17 +85,17 @@ __global__ void TeamBroadcastTest(int loop, int 
skip, uint64_t *timer, source_buf, // const T* source size, // int nelement 0); // int PE_root - roc_shmem_ctx_wg_barrier_all(ctx); + rocshmem_ctx_wg_barrier_all(ctx); } __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -106,22 +106,22 @@ TeamBroadcastTester::TeamBroadcastTester( TesterArguments args, std::function f1, std::function(const T1 &)> f2) : Tester(args), init_buf{f1}, verify_buf{f2} { - source_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); - dest_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); + source_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); + dest_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); } template TeamBroadcastTester::~TeamBroadcastTester() { - roc_shmem_free(source_buf); - roc_shmem_free(dest_buf); + rocshmem_free(source_buf); + rocshmem_free(dest_buf); } template void TeamBroadcastTester::preLaunchKernel() { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_bcast_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + team_bcast_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_bcast_world_dup); } @@ -140,7 +140,7 @@ void TeamBroadcastTester::launchKernel(dim3 gridSize, dim3 blockSize, template void TeamBroadcastTester::postLaunchKernel() { - roc_shmem_team_destroy(team_bcast_world_dup); + rocshmem_team_destroy(team_bcast_world_dup); } template diff --git a/tests/functional_tests/team_ctx_infra_tester.cpp b/tests/functional_tests/team_ctx_infra_tester.cpp index 6e0fe0c45f..7920adeb6c 
100644 --- a/tests/functional_tests/team_ctx_infra_tester.cpp +++ b/tests/functional_tests/team_ctx_infra_tester.cpp @@ -24,34 +24,34 @@ #include -#include +#include using namespace rocshmem; -/* this constant should equal ROC_SHMEM_MAX_NUM_TEAMS-1 */ +/* this constant should equal ROCSHMEM_MAX_NUM_TEAMS-1 */ #define NUM_TEAMS 39 -roc_shmem_team_t team_world_dup[NUM_TEAMS]; +rocshmem_team_t team_world_dup[NUM_TEAMS]; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ __global__ void TeamCtxInfraTest(ShmemContextType ctx_type, - roc_shmem_team_t *team) { - __shared__ roc_shmem_ctx_t ctx1, ctx2, ctx3; - __shared__ roc_shmem_ctx_t ctx[NUM_TEAMS]; + rocshmem_team_t *team) { + __shared__ rocshmem_ctx_t ctx1, ctx2, ctx3; + __shared__ rocshmem_ctx_t ctx[NUM_TEAMS]; - roc_shmem_wg_init(); + rocshmem_wg_init(); /** * Test 1: Assert team infos of different ctxs * from the same team are the same. */ - roc_shmem_wg_team_create_ctx(team[0], ctx_type, &ctx1); - roc_shmem_wg_team_create_ctx(team[0], ctx_type, &ctx2); - roc_shmem_wg_ctx_destroy(&ctx1); - roc_shmem_wg_team_create_ctx(team[0], ctx_type, &ctx3); + rocshmem_wg_team_create_ctx(team[0], ctx_type, &ctx1); + rocshmem_wg_team_create_ctx(team[0], ctx_type, &ctx2); + rocshmem_wg_ctx_destroy(&ctx1); + rocshmem_wg_team_create_ctx(team[0], ctx_type, &ctx3); __syncthreads(); @@ -60,8 +60,8 @@ __global__ void TeamCtxInfraTest(ShmemContextType ctx_type, abort(); } - roc_shmem_wg_ctx_destroy(&ctx2); - roc_shmem_wg_ctx_destroy(&ctx3); + rocshmem_wg_ctx_destroy(&ctx2); + rocshmem_wg_ctx_destroy(&ctx3); __syncthreads(); @@ -70,7 +70,7 @@ __global__ void TeamCtxInfraTest(ShmemContextType ctx_type, * from different teams are different. 
*/ for (int team_i = 0; team_i < NUM_TEAMS; team_i++) { - roc_shmem_wg_team_create_ctx(team[team_i], ctx_type, &ctx[team_i]); + rocshmem_wg_team_create_ctx(team[team_i], ctx_type, &ctx[team_i]); } if (ctx[0].team_opaque == ctx[NUM_TEAMS - 1].team_opaque) { @@ -82,10 +82,10 @@ __global__ void TeamCtxInfraTest(ShmemContextType ctx_type, __syncthreads(); for (int team_i = 0; team_i < NUM_TEAMS; team_i++) { - roc_shmem_wg_ctx_destroy(&ctx[team_i]); + rocshmem_wg_ctx_destroy(&ctx[team_i]); } - roc_shmem_wg_finalize(); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -98,23 +98,23 @@ TeamCtxInfraTester::~TeamCtxInfraTester() {} void TeamCtxInfraTester::resetBuffers(uint64_t size) {} void TeamCtxInfraTester::preLaunchKernel() { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); for (int team_i = 0; team_i < NUM_TEAMS; team_i++) { - team_world_dup[team_i] = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + team_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_world_dup[team_i]); - if (team_world_dup[team_i] == ROC_SHMEM_TEAM_INVALID) { + if (team_world_dup[team_i] == ROCSHMEM_TEAM_INVALID) { printf("Team %d is invalid!\n", team_i); abort(); } } /* Assert the failure of a new team creation. 
*/ - roc_shmem_team_t new_team = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + rocshmem_team_t new_team = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &new_team); - if (new_team != ROC_SHMEM_TEAM_INVALID) { + if (new_team != ROCSHMEM_TEAM_INVALID) { printf("new team is not invalid\n"); abort(); } @@ -125,10 +125,10 @@ void TeamCtxInfraTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, size_t shared_bytes = 0; /* Copy array of teams to device */ - roc_shmem_team_t *teams_on_device; - CHECK_HIP(hipMalloc(&teams_on_device, sizeof(roc_shmem_team_t) * NUM_TEAMS)); + rocshmem_team_t *teams_on_device; + CHECK_HIP(hipMalloc(&teams_on_device, sizeof(rocshmem_team_t) * NUM_TEAMS)); CHECK_HIP(hipMemcpy(teams_on_device, team_world_dup, - sizeof(roc_shmem_team_t) * NUM_TEAMS, hipMemcpyHostToDevice)); + sizeof(rocshmem_team_t) * NUM_TEAMS, hipMemcpyHostToDevice)); hipLaunchKernelGGL(TeamCtxInfraTest, gridSize, blockSize, shared_bytes, stream, _shmem_context, teams_on_device); @@ -138,7 +138,7 @@ void TeamCtxInfraTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, void TeamCtxInfraTester::postLaunchKernel() { for (int team_i = 0; team_i < NUM_TEAMS; team_i++) { - roc_shmem_team_destroy(team_world_dup[team_i]); + rocshmem_team_destroy(team_world_dup[team_i]); } } diff --git a/tests/functional_tests/team_ctx_primitive_tester.cpp b/tests/functional_tests/team_ctx_primitive_tester.cpp index e70b6796af..073d49a625 100644 --- a/tests/functional_tests/team_ctx_primitive_tester.cpp +++ b/tests/functional_tests/team_ctx_primitive_tester.cpp @@ -22,11 +22,11 @@ #include "team_ctx_primitive_tester.hpp" -#include +#include using namespace rocshmem; -roc_shmem_team_t team_primitive_world_dup; +rocshmem_team_t team_primitive_world_dup; /****************************************************************************** * DEVICE TEST KERNEL @@ -34,42 +34,42 @@ 
roc_shmem_team_t team_primitive_world_dup; __global__ void TeamCtxPrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, char *r_buf, int size, TestType type, ShmemContextType ctx_type, - roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_team_create_ctx(team, ctx_type, &ctx); + rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_team_create_ctx(team, ctx_type, &ctx); if (hipThreadIdx_x == 0) { uint64_t start; for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = roc_shmem_timer(); + if (i == skip) start = rocshmem_timer(); switch (type) { case TeamCtxGetTestType: - roc_shmem_ctx_getmem(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem(ctx, r_buf, s_buf, size, 1); break; case TeamCtxGetNBITestType: - roc_shmem_ctx_getmem_nbi(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem_nbi(ctx, r_buf, s_buf, size, 1); break; case TeamCtxPutTestType: - roc_shmem_ctx_putmem(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem(ctx, r_buf, s_buf, size, 1); break; case TeamCtxPutNBITestType: - roc_shmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); break; default: break; } } - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -77,13 +77,13 @@ __global__ void TeamCtxPrimitiveTest(int loop, int skip, uint64_t *timer, *****************************************************************************/ TeamCtxPrimitiveTester::TeamCtxPrimitiveTester(TesterArguments args) : Tester(args) { - s_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); - r_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size); + 
s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); + r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); } TeamCtxPrimitiveTester::~TeamCtxPrimitiveTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); + rocshmem_free(r_buf); } void TeamCtxPrimitiveTester::resetBuffers(uint64_t size) { @@ -92,10 +92,10 @@ void TeamCtxPrimitiveTester::resetBuffers(uint64_t size) { } void TeamCtxPrimitiveTester::preLaunchKernel() { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_primitive_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + team_primitive_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_primitive_world_dup); } @@ -112,7 +112,7 @@ void TeamCtxPrimitiveTester::launchKernel(dim3 gridSize, dim3 blockSize, } void TeamCtxPrimitiveTester::postLaunchKernel() { - roc_shmem_team_destroy(team_primitive_world_dup); + rocshmem_team_destroy(team_primitive_world_dup); } void TeamCtxPrimitiveTester::verifyResults(uint64_t size) { diff --git a/tests/functional_tests/team_reduction_tester.cpp b/tests/functional_tests/team_reduction_tester.cpp index 6b9d56d225..3b86d7592c 100644 --- a/tests/functional_tests/team_reduction_tester.cpp +++ b/tests/functional_tests/team_reduction_tester.cpp @@ -23,32 +23,32 @@ using namespace rocshmem; /* Declare the template with a generic implementation */ -template -__device__ int wg_team_reduce(roc_shmem_ctx_t ctx, roc_shmem_team_t, T *dest, +template +__device__ int wg_team_reduce(rocshmem_ctx_t ctx, rocshmem_team_t, T *dest, const T *source, int nreduce) { - return ROC_SHMEM_SUCCESS; + return ROCSHMEM_SUCCESS; } -/* Define templates to call ROC_SHMEM */ +/* Define templates to call rocSHMEM */ #define TEAM_REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ template <> \ - __device__ int 
wg_team_reduce(roc_shmem_ctx_t ctx, \ - roc_shmem_team_t team, T * dest, \ + __device__ int wg_team_reduce(rocshmem_ctx_t ctx, \ + rocshmem_team_t team, T * dest, \ const T *source, int nreduce) { \ - return roc_shmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest, \ + return rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest, \ source, nreduce); \ } #define TEAM_ARITH_REDUCTION_DEF_GEN(T, TNAME) \ - TEAM_REDUCTION_DEF_GEN(T, TNAME, sum, ROC_SHMEM_SUM) \ - TEAM_REDUCTION_DEF_GEN(T, TNAME, min, ROC_SHMEM_MIN) \ - TEAM_REDUCTION_DEF_GEN(T, TNAME, max, ROC_SHMEM_MAX) \ - TEAM_REDUCTION_DEF_GEN(T, TNAME, prod, ROC_SHMEM_PROD) + TEAM_REDUCTION_DEF_GEN(T, TNAME, sum, ROCSHMEM_SUM) \ + TEAM_REDUCTION_DEF_GEN(T, TNAME, min, ROCSHMEM_MIN) \ + TEAM_REDUCTION_DEF_GEN(T, TNAME, max, ROCSHMEM_MAX) \ + TEAM_REDUCTION_DEF_GEN(T, TNAME, prod, ROCSHMEM_PROD) #define TEAM_BITWISE_REDUCTION_DEF_GEN(T, TNAME) \ - TEAM_REDUCTION_DEF_GEN(T, TNAME, or, ROC_SHMEM_OR) \ - TEAM_REDUCTION_DEF_GEN(T, TNAME, and, ROC_SHMEM_AND) \ - TEAM_REDUCTION_DEF_GEN(T, TNAME, xor, ROC_SHMEM_XOR) + TEAM_REDUCTION_DEF_GEN(T, TNAME, or, ROCSHMEM_OR) \ + TEAM_REDUCTION_DEF_GEN(T, TNAME, and, ROCSHMEM_AND) \ + TEAM_REDUCTION_DEF_GEN(T, TNAME, xor, ROCSHMEM_XOR) #define TEAM_INT_REDUCTION_DEF_GEN(T, TNAME) \ TEAM_ARITH_REDUCTION_DEF_GEN(T, TNAME) \ @@ -67,72 +67,72 @@ TEAM_FLOAT_REDUCTION_DEF_GEN(double, double) // so disable it for now. 
// FLOAT_REDUCTION_DEF_GEN(long double, longdouble) -roc_shmem_team_t team_reduce_world_dup; +rocshmem_team_t team_reduce_world_dup; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -template +template __global__ void TeamReductionTest(int loop, int skip, uint64_t *timer, T1 *s_buf, T1 *r_buf, int size, TestType type, ShmemContextType ctx_type, - roc_shmem_team_t team) { - __shared__ roc_shmem_ctx_t ctx; + rocshmem_team_t team) { + __shared__ rocshmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); - int n_pes = roc_shmem_ctx_n_pes(ctx); + int n_pes = rocshmem_ctx_n_pes(ctx); __syncthreads(); uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = roc_shmem_timer(); + start = rocshmem_timer(); } wg_team_reduce(ctx, team, r_buf, s_buf, size); - roc_shmem_ctx_wg_barrier_all(ctx); + rocshmem_ctx_wg_barrier_all(ctx); } __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ -template +template TeamReductionTester::TeamReductionTester( TesterArguments args, std::function f1, std::function(const T1 &, const T1 &)> f2) : Tester(args), init_buf{f1}, verify_buf{f2} { - s_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); - r_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); + s_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); + r_buf = (T1 *)rocshmem_malloc(args.max_msg_size * 
sizeof(T1)); } -template +template TeamReductionTester::~TeamReductionTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); + rocshmem_free(r_buf); } -template +template void TeamReductionTester::preLaunchKernel() { - int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID; - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + team_reduce_world_dup = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_reduce_world_dup); } -template +template void TeamReductionTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; @@ -146,21 +146,21 @@ void TeamReductionTester::launchKernel(dim3 gridSize, dim3 blockSize, num_timed_msgs = loop; } -template +template void TeamReductionTester::postLaunchKernel() { - roc_shmem_team_destroy(team_reduce_world_dup); + rocshmem_team_destroy(team_reduce_world_dup); } -template +template void TeamReductionTester::resetBuffers(uint64_t size) { for (int i = 0; i < args.max_msg_size; i++) { init_buf(s_buf[i], r_buf[i]); } } -template +template void TeamReductionTester::verifyResults(uint64_t size) { - int n_pes = roc_shmem_n_pes(); + int n_pes = rocshmem_n_pes(); for (int i = 0; i < size; i++) { auto r = verify_buf(r_buf[i], (T1)n_pes); if (r.first == false) { diff --git a/tests/functional_tests/team_reduction_tester.hpp b/tests/functional_tests/team_reduction_tester.hpp index cb44e9ff17..9b7f9460ba 100644 --- a/tests/functional_tests/team_reduction_tester.hpp +++ b/tests/functional_tests/team_reduction_tester.hpp @@ -31,7 +31,7 @@ /****************************************************************************** * HOST TESTER CLASS *****************************************************************************/ -template +template class TeamReductionTester : public Tester { public: explicit 
TeamReductionTester( diff --git a/tests/functional_tests/test_driver.cpp b/tests/functional_tests/test_driver.cpp index 7acef2dbc0..6d4129a32e 100644 --- a/tests/functional_tests/test_driver.cpp +++ b/tests/functional_tests/test_driver.cpp @@ -20,7 +20,7 @@ * IN THE SOFTWARE. *****************************************************************************/ -#include +#include #include #include "tester.hpp" @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) { /*** * Select a GPU */ - int rank = roc_shmem_my_pe(); + int rank = rocshmem_my_pe(); int ndevices, my_device = 0; CHECK_HIP(hipGetDeviceCount(&ndevices)); my_device = rank % ndevices; @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) { /** * Must initialize rocshmem to access arguments needed by the tester. */ - roc_shmem_init(); + rocshmem_init(); /** * Now grab the arguments from rocshmem. @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { * The rocshmem library needs to be cleaned up with this call. It pairs * with the init function above. 
*/ - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/functional_tests/tester.cpp b/tests/functional_tests/tester.cpp index b73ae67b6c..5c848139b1 100644 --- a/tests/functional_tests/tester.cpp +++ b/tests/functional_tests/tester.cpp @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include "alltoall_tester.hpp" @@ -140,7 +140,7 @@ std::vector Tester::create(TesterArguments args) { case TeamReductionTestType: if (rank == 0) std::cout << "All-to-All Team-based Reduction ###" << std::endl; - testers.push_back(new TeamReductionTester( + testers.push_back(new TeamReductionTester( args, [](float& f1, float& f2) { f1 = 1; @@ -480,8 +480,8 @@ void Tester::execute() { printf("error = %d \n", err); } - // roc_shmem_dump_stats(); - roc_shmem_reset_stats(); + // rocshmem_dump_stats(); + rocshmem_reset_stats(); } barrier(); diff --git a/tests/functional_tests/tester.hpp b/tests/functional_tests/tester.hpp index d5da3a6bc7..b915612d10 100644 --- a/tests/functional_tests/tester.hpp +++ b/tests/functional_tests/tester.hpp @@ -23,7 +23,7 @@ #ifndef _TESTER_HPP_ #define _TESTER_HPP_ -#include +#include #include #include "tester_arguments.hpp" diff --git a/tests/functional_tests/tester_arguments.cpp b/tests/functional_tests/tester_arguments.cpp index a8c5067492..a9fe9d0482 100644 --- a/tests/functional_tests/tester_arguments.cpp +++ b/tests/functional_tests/tester_arguments.cpp @@ -24,7 +24,7 @@ #include #include -#include +#include #include "tester.hpp" @@ -120,7 +120,7 @@ TesterArguments::TesterArguments(int argc, char *argv[]) { void TesterArguments::show_usage(std::string executable_name) { std::cout << "Usage: " << executable_name << std::endl; - std::cout << "\t-t \n"; + std::cout << "\t-t \n"; std::cout << "\t-w \n"; std::cout << "\t-s \n"; std::cout << "\t-a \n"; @@ -132,8 +132,8 @@ void TesterArguments::show_usage(std::string executable_name) { } void TesterArguments::get_rocshmem_arguments() { - numprocs = roc_shmem_n_pes(); - 
myid = roc_shmem_my_pe(); + numprocs = rocshmem_n_pes(); + myid = rocshmem_my_pe(); TestType type = (TestType)algorithm; if ((type != BarrierAllTestType) && (type != SyncAllTestType) && diff --git a/tests/functional_tests/tester_arguments.hpp b/tests/functional_tests/tester_arguments.hpp index 88ff6a5537..5bdbfdbd0a 100644 --- a/tests/functional_tests/tester_arguments.hpp +++ b/tests/functional_tests/tester_arguments.hpp @@ -25,7 +25,7 @@ #include #include -#include +#include #include class TesterArguments { @@ -34,7 +34,7 @@ class TesterArguments { /** * Initialize rocshmem members - * Valid after roc_shmem_init function called. + * Valid after rocshmem_init function called. */ void get_rocshmem_arguments(); @@ -57,7 +57,7 @@ class TesterArguments { unsigned thread_access = 64; unsigned coal_coef = 64; unsigned op_type = 0; - unsigned shmem_context = rocshmem::ROC_SHMEM_CTX_WG_PRIVATE; + unsigned shmem_context = rocshmem::ROCSHMEM_CTX_WG_PRIVATE; /** * Arguments obtained from rocshmem diff --git a/tests/functional_tests/wave_level_primitives.cpp b/tests/functional_tests/wave_level_primitives.cpp index 598e5ad2e7..c434fae387 100644 --- a/tests/functional_tests/wave_level_primitives.cpp +++ b/tests/functional_tests/wave_level_primitives.cpp @@ -22,7 +22,7 @@ #include "wave_level_primitives.hpp" -#include +#include #include @@ -35,9 +35,9 @@ __global__ void WaveLevelPrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, char *r_buf, int size, TestType type, ShmemContextType ctx_type, int wf_size) { - __shared__ roc_shmem_ctx_t ctx; - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); + __shared__ rocshmem_ctx_t ctx; + rocshmem_wg_init(); + rocshmem_wg_ctx_create(ctx_type, &ctx); /** * Calculate start index for each wavefront for tiled version @@ -52,34 +52,34 @@ __global__ void WaveLevelPrimitiveTest(int loop, int skip, uint64_t *timer, r_buf += idx; for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = roc_shmem_timer(); + if (i == 
skip) start = rocshmem_timer(); switch (type) { case WAVEGetTestType: - roc_shmem_ctx_getmem_wave(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem_wave(ctx, r_buf, s_buf, size, 1); break; case WAVEGetNBITestType: - roc_shmem_ctx_getmem_nbi_wave(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_getmem_nbi_wave(ctx, r_buf, s_buf, size, 1); break; case WAVEPutTestType: - roc_shmem_ctx_putmem_wave(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem_wave(ctx, r_buf, s_buf, size, 1); break; case WAVEPutNBITestType: - roc_shmem_ctx_putmem_nbi_wave(ctx, r_buf, s_buf, size, 1); + rocshmem_ctx_putmem_nbi_wave(ctx, r_buf, s_buf, size, 1); break; default: break; } } - roc_shmem_ctx_quiet(ctx); + rocshmem_ctx_quiet(ctx); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; + timer[hipBlockIdx_x] = rocshmem_timer() - start; } - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); + rocshmem_wg_ctx_destroy(&ctx); + rocshmem_wg_finalize(); } /****************************************************************************** @@ -88,14 +88,14 @@ __global__ void WaveLevelPrimitiveTest(int loop, int skip, uint64_t *timer, WaveLevelPrimitiveTester::WaveLevelPrimitiveTester(TesterArguments args) : Tester(args) { s_buf = static_cast( - roc_shmem_malloc(args.max_msg_size * args.num_wgs * num_warps)); + rocshmem_malloc(args.max_msg_size * args.num_wgs * num_warps)); r_buf = static_cast( - roc_shmem_malloc(args.max_msg_size * args.num_wgs * num_warps)); + rocshmem_malloc(args.max_msg_size * args.num_wgs * num_warps)); } WaveLevelPrimitiveTester::~WaveLevelPrimitiveTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); + rocshmem_free(s_buf); + rocshmem_free(r_buf); } void WaveLevelPrimitiveTester::resetBuffers(uint64_t size) { diff --git a/tests/sos_tests/asym_alloc.cpp b/tests/sos_tests/asym_alloc.cpp index 042a7b85ce..f6c7917132 100644 --- a/tests/sos_tests/asym_alloc.cpp +++ b/tests/sos_tests/asym_alloc.cpp @@ -35,7 +35,7 @@ #include #include -#include 
+#include using namespace rocshmem; @@ -45,44 +45,44 @@ int main(int argc, char **argv) { int *buf, *buf_in; int me, npes, i, target; - roc_shmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + rocshmem_init(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); /* Each PE allocates space for "me + 1" integers */ bufsize = me + 1; - buf = (int *)roc_shmem_malloc(sizeof(int) * bufsize); + buf = (int *)rocshmem_malloc(sizeof(int) * bufsize); - if (NULL == buf) roc_shmem_global_exit(1); + if (NULL == buf) rocshmem_global_exit(1); for (i = 0; i < bufsize; i++) buf[i] = -1; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); /* Write to neighbor's buffer */ target = (me + 1) % npes; buf_in = (int *)malloc(sizeof(int) * (target + 1)); if (!buf_in) { fprintf(stderr, "ERR - null buf_in pointer\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } for (i = 0; i < target + 1; i++) buf_in[i] = target; - roc_shmem_int_put(buf, buf_in, target + 1, target); + rocshmem_int_put(buf, buf_in, target + 1, target); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); /* Validate data was written correctly */ for (i = 0; i < me + 1; i++) { if (buf[i] != me) { printf("Error [%3d]: buf[%d] == %d, expected %d\n", me, i, buf[i], me); - roc_shmem_global_exit(2); + rocshmem_global_exit(2); } } free(buf_in); - roc_shmem_free(buf); - roc_shmem_finalize(); + rocshmem_free(buf); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/atomic_inc.cpp b/tests/sos_tests/atomic_inc.cpp index 815cce3398..10d53948da 100644 --- a/tests/sos_tests/atomic_inc.cpp +++ b/tests/sos_tests/atomic_inc.cpp @@ -30,10 +30,10 @@ */ /* - * test roc_shmem_int_atomic_inc() atomic_inc {-v|q} + * test rocshmem_int_atomic_inc() atomic_inc {-v|q} * {loop-cnt(default=10)(default=10)} where: -q == quiet, -v == verbose/debug * Loop for loop-cnt - * all PEs call roc_shmem_int_atomic_inc(), PE-0 totals + * all PEs call rocshmem_int_atomic_inc(), PE-0 totals * */ @@ -44,18 +44,18 @@ #include 
#include -#include +#include using namespace rocshmem; #define Rfprintf \ - if (roc_shmem_my_pe() == 0) fprintf + if (rocshmem_my_pe() == 0) fprintf #define Rprintf \ - if (roc_shmem_my_pe() == 0) printf + if (rocshmem_my_pe() == 0) printf #define RDfprintf \ - if (Verbose && roc_shmem_my_pe() == 0) fprintf + if (Verbose && rocshmem_my_pe() == 0) fprintf #define RDprintf \ - if (Verbose && roc_shmem_my_pe() == 0) printf + if (Verbose && rocshmem_my_pe() == 0) printf #define Vprintf \ if (Verbose) printf #define Vfprintf \ @@ -69,12 +69,12 @@ int main(int argc, char *argv[]) { int Announce = (NULL == getenv("MAKELEVEL")) ? 1 : 0; int *lock_cnt; - roc_shmem_init(); - my_rank = roc_shmem_my_pe(); - num_ranks = roc_shmem_n_pes(); + rocshmem_init(); + my_rank = rocshmem_my_pe(); + num_ranks = rocshmem_n_pes(); if (num_ranks == 1) { fprintf(stderr, "ERR - Requires > 1 PEs\n"); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } @@ -88,7 +88,7 @@ int main(int argc, char *argv[]) { break; default: Rfprintf(stderr, "ERR - unknown -%c ?\n", c); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } @@ -99,30 +99,30 @@ int main(int argc, char *argv[]) { loops = atoi(argv[optind++]); if (loops <= 0 || loops > 1000000) { Rfprintf(stderr, "ERR - loops arg out of bounds '%d'?\n", loops); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } - lock_cnt = (int *)roc_shmem_malloc(sizeof(int)); + lock_cnt = (int *)rocshmem_malloc(sizeof(int)); for (cloop = 1; cloop <= loops; cloop++) { *lock_cnt = 0; - roc_shmem_barrier_all(); /* sync all ranks */ + rocshmem_barrier_all(); /* sync all ranks */ for (c = 0; c < num_ranks; c++) - roc_shmem_int64_atomic_inc((int64_t *)lock_cnt, c); + rocshmem_int64_atomic_inc((int64_t *)lock_cnt, c); Vprintf("[%d] locked: lock_cnt(%d)\n", my_rank, *lock_cnt); - roc_shmem_int_wait_until(lock_cnt, ROC_SHMEM_CMP_GE, num_ranks); + rocshmem_int_wait_until(lock_cnt, ROCSHMEM_CMP_GE, num_ranks); - roc_shmem_barrier_all(); /* sync all ranks 
*/ + rocshmem_barrier_all(); /* sync all ranks */ if ((*lock_cnt) != num_ranks) { printf("[%d] loop %d: bad lock_cnt %d, expected %d?\n", my_rank, cloop, *lock_cnt, num_ranks); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } if ((cloop % 10) == 0) { @@ -133,9 +133,9 @@ int main(int argc, char *argv[]) { Vprintf("[%d] of %d, Exit: lock_cnt %d\n", my_rank, num_ranks, *lock_cnt); - roc_shmem_free(lock_cnt); + rocshmem_free(lock_cnt); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/barrier.cpp b/tests/sos_tests/barrier.cpp index 12eef3746e..b6c401ce79 100644 --- a/tests/sos_tests/barrier.cpp +++ b/tests/sos_tests/barrier.cpp @@ -30,25 +30,25 @@ */ /* - * roc_shmem_barrier() test barrier {-V} {loop-cnt} + * rocshmem_barrier() test barrier {-V} {loop-cnt} */ #include #include #include #include -#include +#include using namespace rocshmem; #define Rfprintf \ - if (roc_shmem_my_pe() == 0) fprintf + if (rocshmem_my_pe() == 0) fprintf #define Rprintf \ - if (roc_shmem_my_pe() == 0) printf + if (rocshmem_my_pe() == 0) printf #define RDfprintf \ - if (Verbose && roc_shmem_my_pe() == 0) fprintf + if (Verbose && rocshmem_my_pe() == 0) fprintf #define RDprintf \ - if (Verbose && roc_shmem_my_pe() == 0) printf + if (Verbose && rocshmem_my_pe() == 0) printf int Verbose; @@ -57,12 +57,12 @@ int main(int argc, char* argv[]) { int rank, num_ranks; char* prog_name; - roc_shmem_init(); - rank = roc_shmem_my_pe(); - num_ranks = roc_shmem_n_pes(); + rocshmem_init(); + rank = rocshmem_my_pe(); + num_ranks = rocshmem_n_pes(); if (num_ranks == 1) { Rfprintf(stderr, "ERR - Requires > 1 PEs\n"); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } prog_name = strrchr(argv[0], '/'); @@ -78,7 +78,7 @@ int main(int argc, char* argv[]) { break; default: Rfprintf(stderr, "ERR - unknown -%c ?\n", c); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } @@ -89,7 +89,7 @@ int main(int argc, char* argv[]) { loops = atoi(argv[optind++]); if 
(loops <= 0 || loops > 1000000) { Rfprintf(stderr, "ERR - loops arg out of bounds '%d'?\n", loops); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } @@ -98,7 +98,7 @@ int main(int argc, char* argv[]) { // if ( j==0 || (j % 10) == 0 ) RDfprintf(stderr, "[%d] pre-barrier(%d)\n", rank, j); - roc_shmem_barrier_all(); /* sync sender and receiver */ + rocshmem_barrier_all(); /* sync sender and receiver */ // if ( j==0 || (j % 10) == 0 ) RDfprintf(stderr, "[%d] post barrier(%d)\n", rank, j); @@ -106,7 +106,7 @@ int main(int argc, char* argv[]) { RDprintf("%d(%d) Exit\n", rank, num_ranks); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/bcast.cpp b/tests/sos_tests/bcast.cpp index 41e7eb7d24..6b178dcf44 100644 --- a/tests/sos_tests/bcast.cpp +++ b/tests/sos_tests/bcast.cpp @@ -34,7 +34,7 @@ * * usage: bcast {-v|h} * - * Loop - roc_shmem_broadcast_all() with increasing data amount. + * Loop - rocshmem_broadcast_all() with increasing data amount. 
*/ #include @@ -42,7 +42,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -58,19 +58,19 @@ int main(int argc, char *argv[]) { int nLongs = 0; long *pSync; - roc_shmem_init(); - mpe = roc_shmem_my_pe(); - num_pes = roc_shmem_n_pes(); + rocshmem_init(); + mpe = rocshmem_my_pe(); + num_pes = rocshmem_n_pes(); if (num_pes == 1) { printf("%s: Requires number of PEs > 1\n", argv[0]); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } if (sizeof(long) != 8) { printf("Test assumes 64-bit long (%zd)\n", sizeof(long)); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); return 0; } @@ -85,15 +85,15 @@ int main(int argc, char *argv[]) { Verbose = 1; } else if (strncmp(argv[1], "-h", 3) == 0) { fprintf(stderr, "usage: %s {-v(verbose)|h(help)}\n", pgm); - roc_shmem_finalize(); + rocshmem_finalize(); exit(1); } } - pSync = (long *)roc_shmem_malloc(ROC_SHMEM_BCAST_SYNC_SIZE); + pSync = (long *)rocshmem_malloc(ROCSHMEM_BCAST_SYNC_SIZE); - for (i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i += 1) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i += 1) { + pSync[i] = ROCSHMEM_SYNC_VALUE; } if (mpe == 0 && Verbose) { @@ -102,9 +102,9 @@ int main(int argc, char *argv[]) { for (cloop = 1; cloop <= loops; cloop++) { nLongs = nBytes / sizeof(long); - dst = (long *)roc_shmem_malloc(nBytes * 2); + dst = (long *)rocshmem_malloc(nBytes * 2); if (!dst) { - fprintf(stderr, "[%d] roc_shmem_malloc(%d) failed %s\n", mpe, nBytes, + fprintf(stderr, "[%d] rocshmem_malloc(%d) failed %s\n", mpe, nBytes, strerror(errno)); return 0; } @@ -114,9 +114,9 @@ int main(int argc, char *argv[]) { src[i] = i + 1; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_long_broadcast(ROC_SHMEM_CTX_DEFAULT, dst, src, nLongs, 1, 0, + rocshmem_ctx_long_broadcast(ROCSHMEM_CTX_DEFAULT, dst, src, nLongs, 1, 0, 0, num_pes, pSync); for (i = 0; i < nLongs; i++) { @@ -124,21 +124,21 @@ int main(int argc, char *argv[]) { if (1 != mpe && dst[i] != 
src[i]) { fprintf(stderr, "[%d] dst[%d] %ld != expected %ld\n", mpe, i, dst[i], src[i]); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } else if (1 == mpe && dst[i] != 0) { fprintf(stderr, "[%d] dst[%d] %ld != expected 0\n", mpe, i, dst[i]); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_free(dst); + rocshmem_free(dst); if (Verbose && mpe == 0) fprintf(stderr, "loop %2d Bcast %d, Done.\n", cloop, nBytes); nBytes += BCAST_INCR; } - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/bcast_flood.cpp b/tests/sos_tests/bcast_flood.cpp index e8ad1bfd3f..4976457d42 100644 --- a/tests/sos_tests/bcast_flood.cpp +++ b/tests/sos_tests/bcast_flood.cpp @@ -38,7 +38,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -68,9 +68,9 @@ int main(int argc, char **argv) { char *pgm; double start_time, time_taken; - roc_shmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + rocshmem_init(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); if ((pgm = strrchr(argv[0], '/'))) { pgm++; @@ -86,21 +86,21 @@ int main(int argc, char **argv) { case 'e': if ((elements = atoi_scaled(optarg)) <= 0) { fprintf(stderr, "ERR: Bad elements count %d\n", elements); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } break; case 'l': if ((loops = atoi_scaled(optarg)) <= 0) { fprintf(stderr, "ERR: Bad loop count %d\n", loops); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } break; case 'p': if ((ps_cnt = atoi_scaled(optarg)) <= 0) { fprintf(stderr, "ERR: Bad pSync[] elements %d\n", loops); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } break; @@ -115,32 +115,32 @@ int main(int argc, char **argv) { fprintf(stderr, "%s: unknown switch '-%c'?\n", pgm, i); usage(pgm); } - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } - ps_cnt *= ROC_SHMEM_BCAST_SYNC_SIZE; - pSync = (long *)roc_shmem_malloc(ps_cnt * sizeof(long)); 
+ ps_cnt *= ROCSHMEM_BCAST_SYNC_SIZE; + pSync = (long *)rocshmem_malloc(ps_cnt * sizeof(long)); if (!pSync) { fprintf(stderr, "ERR - null pSync pointer\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } for (i = 0; i < ps_cnt; i++) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + pSync[i] = ROCSHMEM_SYNC_VALUE; } - source = (int *)roc_shmem_malloc(elements * sizeof(*source)); + source = (int *)rocshmem_malloc(elements * sizeof(*source)); if (!source) { fprintf(stderr, "ERR - null source pointer\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } - target = (int *)roc_shmem_malloc(elements * sizeof(*target)); + target = (int *)rocshmem_malloc(elements * sizeof(*target)); if (!target) { fprintf(stderr, "ERR - null target pointer\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } for (i = 0; i < elements; i += 1) { source[i] = i + 1; @@ -151,20 +151,20 @@ int main(int argc, char **argv) { fprintf(stderr, "ps_cnt %d loops %d nElems %d\n", ps_cnt, loops, elements); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); for (time_taken = 0.0, ps = i = 0; i < loops; i++) { start_time = shmem_wtime(); - roc_shmem_ctx_int_broadcast(ROC_SHMEM_CTX_DEFAULT, target, source, elements, + rocshmem_ctx_int_broadcast(ROCSHMEM_CTX_DEFAULT, target, source, elements, 0, 0, 0, npes, &pSync[ps]); - if (Serialize) roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); time_taken += (shmem_wtime() - start_time); if (ps_cnt > 1) { - ps += ROC_SHMEM_BCAST_SYNC_SIZE; + ps += ROCSHMEM_BCAST_SYNC_SIZE; if (ps >= ps_cnt) ps = 0; } } @@ -179,15 +179,15 @@ int main(int argc, char **argv) { if (Verbose > 1) fprintf(stderr, "[%d] pre B1\n", me); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (Verbose > 1) fprintf(stderr, "[%d] post B1\n", me); - roc_shmem_free(pSync); - roc_shmem_free(target); - roc_shmem_free(source); + rocshmem_free(pSync); + rocshmem_free(target); + rocshmem_free(source); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff 
--git a/tests/sos_tests/big_reduction.cpp b/tests/sos_tests/big_reduction.cpp index 392829bb74..27665a010c 100644 --- a/tests/sos_tests/big_reduction.cpp +++ b/tests/sos_tests/big_reduction.cpp @@ -30,7 +30,7 @@ */ /* - * reduce across PEs with roc_shmem_max_to_all() + * reduce across PEs with rocshmem_max_to_all() * * usage: big_reduction {-v|h} */ @@ -39,14 +39,14 @@ #include #include -#include +#include using namespace rocshmem; #define N 128 #define MAX(a, b) ((a) > (b)) ? (a) : (b) -#define WRK_SIZE MAX(N / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE) +#define WRK_SIZE MAX(N / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE) int main(int argc, char *argv[]) { int i, Verbose = 0; @@ -65,33 +65,33 @@ int main(int argc, char *argv[]) { Verbose = 1; } else if (strncmp(argv[1], "-h", 3) == 0) { fprintf(stderr, "usage: %s {-v(verbose)|h(help)}\n", pgm); - roc_shmem_finalize(); + rocshmem_finalize(); exit(1); } } - roc_shmem_init(); + rocshmem_init(); - src = (long *)roc_shmem_malloc(N * sizeof(long)); + src = (long *)rocshmem_malloc(N * sizeof(long)); for (i = 0; i < N; i += 1) { - src[i] = roc_shmem_my_pe() + i; + src[i] = rocshmem_my_pe() + i; } - dst = (long *)roc_shmem_malloc(N * sizeof(long)); + dst = (long *)rocshmem_malloc(N * sizeof(long)); - pSync = (long *)roc_shmem_malloc(ROC_SHMEM_REDUCE_SYNC_SIZE * sizeof(long)); - for (i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i += 1) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + pSync = (long *)rocshmem_malloc(ROCSHMEM_REDUCE_SYNC_SIZE * sizeof(long)); + for (i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i += 1) { + pSync[i] = ROCSHMEM_SYNC_VALUE; } - pWrk = (long *)roc_shmem_malloc(WRK_SIZE * sizeof(long)); + pWrk = (long *)rocshmem_malloc(WRK_SIZE * sizeof(long)); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_long_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst, src, N, 0, 0, - roc_shmem_n_pes(), pWrk, pSync); + rocshmem_ctx_long_max_to_all(ROCSHMEM_CTX_DEFAULT, dst, src, N, 0, 0, + rocshmem_n_pes(), pWrk, pSync); if (Verbose) { - 
printf("%d/%d\tdst =", roc_shmem_my_pe(), roc_shmem_n_pes()); + printf("%d/%d\tdst =", rocshmem_my_pe(), rocshmem_n_pes()); for (i = 0; i < N; i += 1) { printf(" %ld", dst[i]); } @@ -99,19 +99,19 @@ int main(int argc, char *argv[]) { } for (i = 0; i < N; i += 1) { - if (dst[i] != roc_shmem_n_pes() - 1 + i) { - printf("[%3d] Error: dst[%d] == %ld, expected %ld\n", roc_shmem_my_pe(), - i, dst[i], roc_shmem_n_pes() - 1 + (long)i); - roc_shmem_global_exit(1); + if (dst[i] != rocshmem_n_pes() - 1 + i) { + printf("[%3d] Error: dst[%d] == %ld, expected %ld\n", rocshmem_my_pe(), + i, dst[i], rocshmem_n_pes() - 1 + (long)i); + rocshmem_global_exit(1); } } - roc_shmem_free(dst); - roc_shmem_free(src); - roc_shmem_free(pSync); - roc_shmem_free(pWrk); + rocshmem_free(dst); + rocshmem_free(src); + rocshmem_free(pSync); + rocshmem_free(pWrk); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/bigget.cpp b/tests/sos_tests/bigget.cpp index d841f637c1..85c7f827f9 100644 --- a/tests/sos_tests/bigget.cpp +++ b/tests/sos_tests/bigget.cpp @@ -39,7 +39,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -78,8 +78,8 @@ static void usage(char *pgm) { " -v be verbose, multiple 'v' more verbose\n" " -e element-cnt (%d) # of int sized elements to get\n" " -l loops (%d) loop count.\n" - " -s synchronize: barrier after each roc_shmem_get()\n" - " -t track: output '.' for every 200 roc_shmem_get()s\n", + " -s synchronize: barrier after each rocshmem_get()\n" + " -t track: output '.' 
for every 200 rocshmem_get()s\n", pgm, NUM_ELEMENTS, DFLT_LOOPS); } @@ -99,9 +99,9 @@ int main(int argc, char **argv) { long bytes; double time_taken = 0.0, start_time; - roc_shmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + rocshmem_init(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); if ((pgm = strrchr(argv[0], '/'))) pgm++; @@ -116,14 +116,14 @@ int main(int argc, char **argv) { case 'e': if ((elements = atoi_scaled(optarg)) <= 0) { fprintf(stderr, "ERR: Bad elements count %d\n", elements); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } break; case 'l': if ((loops = atoi_scaled(optarg)) <= 0) { fprintf(stderr, "ERR: Bad loop count %d\n", loops); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } break; @@ -141,35 +141,35 @@ int main(int argc, char **argv) { fprintf(stderr, "%s: unknown switch '-%c'?\n", pgm, i); usage(pgm); } - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } target_pe = (me + 1) % npes; - total_time = (double *)roc_shmem_malloc(npes * sizeof(double)); + total_time = (double *)rocshmem_malloc(npes * sizeof(double)); if (!total_time) { - fprintf(stderr, "ERR: bad total_time roc_shmem_malloc(%ld)\n", + fprintf(stderr, "ERR: bad total_time rocshmem_malloc(%ld)\n", (elements * sizeof(double))); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } - Source = (int *)roc_shmem_malloc(elements * sizeof(*Source)); + Source = (int *)rocshmem_malloc(elements * sizeof(*Source)); if (!Source) { - fprintf(stderr, "ERR: bad Source roc_shmem_malloc(%ld)\n", + fprintf(stderr, "ERR: bad Source rocshmem_malloc(%ld)\n", (elements * sizeof(*Target))); - roc_shmem_free(total_time); - roc_shmem_global_exit(1); + rocshmem_free(total_time); + rocshmem_global_exit(1); } - Target = (int *)roc_shmem_malloc(elements * sizeof(*Target)); + Target = (int *)rocshmem_malloc(elements * sizeof(*Target)); if (!Target) { - fprintf(stderr, "ERR: bad Target roc_shmem_malloc(%ld)\n", + fprintf(stderr, "ERR: bad Target 
rocshmem_malloc(%ld)\n", (elements * sizeof(*Target))); - roc_shmem_free(Source); - roc_shmem_free(total_time); - roc_shmem_global_exit(1); + rocshmem_free(Source); + rocshmem_free(total_time); + rocshmem_global_exit(1); } for (i = 0; i < elements; i++) { @@ -183,31 +183,31 @@ int main(int argc, char **argv) { fprintf(stderr, "%s: INFO - %d loops, get %d (int) elements from PE+1\n", pgm, loops, elements); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); for (i = 0; i < loops; i++) { start_time = shmem_wtime(); - roc_shmem_int_get(Target, Source, elements, target_pe); + rocshmem_int_get(Target, Source, elements, target_pe); time_taken += shmem_wtime() - start_time; if (me == 0) { if (Track && i > 0 && ((i % 200) == 0)) fprintf(stderr, ".%d", i); } - if (Sync) roc_shmem_barrier_all(); + if (Sync) rocshmem_barrier_all(); } // collect time per node elapsed time. - roc_shmem_double_put(&total_time[me], &time_taken, 1, 0); + rocshmem_double_put(&total_time[me], &time_taken, 1, 0); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); for (i = 0; i < elements; i++) { if (Target[i] != i + 1) { printf("%d: Error Target[%d] = %d, expected %d\n", me, i, Target[i], i + 1); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } } @@ -224,11 +224,11 @@ int main(int argc, char **argv) { secs); } - roc_shmem_free(total_time); - roc_shmem_free(Target); - roc_shmem_free(Source); + rocshmem_free(total_time); + rocshmem_free(Target); + rocshmem_free(Source); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/bigput.cpp b/tests/sos_tests/bigput.cpp index ffdb3ff98c..8f3fda7851 100644 --- a/tests/sos_tests/bigput.cpp +++ b/tests/sos_tests/bigput.cpp @@ -39,7 +39,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -78,8 +78,8 @@ static void usage(char *pgm) { " -v be verbose, multiple 'v' more verbose\n" " -e element-cnt (%d) # of int sized elements to put\n" " -l loops (%d) loop count.\n" - " -s synchronize: barrier after 
each roc_shmem_put()\n" - " -t track: output '.' for every 200 roc_shmem_put()s\n", + " -s synchronize: barrier after each rocshmem_put()\n" + " -t track: output '.' for every 200 rocshmem_put()s\n", pgm, NUM_ELEMENTS, DFLT_LOOPS); } @@ -101,9 +101,9 @@ int main(int argc, char **argv) { long *pSync; double *pWrk; - roc_shmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + rocshmem_init(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); if ((pgm = strrchr(argv[0], '/'))) pgm++; @@ -118,14 +118,14 @@ int main(int argc, char **argv) { case 'e': if ((elements = atoi_scaled(optarg)) <= 0) { fprintf(stderr, "ERR: Bad elements count %d\n", elements); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } break; case 'l': if ((loops = atoi_scaled(optarg)) <= 0) { fprintf(stderr, "ERR: Bad loop count %d\n", loops); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } break; @@ -143,60 +143,60 @@ int main(int argc, char **argv) { fprintf(stderr, "%s: unknown switch '-%c'?\n", pgm, i); usage(pgm); } - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } - pSync = (long *)roc_shmem_malloc(ROC_SHMEM_BCAST_SYNC_SIZE); + pSync = (long *)rocshmem_malloc(ROCSHMEM_BCAST_SYNC_SIZE); if (!pSync) { - fprintf(stderr, "ERR: bad pSync roc_shmem_malloc(%ld)\n", - ROC_SHMEM_BCAST_SYNC_SIZE); - roc_shmem_global_exit(1); + fprintf(stderr, "ERR: bad pSync rocshmem_malloc(%ld)\n", + ROCSHMEM_BCAST_SYNC_SIZE); + rocshmem_global_exit(1); } - for (i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) - pSync[i] = ROC_SHMEM_SYNC_VALUE; + for (i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) + pSync[i] = ROCSHMEM_SYNC_VALUE; - pWrk = (double *)roc_shmem_malloc(ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE); + pWrk = (double *)rocshmem_malloc(ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); if (!pWrk) { - fprintf(stderr, "ERR: bad pWrk roc_shmem_malloc(%ld)\n", - ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE); - roc_shmem_free(pSync); - roc_shmem_global_exit(1); + fprintf(stderr, "ERR: bad pWrk 
rocshmem_malloc(%ld)\n", + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); + rocshmem_free(pSync); + rocshmem_global_exit(1); } target_PE = (me + 1) % npes; - total_time = (double *)roc_shmem_malloc(npes * sizeof(double)); + total_time = (double *)rocshmem_malloc(npes * sizeof(double)); if (!total_time) { - fprintf(stderr, "ERR: bad total_time roc_shmem_malloc(%ld)\n", + fprintf(stderr, "ERR: bad total_time rocshmem_malloc(%ld)\n", (elements * sizeof(double))); - roc_shmem_free(pSync); - roc_shmem_free(pWrk); - roc_shmem_global_exit(1); + rocshmem_free(pSync); + rocshmem_free(pWrk); + rocshmem_global_exit(1); } for (i = 0; i < npes; i++) total_time[i] = -1.0; - Source = (int *)roc_shmem_malloc(elements * sizeof(*Source)); + Source = (int *)rocshmem_malloc(elements * sizeof(*Source)); if (!Source) { - fprintf(stderr, "ERR: bad Source roc_shmem_malloc(%ld)\n", + fprintf(stderr, "ERR: bad Source rocshmem_malloc(%ld)\n", (elements * sizeof(*Target))); - roc_shmem_free(pSync); - roc_shmem_free(pWrk); - roc_shmem_free(total_time); - roc_shmem_global_exit(1); + rocshmem_free(pSync); + rocshmem_free(pWrk); + rocshmem_free(total_time); + rocshmem_global_exit(1); } - Target = (int *)roc_shmem_malloc(elements * sizeof(*Target)); + Target = (int *)rocshmem_malloc(elements * sizeof(*Target)); if (!Target) { - fprintf(stderr, "ERR: bad Target roc_shmem_malloc(%ld)\n", + fprintf(stderr, "ERR: bad Target rocshmem_malloc(%ld)\n", (elements * sizeof(*Target))); - roc_shmem_free(pSync); - roc_shmem_free(pWrk); - roc_shmem_free(Source); - roc_shmem_free(total_time); - roc_shmem_global_exit(1); + rocshmem_free(pSync); + rocshmem_free(pWrk); + rocshmem_free(Source); + rocshmem_free(total_time); + rocshmem_global_exit(1); } for (i = 0; i < elements; i++) { @@ -211,33 +211,33 @@ int main(int argc, char **argv) { "%s: INFO - %d loops, put %d (int) elements to PE+1 Max put ??\n", pgm, loops, elements); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); for (i = 0; i < loops; i++) { start_time = 
shmem_wtime(); - roc_shmem_int_put(Target, Source, elements, target_PE); + rocshmem_int_put(Target, Source, elements, target_PE); time_taken += (shmem_wtime() - start_time); if (me == 0) { if (Track && i > 0 && ((i % 200) == 0)) fprintf(stderr, ".%d", i); } - if (Sync) roc_shmem_barrier_all(); + if (Sync) rocshmem_barrier_all(); } // collect time per node. - roc_shmem_double_put(&total_time[me], &time_taken, 1, 0); - roc_shmem_ctx_double_sum_to_all(ROC_SHMEM_CTX_DEFAULT, &sum_time, &time_taken, + rocshmem_double_put(&total_time[me], &time_taken, 1, 0); + rocshmem_ctx_double_sum_to_all(ROCSHMEM_CTX_DEFAULT, &sum_time, &time_taken, 1, 0, 0, npes, pWrk, pSync); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); for (i = 0; i < elements; i++) { if (Target[i] != i + 1) { printf("%d: Error Target[%d] = %d, expected %d\n", me, i, Target[i], i + 1); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } } @@ -259,17 +259,17 @@ int main(int argc, char **argv) { comp_time, sum_time); rate = ((double)bytes / (1024.0 * 1024.0)) / comp_time; - printf("%s: roc_shmem_int_put() %7.4f MB/sec (bytes %ld secs %7.4f)\n", pgm, + printf("%s: rocshmem_int_put() %7.4f MB/sec (bytes %ld secs %7.4f)\n", pgm, rate, bytes, sum_time); } - roc_shmem_free(pSync); - roc_shmem_free(pWrk); - roc_shmem_free(total_time); - roc_shmem_free(Target); - roc_shmem_free(Source); + rocshmem_free(pSync); + rocshmem_free(pWrk); + rocshmem_free(total_time); + rocshmem_free(Target); + rocshmem_free(Source); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/broadcast_active_set.cpp b/tests/sos_tests/broadcast_active_set.cpp index 0b0eeaa9dd..102aa426f1 100644 --- a/tests/sos_tests/broadcast_active_set.cpp +++ b/tests/sos_tests/broadcast_active_set.cpp @@ -29,7 +29,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -44,35 +44,35 @@ int main(void) { // long *barrier_psync0, *barrier_psync1; long long *src, *dst; - roc_shmem_init(); + rocshmem_init(); - 
me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); - src = (long long *)roc_shmem_malloc(NELEM * sizeof(long long)); - dst = (long long *)roc_shmem_malloc(NELEM * sizeof(long long)); + src = (long long *)rocshmem_malloc(NELEM * sizeof(long long)); + dst = (long long *)rocshmem_malloc(NELEM * sizeof(long long)); for (i = 0; i < NELEM; i++) { src[i] = me; dst[i] = -1; } bcast_psync = - (long *)roc_shmem_malloc(ROC_SHMEM_BCAST_SYNC_SIZE * sizeof(long)); - for (i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i++) - bcast_psync[i] = ROC_SHMEM_SYNC_VALUE; + (long *)rocshmem_malloc(ROCSHMEM_BCAST_SYNC_SIZE * sizeof(long)); + for (i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) + bcast_psync[i] = ROCSHMEM_SYNC_VALUE; /* - barrier_psync0 = (long *) roc_shmem_malloc(ROC_SHMEM_BCAST_SYNC_SIZE * + barrier_psync0 = (long *) rocshmem_malloc(ROCSHMEM_BCAST_SYNC_SIZE * sizeof(long)); barrier_psync1 = (long *) - roc_shmem_malloc(ROC_SHMEM_BCAST_SYNC_SIZE * sizeof(long)); for (i = 0; i < - ROC_SHMEM_BARRIER_SYNC_SIZE; i++) { barrier_psync0[i] = ROC_SHMEM_SYNC_VALUE; - barrier_psync1[i] = ROC_SHMEM_SYNC_VALUE; + rocshmem_malloc(ROCSHMEM_BCAST_SYNC_SIZE * sizeof(long)); for (i = 0; i < + ROCSHMEM_BARRIER_SYNC_SIZE; i++) { barrier_psync0[i] = ROCSHMEM_SYNC_VALUE; + barrier_psync1[i] = ROCSHMEM_SYNC_VALUE; } */ if (me == 0) printf("Shrinking active set test\n"); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); /* A total of npes tests are performed, where the active set in each test * includes PEs i..npes-1 */ @@ -81,7 +81,7 @@ int main(void) { if (me == i) printf(" + active set size %d\n", npes - i); - roc_shmem_ctx_longlong_broadcast(ROC_SHMEM_CTX_DEFAULT, dst, src, NELEM, 0, + rocshmem_ctx_longlong_broadcast(ROCSHMEM_CTX_DEFAULT, dst, src, NELEM, 0, i, 0, npes - i, bcast_psync); /* Validate broadcasted data */ @@ -95,17 +95,17 @@ int main(void) { } } - // roc_shmem_barrier(i, 0, npes-i, (i % 2) ? 
barrier_psync0 : + // rocshmem_barrier(i, 0, npes-i, (i % 2) ? barrier_psync0 : // barrier_psync1); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); for (i = 0; i < NELEM; i++) dst[i] = -1; if (me == 0) printf("Changing root test\n"); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); /* A total of npes tests are performed, where the root changes each time */ for (i = 0; i < npes; i++) { @@ -113,7 +113,7 @@ int main(void) { if (me == i) printf(" + root %d\n", i); - roc_shmem_ctx_longlong_broadcast(ROC_SHMEM_CTX_DEFAULT, dst, src, NELEM, i, + rocshmem_ctx_longlong_broadcast(ROCSHMEM_CTX_DEFAULT, dst, src, NELEM, i, 0, 0, npes, bcast_psync); /* Validate broadcasted data */ @@ -127,15 +127,15 @@ int main(void) { } } - // roc_shmem_barrier(0, 0, npes, barrier_psync0); + // rocshmem_barrier(0, 0, npes, barrier_psync0); } - roc_shmem_free(src); - roc_shmem_free(dst); + rocshmem_free(src); + rocshmem_free(dst); - roc_shmem_free(bcast_psync); + rocshmem_free(bcast_psync); - roc_shmem_finalize(); + rocshmem_finalize(); return errors != 0; } diff --git a/tests/sos_tests/circular_shift.cpp b/tests/sos_tests/circular_shift.cpp index 6a32495d01..50c518b636 100644 --- a/tests/sos_tests/circular_shift.cpp +++ b/tests/sos_tests/circular_shift.cpp @@ -31,7 +31,7 @@ /* circular shift bbb into aaa */ -#include +#include using namespace rocshmem; @@ -40,24 +40,24 @@ int main(int argc, char* argv[]) { int ret = 0; int aaa, *bbb; - roc_shmem_init(); + rocshmem_init(); - bbb = (int*)roc_shmem_malloc(sizeof(int)); + bbb = (int*)rocshmem_malloc(sizeof(int)); - *bbb = me = roc_shmem_my_pe(); - neighbor = (me + 1) % roc_shmem_n_pes(); + *bbb = me = rocshmem_my_pe(); + neighbor = (me + 1) % rocshmem_n_pes(); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_int_get(&aaa, bbb, 1, neighbor); + rocshmem_int_get(&aaa, bbb, 1, neighbor); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (aaa != neighbor) ret = 1; - roc_shmem_free(bbb); + rocshmem_free(bbb); - 
roc_shmem_finalize(); + rocshmem_finalize(); return ret; } diff --git a/tests/sos_tests/cxx_test_shmem_atomic_add.cpp b/tests/sos_tests/cxx_test_shmem_atomic_add.cpp index aa1d344e55..8e0df62500 100644 --- a/tests/sos_tests/cxx_test_shmem_atomic_add.cpp +++ b/tests/sos_tests/cxx_test_shmem_atomic_add.cpp @@ -35,7 +35,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -51,100 +51,100 @@ enum op { }; #ifdef ENABLE_DEPRECATED_TESTS -#define DEPRECATED_ADD(TYPENAME, ...) roc_shmem_##TYPENAME##_add(__VA_ARGS__) -#define DEPRECATED_FADD(TYPENAME, ...) roc_shmem_##TYPENAME##_fadd(__VA_ARGS__) +#define DEPRECATED_ADD(TYPENAME, ...) rocshmem_##TYPENAME##_add(__VA_ARGS__) +#define DEPRECATED_FADD(TYPENAME, ...) rocshmem_##TYPENAME##_fadd(__VA_ARGS__) #else #define DEPRECATED_ADD(TYPENAME, ...) \ - roc_shmem_##TYPENAME##_atomic_add(__VA_ARGS__) + rocshmem_##TYPENAME##_atomic_add(__VA_ARGS__) #define DEPRECATED_FADD(TYPENAME, ...) \ - roc_shmem_##TYPENAME##_atomic_fetch_add(__VA_ARGS__) + rocshmem_##TYPENAME##_atomic_fetch_add(__VA_ARGS__) #endif /* ENABLE_DEPRECATED_TESTS */ -#define SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) \ - case ATOMIC_FETCH_ADD_NBI: \ - roc_shmem_##TYPENAME##_atomic_fetch_add_nbi(&old, remote, \ - (TYPE)(mype + 1), i); \ - roc_shmem_quiet(); \ - if (old > (TYPE)(npes * (npes + 1) / 2)) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ - #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; \ - case CTX_ATOMIC_FETCH_ADD_NBI: \ - roc_shmem_ctx_##TYPENAME##_atomic_fetch_add_nbi( \ - ROC_SHMEM_CTX_DEFAULT, &old, remote, (TYPE)(mype + 1), i); \ - roc_shmem_quiet(); \ - if (old > (TYPE)(npes * (npes + 1) / 2)) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ - #TYPE); \ - rc = EXIT_FAILURE; \ - } \ +#define SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) \ + case ATOMIC_FETCH_ADD_NBI: \ + rocshmem_##TYPENAME##_atomic_fetch_add_nbi(&old, remote, \ + (TYPE)(mype + 1), i); \ + rocshmem_quiet(); \ + if 
(old > (TYPE)(npes * (npes + 1) / 2)) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ + #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + case CTX_ATOMIC_FETCH_ADD_NBI: \ + rocshmem_ctx_##TYPENAME##_atomic_fetch_add_nbi( \ + ROCSHMEM_CTX_DEFAULT, &old, remote, (TYPE)(mype + 1), i); \ + rocshmem_quiet(); \ + if (old > (TYPE)(npes * (npes + 1) / 2)) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ + #TYPE); \ + rc = EXIT_FAILURE; \ + } \ break; -#define TEST_SHMEM_ADD(OP, TYPE, TYPENAME) \ - do { \ - TYPE *remote; \ - TYPE old; \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ - remote = (TYPE *)roc_shmem_malloc(sizeof(TYPE)); \ - *remote = (TYPE)0; \ - roc_shmem_barrier_all(); \ - for (int i = 0; i < npes; i++) switch (OP) { \ - case ADD: \ - DEPRECATED_ADD(TYPENAME, remote, (TYPE)(mype + 1), i); \ - break; \ - case ATOMIC_ADD: \ - roc_shmem_##TYPENAME##_atomic_add(remote, (TYPE)(mype + 1), i); \ - break; \ - case CTX_ATOMIC_ADD: \ - roc_shmem_ctx_##TYPENAME##_atomic_add(ROC_SHMEM_CTX_DEFAULT, remote, \ - (TYPE)(mype + 1), i); \ - break; \ - case FADD: \ - old = DEPRECATED_FADD(TYPENAME, remote, (TYPE)(mype + 1), i); \ - if (old > (TYPE)(npes * (npes + 1) / 2)) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ - #OP, #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; \ - case ATOMIC_FETCH_ADD: \ - old = roc_shmem_##TYPENAME##_atomic_fetch_add(remote, \ - (TYPE)(mype + 1), i); \ - if (old > (TYPE)(npes * (npes + 1) / 2)) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ - #OP, #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; \ - case CTX_ATOMIC_FETCH_ADD: \ - old = roc_shmem_ctx_##TYPENAME##_atomic_fetch_add( \ - ROC_SHMEM_CTX_DEFAULT, remote, (TYPE)(mype + 1), i); \ - if (old > (TYPE)(npes * (npes + 1) / 2)) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ - #OP, #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; 
\ - /*SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME)*/ \ - default: \ - printf("Invalid operation (%d)\n", OP); \ - roc_shmem_global_exit(1); \ - } \ - roc_shmem_barrier_all(); \ - if ((*remote) != (TYPE)(npes * (npes + 1) / 2)) { \ - printf("PE %i observed error with TEST_SHMEM_ADD(%s, %s)\n", mype, #OP, \ - #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - roc_shmem_free(remote); \ - if (rc == EXIT_FAILURE) roc_shmem_global_exit(1); \ +#define TEST_SHMEM_ADD(OP, TYPE, TYPENAME) \ + do { \ + TYPE *remote; \ + TYPE old; \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ + remote = (TYPE *)rocshmem_malloc(sizeof(TYPE)); \ + *remote = (TYPE)0; \ + rocshmem_barrier_all(); \ + for (int i = 0; i < npes; i++) switch (OP) { \ + case ADD: \ + DEPRECATED_ADD(TYPENAME, remote, (TYPE)(mype + 1), i); \ + break; \ + case ATOMIC_ADD: \ + rocshmem_##TYPENAME##_atomic_add(remote, (TYPE)(mype + 1), i); \ + break; \ + case CTX_ATOMIC_ADD: \ + rocshmem_ctx_##TYPENAME##_atomic_add(ROCSHMEM_CTX_DEFAULT, remote, \ + (TYPE)(mype + 1), i); \ + break; \ + case FADD: \ + old = DEPRECATED_FADD(TYPENAME, remote, (TYPE)(mype + 1), i); \ + if (old > (TYPE)(npes * (npes + 1) / 2)) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ + #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + case ATOMIC_FETCH_ADD: \ + old = rocshmem_##TYPENAME##_atomic_fetch_add(remote, \ + (TYPE)(mype + 1), i); \ + if (old > (TYPE)(npes * (npes + 1) / 2)) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ + #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + case CTX_ATOMIC_FETCH_ADD: \ + old = rocshmem_ctx_##TYPENAME##_atomic_fetch_add( \ + ROCSHMEM_CTX_DEFAULT, remote, (TYPE)(mype + 1), i); \ + if (old > (TYPE)(npes * (npes + 1) / 2)) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ + #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + /*SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME)*/ \ + default: \ + printf("Invalid operation 
(%d)\n", OP); \ + rocshmem_global_exit(1); \ + } \ + rocshmem_barrier_all(); \ + if ((*remote) != (TYPE)(npes * (npes + 1) / 2)) { \ + printf("PE %i observed error with TEST_SHMEM_ADD(%s, %s)\n", mype, #OP, \ + #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + rocshmem_free(remote); \ + if (rc == EXIT_FAILURE) rocshmem_global_exit(1); \ } while (false) int main(int argc, char *argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; @@ -256,6 +256,6 @@ int main(int argc, char *argv[]) { TEST_SHMEM_ADD(CTX_ATOMIC_FETCH_ADD_NBI, ptrdiff_t, ptrdiff); */ - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/cxx_test_shmem_atomic_cswap.cpp b/tests/sos_tests/cxx_test_shmem_atomic_cswap.cpp index 4656ae976e..c649e13b5f 100644 --- a/tests/sos_tests/cxx_test_shmem_atomic_cswap.cpp +++ b/tests/sos_tests/cxx_test_shmem_atomic_cswap.cpp @@ -35,7 +35,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -49,68 +49,68 @@ enum op { #ifdef ENABLE_DEPRECATED_TESTS #define DEPRECATED_CSWAP(TYPENAME, ...) \ - roc_shmem_##TYPENAME##_cswap(__VA_ARGS__) + rocshmem_##TYPENAME##_cswap(__VA_ARGS__) #else #define DEPRECATED_CSWAP(TYPENAME, ...) 
\ - roc_shmem_##TYPENAME##_atomic_compare_swap(__VA_ARGS__) + rocshmem_##TYPENAME##_atomic_compare_swap(__VA_ARGS__) #endif /* ENABLE_DEPRECATED_TESTS */ -#define SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) \ - case ATOMIC_COMPARE_SWAP_NBI: \ - roc_shmem_##TYPENAME##_atomic_compare_swap_nbi( \ - &old, remote, (TYPE)npes, (TYPE)mype, (mype + 1) % npes); \ - break; \ - case CTX_ATOMIC_COMPARE_SWAP_NBI: \ - roc_shmem_ctx_##TYPENAME##_atomic_compare_swap_nbi( \ - ROC_SHMEM_CTX_DEFAULT, &old, remote, (TYPE)npes, (TYPE)mype, \ - (mype + 1) % npes); \ +#define SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) \ + case ATOMIC_COMPARE_SWAP_NBI: \ + rocshmem_##TYPENAME##_atomic_compare_swap_nbi( \ + &old, remote, (TYPE)npes, (TYPE)mype, (mype + 1) % npes); \ + break; \ + case CTX_ATOMIC_COMPARE_SWAP_NBI: \ + rocshmem_ctx_##TYPENAME##_atomic_compare_swap_nbi( \ + ROCSHMEM_CTX_DEFAULT, &old, remote, (TYPE)npes, (TYPE)mype, \ + (mype + 1) % npes); \ break; -#define TEST_SHMEM_CSWAP(OP, TYPE, TYPENAME) \ - do { \ - TYPE *remote; \ - TYPE old; \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ - remote = (TYPE *)roc_shmem_malloc(sizeof(TYPE)); \ - *remote = npes; \ - roc_shmem_barrier_all(); \ - switch (OP) { \ - case CSWAP: \ - old = DEPRECATED_CSWAP(TYPENAME, remote, (TYPE)npes, (TYPE)mype, \ - (mype + 1) % npes); \ - break; \ - case ATOMIC_COMPARE_SWAP: \ - old = roc_shmem_##TYPENAME##_atomic_compare_swap( \ - remote, (TYPE)npes, (TYPE)mype, (mype + 1) % npes); \ - break; \ - case CTX_ATOMIC_COMPARE_SWAP: \ - old = roc_shmem_ctx_##TYPENAME##_atomic_compare_swap( \ - ROC_SHMEM_CTX_DEFAULT, remote, (TYPE)npes, (TYPE)mype, \ - (mype + 1) % npes); \ - break; \ - /* SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) */ \ - default: \ - printf("invalid operation (%d)\n", OP); \ - roc_shmem_global_exit(1); \ - } \ - roc_shmem_barrier_all(); \ - if ((*remote) != (TYPE)((mype + npes - 1) % npes)) { \ - printf("PE %i observed error with TEST_SHMEM_CSWAP(%s, %s)\n", mype, \ - #OP, 
#TYPE); \ - rc = EXIT_FAILURE; \ - } \ - if (old != (TYPE)npes) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ - #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - roc_shmem_free(remote); \ - if (rc == EXIT_FAILURE) roc_shmem_global_exit(1); \ +#define TEST_SHMEM_CSWAP(OP, TYPE, TYPENAME) \ + do { \ + TYPE *remote; \ + TYPE old; \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ + remote = (TYPE *)rocshmem_malloc(sizeof(TYPE)); \ + *remote = npes; \ + rocshmem_barrier_all(); \ + switch (OP) { \ + case CSWAP: \ + old = DEPRECATED_CSWAP(TYPENAME, remote, (TYPE)npes, (TYPE)mype, \ + (mype + 1) % npes); \ + break; \ + case ATOMIC_COMPARE_SWAP: \ + old = rocshmem_##TYPENAME##_atomic_compare_swap( \ + remote, (TYPE)npes, (TYPE)mype, (mype + 1) % npes); \ + break; \ + case CTX_ATOMIC_COMPARE_SWAP: \ + old = rocshmem_ctx_##TYPENAME##_atomic_compare_swap( \ + ROCSHMEM_CTX_DEFAULT, remote, (TYPE)npes, (TYPE)mype, \ + (mype + 1) % npes); \ + break; \ + /* SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) */ \ + default: \ + printf("invalid operation (%d)\n", OP); \ + rocshmem_global_exit(1); \ + } \ + rocshmem_barrier_all(); \ + if ((*remote) != (TYPE)((mype + npes - 1) % npes)) { \ + printf("PE %i observed error with TEST_SHMEM_CSWAP(%s, %s)\n", mype, \ + #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + if (old != (TYPE)npes) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ + #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + rocshmem_free(remote); \ + if (rc == EXIT_FAILURE) rocshmem_global_exit(1); \ } while (false) int main(int argc, char *argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; @@ -181,6 +181,6 @@ int main(int argc, char *argv[]) { // TEST_SHMEM_CSWAP(CTX_ATOMIC_COMPARE_SWAP_NBI, size_t, size); // TEST_SHMEM_CSWAP(CTX_ATOMIC_COMPARE_SWAP_NBI, ptrdiff_t, ptrdiff); - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/cxx_test_shmem_atomic_fetch.cpp 
b/tests/sos_tests/cxx_test_shmem_atomic_fetch.cpp index 92cfa68acf..77082761dc 100644 --- a/tests/sos_tests/cxx_test_shmem_atomic_fetch.cpp +++ b/tests/sos_tests/cxx_test_shmem_atomic_fetch.cpp @@ -35,7 +35,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -49,47 +49,47 @@ enum op { #ifdef ENABLE_DEPRECATED_TESTS #define DEPRECATED_FETCH(TYPENAME, ...) \ - roc_shmem_##TYPENAME##_fetch(__VA_ARGS__) + rocshmem_##TYPENAME##_fetch(__VA_ARGS__) #else #define DEPRECATED_FETCH(TYPENAME, ...) \ - roc_shmem_##TYPENAME##_atomic_fetch(__VA_ARGS__) + rocshmem_##TYPENAME##_atomic_fetch(__VA_ARGS__) #endif /* ENABLE_DEPRECATED_TESTS */ #define SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) \ case ATOMIC_FETCH_NBI: \ - roc_shmem_##TYPENAME##_atomic_fetch_nbi(&val, remote, (mype + 1) % npes); \ - roc_shmem_quiet(); \ + rocshmem_##TYPENAME##_atomic_fetch_nbi(&val, remote, (mype + 1) % npes); \ + rocshmem_quiet(); \ break; \ case CTX_ATOMIC_FETCH_NBI: \ - roc_shmem_ctx_##TYPENAME##_atomic_fetch_nbi(ROC_SHMEM_CTX_DEFAULT, &val, \ + rocshmem_ctx_##TYPENAME##_atomic_fetch_nbi(ROCSHMEM_CTX_DEFAULT, &val, \ remote, (mype + 1) % npes); \ - roc_shmem_quiet(); \ + rocshmem_quiet(); \ break; #define TEST_SHMEM_FETCH(OP, TYPE, TYPENAME) \ do { \ TYPE *remote; \ TYPE val; \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ - remote = (TYPE *)roc_shmem_malloc(sizeof(TYPE)); \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ + remote = (TYPE *)rocshmem_malloc(sizeof(TYPE)); \ *remote = (TYPE)mype; \ - roc_shmem_barrier_all(); \ + rocshmem_barrier_all(); \ switch (OP) { \ case FETCH: \ val = DEPRECATED_FETCH(TYPENAME, remote, (mype + 1) % npes); \ break; \ case ATOMIC_FETCH: \ - val = roc_shmem_##TYPENAME##_atomic_fetch(remote, (mype + 1) % npes); \ + val = rocshmem_##TYPENAME##_atomic_fetch(remote, (mype + 1) % npes); \ break; \ case CTX_ATOMIC_FETCH: \ - val = roc_shmem_ctx_##TYPENAME##_atomic_fetch( \ - 
ROC_SHMEM_CTX_DEFAULT, remote, (mype + 1) % npes); \ + val = rocshmem_ctx_##TYPENAME##_atomic_fetch( \ + ROCSHMEM_CTX_DEFAULT, remote, (mype + 1) % npes); \ break; \ /* SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) */ \ default: \ printf("Invalid operation (%d)\n", OP); \ - roc_shmem_global_exit(1); \ + rocshmem_global_exit(1); \ } \ if (val != (TYPE)((mype + 1) % npes)) { \ printf( \ @@ -98,12 +98,12 @@ enum op { mype, #OP, #TYPE); \ rc = EXIT_FAILURE; \ } \ - roc_shmem_free(remote); \ - if (rc == EXIT_FAILURE) roc_shmem_global_exit(1); \ + rocshmem_free(remote); \ + if (rc == EXIT_FAILURE) rocshmem_global_exit(1); \ } while (false) int main(int argc, char *argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; @@ -186,6 +186,6 @@ int main(int argc, char *argv[]) { TEST_SHMEM_FETCH(CTX_ATOMIC_FETCH_NBI, ptrdiff_t, ptrdiff); */ - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/cxx_test_shmem_atomic_inc.cpp b/tests/sos_tests/cxx_test_shmem_atomic_inc.cpp index a6b2424864..b8c5174a84 100644 --- a/tests/sos_tests/cxx_test_shmem_atomic_inc.cpp +++ b/tests/sos_tests/cxx_test_shmem_atomic_inc.cpp @@ -35,7 +35,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -51,98 +51,98 @@ enum op { }; #ifdef ENABLE_DEPRECATED_TESTS -#define DEPRECATED_INC(TYPENAME, ...) roc_shmem_##TYPENAME##_inc(__VA_ARGS__) -#define DEPRECATED_FINC(TYPENAME, ...) roc_shmem_##TYPENAME##_finc(__VA_ARGS__) +#define DEPRECATED_INC(TYPENAME, ...) rocshmem_##TYPENAME##_inc(__VA_ARGS__) +#define DEPRECATED_FINC(TYPENAME, ...) rocshmem_##TYPENAME##_finc(__VA_ARGS__) #else #define DEPRECATED_INC(TYPENAME, ...) \ - roc_shmem_##TYPENAME##_atomic_inc(__VA_ARGS__) + rocshmem_##TYPENAME##_atomic_inc(__VA_ARGS__) #define DEPRECATED_FINC(TYPENAME, ...) 
\ - roc_shmem_##TYPENAME##_atomic_fetch_inc(__VA_ARGS__) + rocshmem_##TYPENAME##_atomic_fetch_inc(__VA_ARGS__) #endif /* ENABLE_DEPRECATED_TESTS */ -#define SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) \ - case ATOMIC_FETCH_INC_NBI: \ - roc_shmem_##TYPENAME##_atomic_fetch_inc_nbi(&old, remote, i); \ - roc_shmem_quiet(); \ - if (old > (TYPE)npes) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ - #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; \ - case CTX_ATOMIC_FETCH_INC_NBI: \ - roc_shmem_ctx_##TYPENAME##_atomic_fetch_inc_nbi(ROC_SHMEM_CTX_DEFAULT, \ - &old, remote, i); \ - roc_shmem_quiet(); \ - if (old > (TYPE)npes) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ - #TYPE); \ - rc = EXIT_FAILURE; \ - } \ +#define SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME) \ + case ATOMIC_FETCH_INC_NBI: \ + rocshmem_##TYPENAME##_atomic_fetch_inc_nbi(&old, remote, i); \ + rocshmem_quiet(); \ + if (old > (TYPE)npes) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ + #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + case CTX_ATOMIC_FETCH_INC_NBI: \ + rocshmem_ctx_##TYPENAME##_atomic_fetch_inc_nbi(ROCSHMEM_CTX_DEFAULT, \ + &old, remote, i); \ + rocshmem_quiet(); \ + if (old > (TYPE)npes) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, #OP, \ + #TYPE); \ + rc = EXIT_FAILURE; \ + } \ break; -#define TEST_SHMEM_INC(OP, TYPE, TYPENAME) \ - do { \ - TYPE *remote; \ - TYPE old; \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ - remote = (TYPE *)roc_shmem_malloc(sizeof(TYPE)); \ - *remote = (TYPE)0; \ - roc_shmem_barrier_all(); \ - for (int i = 0; i < npes; i++) switch (OP) { \ - case INC: \ - DEPRECATED_INC(TYPENAME, remote, i); \ - break; \ - case ATOMIC_INC: \ - roc_shmem_##TYPENAME##_atomic_inc(remote, i); \ - break; \ - case CTX_ATOMIC_INC: \ - roc_shmem_ctx_##TYPENAME##_atomic_inc(ROC_SHMEM_CTX_DEFAULT, remote, \ - i); \ - break; \ - case FINC: \ - 
old = DEPRECATED_FINC(TYPENAME, remote, i); \ - if (old > (TYPE)npes) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ - #OP, #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; \ - case ATOMIC_FETCH_INC: \ - old = roc_shmem_##TYPENAME##_atomic_fetch_inc(remote, i); \ - if (old > (TYPE)npes) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ - #OP, #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; \ - case CTX_ATOMIC_FETCH_INC: \ - old = roc_shmem_ctx_##TYPENAME##_atomic_fetch_inc( \ - ROC_SHMEM_CTX_DEFAULT, remote, i); \ - if (old > (TYPE)npes) { \ - printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ - #OP, #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - break; \ - /*SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME)*/ \ - default: \ - printf("Invalid operation (%d)\n", OP); \ - roc_shmem_global_exit(1); \ - } \ - roc_shmem_barrier_all(); \ - if ((*remote) != (TYPE)npes) { \ - printf("PE %i observed error with TEST_SHMEM_INC(%s, %s)\n", mype, #OP, \ - #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - roc_shmem_free(remote); \ - if (rc == EXIT_FAILURE) roc_shmem_global_exit(1); \ +#define TEST_SHMEM_INC(OP, TYPE, TYPENAME) \ + do { \ + TYPE *remote; \ + TYPE old; \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ + remote = (TYPE *)rocshmem_malloc(sizeof(TYPE)); \ + *remote = (TYPE)0; \ + rocshmem_barrier_all(); \ + for (int i = 0; i < npes; i++) switch (OP) { \ + case INC: \ + DEPRECATED_INC(TYPENAME, remote, i); \ + break; \ + case ATOMIC_INC: \ + rocshmem_##TYPENAME##_atomic_inc(remote, i); \ + break; \ + case CTX_ATOMIC_INC: \ + rocshmem_ctx_##TYPENAME##_atomic_inc(ROCSHMEM_CTX_DEFAULT, remote, \ + i); \ + break; \ + case FINC: \ + old = DEPRECATED_FINC(TYPENAME, remote, i); \ + if (old > (TYPE)npes) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ + #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + case ATOMIC_FETCH_INC: \ + old = 
rocshmem_##TYPENAME##_atomic_fetch_inc(remote, i); \ + if (old > (TYPE)npes) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ + #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + case CTX_ATOMIC_FETCH_INC: \ + old = rocshmem_ctx_##TYPENAME##_atomic_fetch_inc( \ + ROCSHMEM_CTX_DEFAULT, remote, i); \ + if (old > (TYPE)npes) { \ + printf("PE %i error inconsistent value of old (%s, %s)\n", mype, \ + #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + break; \ + /*SHMEM_NBI_OPS_CASES(OP, TYPE, TYPENAME)*/ \ + default: \ + printf("Invalid operation (%d)\n", OP); \ + rocshmem_global_exit(1); \ + } \ + rocshmem_barrier_all(); \ + if ((*remote) != (TYPE)npes) { \ + printf("PE %i observed error with TEST_SHMEM_INC(%s, %s)\n", mype, #OP, \ + #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + rocshmem_free(remote); \ + if (rc == EXIT_FAILURE) rocshmem_global_exit(1); \ } while (false) int main(int argc, char *argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; @@ -254,6 +254,6 @@ int main(int argc, char *argv[]) { TEST_SHMEM_INC(CTX_ATOMIC_FETCH_INC_NBI, ptrdiff_t, ptrdiff); */ - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/cxx_test_shmem_g.cpp b/tests/sos_tests/cxx_test_shmem_g.cpp index b7fd676632..a7dc2ac0ba 100644 --- a/tests/sos_tests/cxx_test_shmem_g.cpp +++ b/tests/sos_tests/cxx_test_shmem_g.cpp @@ -35,36 +35,36 @@ #include #include -#include +#include using namespace rocshmem; -#define TEST_SHMEM_G(USE_CTX, TYPE, TYPENAME) \ - do { \ - TYPE* remote; \ - remote = (TYPE*)roc_shmem_malloc(sizeof(TYPE)); \ - TYPE val; \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ - *remote = (TYPE)mype; \ - roc_shmem_barrier_all(); \ - if (USE_CTX) \ - val = roc_shmem_ctx_##TYPENAME##_g(ROC_SHMEM_CTX_DEFAULT, remote, \ - (mype + 1) % npes); \ - else \ - val = roc_shmem_##TYPENAME##_g(remote, (mype + 1) % npes); \ - if (val != (TYPE)((mype + 1) % npes)) { \ - printf( \ - "PE %i 
received incorrect value with" \ - "TEST_SHMEM_G(%d, %s)\n", \ - mype, (int)(USE_CTX), #TYPE); \ - rc = EXIT_FAILURE; \ - roc_shmem_global_exit(1); \ - } \ +#define TEST_SHMEM_G(USE_CTX, TYPE, TYPENAME) \ + do { \ + TYPE* remote; \ + remote = (TYPE*)rocshmem_malloc(sizeof(TYPE)); \ + TYPE val; \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ + *remote = (TYPE)mype; \ + rocshmem_barrier_all(); \ + if (USE_CTX) \ + val = rocshmem_ctx_##TYPENAME##_g(ROCSHMEM_CTX_DEFAULT, remote, \ + (mype + 1) % npes); \ + else \ + val = rocshmem_##TYPENAME##_g(remote, (mype + 1) % npes); \ + if (val != (TYPE)((mype + 1) % npes)) { \ + printf( \ + "PE %i received incorrect value with" \ + "TEST_SHMEM_G(%d, %s)\n", \ + mype, (int)(USE_CTX), #TYPE); \ + rc = EXIT_FAILURE; \ + rocshmem_global_exit(1); \ + } \ } while (false) int main(int argc, char* argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; TEST_SHMEM_G(0, float, float); @@ -117,6 +117,6 @@ int main(int argc, char* argv[]) { // TEST_SHMEM_G(1, size_t, size); // TEST_SHMEM_G(1, ptrdiff_t, ptrdiff); - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/cxx_test_shmem_p.cpp b/tests/sos_tests/cxx_test_shmem_p.cpp index 47407cdbf5..9dc25b7005 100644 --- a/tests/sos_tests/cxx_test_shmem_p.cpp +++ b/tests/sos_tests/cxx_test_shmem_p.cpp @@ -35,34 +35,34 @@ #include #include -#include +#include using namespace rocshmem; #define TEST_SHMEM_P(USE_CTX, TYPE, TYPENAME) \ do { \ TYPE* remote; \ - remote = (TYPE*)roc_shmem_malloc(sizeof(TYPE)); \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ + remote = (TYPE*)rocshmem_malloc(sizeof(TYPE)); \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ if (USE_CTX) \ - roc_shmem_ctx_##TYPENAME##_p(ROC_SHMEM_CTX_DEFAULT, remote, (TYPE)mype, \ + rocshmem_ctx_##TYPENAME##_p(ROCSHMEM_CTX_DEFAULT, remote, (TYPE)mype, \ (mype + 1) % npes); \ else \ - 
roc_shmem_##TYPENAME##_p(remote, (TYPE)mype, (mype + 1) % npes); \ - roc_shmem_barrier_all(); \ + rocshmem_##TYPENAME##_p(remote, (TYPE)mype, (mype + 1) % npes); \ + rocshmem_barrier_all(); \ if ((*remote) != (TYPE)((mype + npes - 1) % npes)) { \ printf( \ "PE %i received incorrect value with " \ "TEST_SHMEM_P(%d, %s)\n", \ mype, (int)(USE_CTX), #TYPE); \ rc = EXIT_FAILURE; \ - roc_shmem_global_exit(1); \ + rocshmem_global_exit(1); \ } \ } while (false) int main(int argc, char* argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; TEST_SHMEM_P(0, float, float); @@ -115,6 +115,6 @@ int main(int argc, char* argv[]) { // TEST_SHMEM_P(1, size_t, size); // TEST_SHMEM_P(1, ptrdiff_t, ptrdiff); - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/cxx_test_shmem_test.cpp b/tests/sos_tests/cxx_test_shmem_test.cpp index 3a2f6ddb4a..299847c7f0 100644 --- a/tests/sos_tests/cxx_test_shmem_test.cpp +++ b/tests/sos_tests/cxx_test_shmem_test.cpp @@ -35,33 +35,33 @@ #include #include -#include +#include using namespace rocshmem; -#define TEST_SHMEM_TEST(TYPE, TYPENAME) \ - do { \ - TYPE *remote; \ - remote = (TYPE *)roc_shmem_malloc(sizeof(TYPE)); \ - *remote = 0; \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ - roc_shmem_##TYPENAME##_p(remote, (TYPE)mype + 1, (mype + 1) % npes); \ - while (!roc_shmem_##TYPENAME##_test(remote, ROC_SHMEM_CMP_NE, 0)) \ - ; \ - if ((*remote) != (TYPE)((mype + npes - 1) % npes) + 1) { \ - printf( \ - "PE %i received incorrect value with " \ - "TEST_SHMEM_TEST(%s)\n", \ - mype, #TYPE); \ - rc = EXIT_FAILURE; \ - roc_shmem_global_exit(1); \ - } \ - roc_shmem_free(remote); \ +#define TEST_SHMEM_TEST(TYPE, TYPENAME) \ + do { \ + TYPE *remote; \ + remote = (TYPE *)rocshmem_malloc(sizeof(TYPE)); \ + *remote = 0; \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ + rocshmem_##TYPENAME##_p(remote, (TYPE)mype + 1, (mype + 1) % npes); \ + while 
(!rocshmem_##TYPENAME##_test(remote, ROCSHMEM_CMP_NE, 0)) \ + ; \ + if ((*remote) != (TYPE)((mype + npes - 1) % npes) + 1) { \ + printf( \ + "PE %i received incorrect value with " \ + "TEST_SHMEM_TEST(%s)\n", \ + mype, #TYPE); \ + rc = EXIT_FAILURE; \ + rocshmem_global_exit(1); \ + } \ + rocshmem_free(remote); \ } while (false) int main(int argc, char *argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; TEST_SHMEM_TEST(short, short); @@ -79,6 +79,6 @@ int main(int argc, char *argv[]) { // TEST_SHMEM_TEST(size_t, size); // TEST_SHMEM_TEST(ptrdiff_t, ptrdiff); - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/cxx_test_shmem_wait_until.cpp b/tests/sos_tests/cxx_test_shmem_wait_until.cpp index 7102750748..4dfa985bd9 100644 --- a/tests/sos_tests/cxx_test_shmem_wait_until.cpp +++ b/tests/sos_tests/cxx_test_shmem_wait_until.cpp @@ -35,32 +35,32 @@ #include #include -#include +#include using namespace rocshmem; -#define TEST_SHMEM_WAIT_UNTIL(TYPE, TYPENAME) \ - do { \ - TYPE *remote = 0; \ - remote = (TYPE *)roc_shmem_malloc(sizeof(TYPE)); \ - *remote = 0; \ - const int mype = roc_shmem_my_pe(); \ - const int npes = roc_shmem_n_pes(); \ - roc_shmem_##TYPENAME##_p(remote, (TYPE)mype + 1, (mype + 1) % npes); \ - roc_shmem_##TYPENAME##_wait_until(remote, ROC_SHMEM_CMP_NE, 0); \ - if ((*remote) != (TYPE)((mype + npes - 1) % npes) + 1) { \ - printf( \ - "PE %i received incorrect value with " \ - "TEST_SHMEM_WAIT_UNTIL(%s)\n", \ - mype, #TYPE); \ - rc = EXIT_FAILURE; \ - roc_shmem_global_exit(1); \ - } \ - roc_shmem_free(remote); \ +#define TEST_SHMEM_WAIT_UNTIL(TYPE, TYPENAME) \ + do { \ + TYPE *remote = 0; \ + remote = (TYPE *)rocshmem_malloc(sizeof(TYPE)); \ + *remote = 0; \ + const int mype = rocshmem_my_pe(); \ + const int npes = rocshmem_n_pes(); \ + rocshmem_##TYPENAME##_p(remote, (TYPE)mype + 1, (mype + 1) % npes); \ + rocshmem_##TYPENAME##_wait_until(remote, ROCSHMEM_CMP_NE, 0); \ + if ((*remote) != (TYPE)((mype 
+ npes - 1) % npes) + 1) { \ + printf( \ + "PE %i received incorrect value with " \ + "TEST_SHMEM_WAIT_UNTIL(%s)\n", \ + mype, #TYPE); \ + rc = EXIT_FAILURE; \ + rocshmem_global_exit(1); \ + } \ + rocshmem_free(remote); \ } while (false) int main(int argc, char *argv[]) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; TEST_SHMEM_WAIT_UNTIL(short, short); @@ -78,6 +78,6 @@ int main(int argc, char *argv[]) { // TEST_SHMEM_WAIT_UNTIL(size_t, size); // TEST_SHMEM_WAIT_UNTIL(ptrdiff_t, ptrdiff); - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/get1.cpp b/tests/sos_tests/get1.cpp index f83de90c7d..951144ee27 100644 --- a/tests/sos_tests/get1.cpp +++ b/tests/sos_tests/get1.cpp @@ -32,7 +32,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -42,43 +42,43 @@ int main(int argc, char *argv[]) { int i; long *source; - roc_shmem_init(); + rocshmem_init(); - if (roc_shmem_n_pes() == 1) { + if (rocshmem_n_pes() == 1) { printf("%s: Requires number of PEs > 1\n", argv[0]); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } - source = (long *)roc_shmem_malloc(10 * sizeof(long)); + source = (long *)rocshmem_malloc(10 * sizeof(long)); for (i = 0; i < 10; i++) { source[i] = i + 1; } - roc_shmem_barrier_all(); /* sync sender and receiver */ + rocshmem_barrier_all(); /* sync sender and receiver */ - if (roc_shmem_my_pe() == 0) { + if (rocshmem_my_pe() == 0) { memset(target, 0, sizeof(target)); /* put 10 elements into target on PE 1 */ - roc_shmem_long_get(target, source, 10, 1); + rocshmem_long_get(target, source, 10, 1); } - roc_shmem_barrier_all(); /* sync sender and receiver */ + rocshmem_barrier_all(); /* sync sender and receiver */ - if (roc_shmem_my_pe() == 0) { + if (rocshmem_my_pe() == 0) { if (0 != memcmp(source, target, sizeof(long) * 10)) { - fprintf(stderr, "[%d] Src & Target mismatch?\n", roc_shmem_my_pe()); + fprintf(stderr, "[%d] Src & Target mismatch?\n", rocshmem_my_pe()); for (i = 0; i < 
10; ++i) { printf("%ld,%ld ", source[i], target[i]); } printf("\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } } - roc_shmem_free(source); + rocshmem_free(source); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/get_nbi.cpp b/tests/sos_tests/get_nbi.cpp index c56c9c91f3..0455b4ea32 100644 --- a/tests/sos_tests/get_nbi.cpp +++ b/tests/sos_tests/get_nbi.cpp @@ -36,7 +36,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -47,39 +47,39 @@ int main(int argc, char *argv[]) { int failed = 0; long *source; - roc_shmem_init(); + rocshmem_init(); - source = (long *)roc_shmem_malloc(10 * sizeof(long)); + source = (long *)rocshmem_malloc(10 * sizeof(long)); for (i = 0; i < 10; i++) { source[i] = i + 1; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - if (roc_shmem_my_pe() == 0) { - num_pes = roc_shmem_n_pes(); + if (rocshmem_my_pe() == 0) { + num_pes = rocshmem_n_pes(); for (j = 0; j < num_pes; j++) { memset(target, 0, sizeof(long) * 10); - roc_shmem_long_get_nbi(target, source, 10, j); - roc_shmem_quiet(); + rocshmem_long_get_nbi(target, source, 10, j); + rocshmem_quiet(); for (i = 0; i < 10; i++) { if (source[i] != target[i]) { fprintf(stderr, "[%d] get_nbi from PE %d: target[%d] = %ld, expected %ld\n", - roc_shmem_my_pe(), j, i, target[i], source[i]); + rocshmem_my_pe(), j, i, target[i], source[i]); failed = 1; } } - if (failed) roc_shmem_global_exit(1); + if (failed) rocshmem_global_exit(1); } } - roc_shmem_free(source); + rocshmem_free(source); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/global_exit.cpp b/tests/sos_tests/global_exit.cpp index 2b182751da..440bfb65f6 100644 --- a/tests/sos_tests/global_exit.cpp +++ b/tests/sos_tests/global_exit.cpp @@ -29,22 +29,22 @@ #include #include -#include +#include using namespace rocshmem; int main(int argc, char* argv[]) { - roc_shmem_init(); + rocshmem_init(); - if (roc_shmem_my_pe() == 0) { - 
roc_shmem_global_exit(0); + if (rocshmem_my_pe() == 0) { + rocshmem_global_exit(0); abort(); } /* All other PEs wait in this barrier */ - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/hello.cpp b/tests/sos_tests/hello.cpp index 943f66701c..9dc4fc6a39 100644 --- a/tests/sos_tests/hello.cpp +++ b/tests/sos_tests/hello.cpp @@ -32,7 +32,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -41,22 +41,22 @@ int main(int argc, char* argv[], char* envp[]) { /* ** Starts/Initializes SHMEM/OpenSHMEM */ - roc_shmem_init(); + rocshmem_init(); /* ** Fetch the number or processes ** Some implementations use num_pes(); */ - myshmem_n_pes = roc_shmem_n_pes(); + myshmem_n_pes = rocshmem_n_pes(); /* ** Assign my process ID to me */ - me = roc_shmem_my_pe(); + me = rocshmem_my_pe(); if (NULL == getenv("MAKELEVEL")) { printf("Hello World from %d of %d\n", me, myshmem_n_pes); } - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/lfinc.cpp b/tests/sos_tests/lfinc.cpp index 9402520c7e..d79bfb2cac 100644 --- a/tests/sos_tests/lfinc.cpp +++ b/tests/sos_tests/lfinc.cpp @@ -29,7 +29,7 @@ * SOFTWARE. 
*/ -/* long_finc neighbor - Perf test roc_shmem_atomic_fetch_inc(); */ +/* long_finc neighbor - Perf test rocshmem_atomic_fetch_inc(); */ #include #include @@ -38,7 +38,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -63,41 +63,41 @@ int main(int argc, char *argv[]) { if (argc > 1) loops = atoi(argv[1]); - roc_shmem_init(); + rocshmem_init(); - my_pe = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + my_pe = rocshmem_my_pe(); + npes = rocshmem_n_pes(); if (loops <= 0) { if (my_pe == 0) printf("Error: loops must be greater than 0\n"); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } - data = (long *)roc_shmem_malloc(data_sz); + data = (long *)rocshmem_malloc(data_sz); if (!data) { - fprintf(stderr, "[%d] roc_shmem_malloc(%ld) failure? %d\n", my_pe, data_sz, + fprintf(stderr, "[%d] rocshmem_malloc(%ld) failure? %d\n", my_pe, data_sz, errno); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } memset((void *)data, 0, data_sz); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); neighbor = (my_pe + 1) % npes; start_time = shmem_wtime(); for (j = 0, elapsed = 0.0; j < loops; j++) { start_time = shmem_wtime(); - lval = roc_shmem_int64_atomic_fetch_inc((int64_t *)&data[1], neighbor); + lval = rocshmem_int64_atomic_fetch_inc((int64_t *)&data[1], neighbor); elapsed += shmem_wtime() - start_time; if (lval != (long)j) { fprintf(stderr, "[%d] Test: FAIL previous val %ld != %d Exit.\n", my_pe, lval, j); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); rc = 0; if (data[1] != (long)loops) { @@ -115,13 +115,13 @@ int main(int argc, char *argv[]) { fprintf(stderr, "[%d] finc neighbor: PASSED.\n", my_pe); fprintf( stderr, - "[%d] %d loops of roc_shmem_int64_atomic_fetch_inc() in %6.4f secs\n" - " %2.6f usecs per roc_shmem_int64_atomic_fetch_inc()\n", + "[%d] %d loops of rocshmem_int64_atomic_fetch_inc() in %6.4f secs\n" + " %2.6f usecs per rocshmem_int64_atomic_fetch_inc()\n", 
my_pe, loops, elapsed, ((elapsed * 100000.0) / (double)loops)); } - roc_shmem_free(data); + rocshmem_free(data); - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/many-ctx.cpp b/tests/sos_tests/many-ctx.cpp index 08b674873e..c71f6013e6 100644 --- a/tests/sos_tests/many-ctx.cpp +++ b/tests/sos_tests/many-ctx.cpp @@ -28,7 +28,7 @@ #include #include -#include +#include #define NUM_CTX 32 @@ -37,42 +37,42 @@ using namespace rocshmem; int main(int argc, char **argv) { int me, npes, i; int errors = 0; - roc_shmem_ctx_t ctx[NUM_CTX]; + rocshmem_ctx_t ctx[NUM_CTX]; - roc_shmem_init(); + rocshmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); - int64_t *data = (int64_t *)roc_shmem_malloc(sizeof(int64_t)); + int64_t *data = (int64_t *)rocshmem_malloc(sizeof(int64_t)); /* Initialize the counter */ memset(data, 0, sizeof(int64_t)); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); for (i = 0; i < NUM_CTX; i++) { - int err = roc_shmem_ctx_create(0, &ctx[i]); + int err = rocshmem_ctx_create(0, &ctx[i]); if (err) { printf("%d: Warning, could not create context %d (%d)\n", me, i, err); - ctx[i] = ROC_SHMEM_CTX_DEFAULT; + ctx[i] = ROCSHMEM_CTX_DEFAULT; } } for (i = 0; i < NUM_CTX; i++) - roc_shmem_ctx_int64_atomic_inc(ctx[i], data, (me + 1) % npes); + rocshmem_ctx_int64_atomic_inc(ctx[i], data, (me + 1) % npes); - for (i = 0; i < NUM_CTX; i++) roc_shmem_ctx_quiet(ctx[i]); + for (i = 0; i < NUM_CTX; i++) rocshmem_ctx_quiet(ctx[i]); - roc_shmem_sync_all(); + rocshmem_sync_all(); if ((*data) != NUM_CTX) { printf("%d: error expected %d, got %ld\n", me, NUM_CTX, (*data)); ++errors; } - roc_shmem_free(data); + rocshmem_free(data); - roc_shmem_finalize(); + rocshmem_finalize(); return errors; } diff --git a/tests/sos_tests/max_reduction.cpp b/tests/sos_tests/max_reduction.cpp index b5025ea044..24897b0af0 100644 --- a/tests/sos_tests/max_reduction.cpp +++ 
b/tests/sos_tests/max_reduction.cpp @@ -37,14 +37,14 @@ #include #include -#include +#include using namespace rocshmem; #define N 3 #define MAX(a, b) ((a) > (b)) ? (a) : (b) -#define WRK_SIZE MAX(N / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE) +#define WRK_SIZE MAX(N / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE) int main(int argc, char *argv[]) { int i, Verbose = 0; @@ -63,34 +63,34 @@ int main(int argc, char *argv[]) { Verbose = 1; } else if (strncmp(argv[1], "-h", 3) == 0) { fprintf(stderr, "usage: %s {v(verbose)|h(help)}\n", pgm); - roc_shmem_finalize(); + rocshmem_finalize(); exit(1); } } - roc_shmem_init(); + rocshmem_init(); - src = (long *)roc_shmem_malloc(N * sizeof(long)); + src = (long *)rocshmem_malloc(N * sizeof(long)); for (i = 0; i < N; i += 1) { - src[i] = roc_shmem_my_pe() + i; + src[i] = rocshmem_my_pe() + i; } - dst = (long *)roc_shmem_malloc(N * sizeof(long)); + dst = (long *)rocshmem_malloc(N * sizeof(long)); - pSync = (long *)roc_shmem_malloc(ROC_SHMEM_REDUCE_SYNC_SIZE * sizeof(long)); - for (i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i += 1) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; + pSync = (long *)rocshmem_malloc(ROCSHMEM_REDUCE_SYNC_SIZE * sizeof(long)); + for (i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i += 1) { + pSync[i] = ROCSHMEM_SYNC_VALUE; } - pWrk = (long *)roc_shmem_malloc(WRK_SIZE * sizeof(long)); + pWrk = (long *)rocshmem_malloc(WRK_SIZE * sizeof(long)); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_long_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst, src, N, 0, 0, - roc_shmem_n_pes(), pWrk, pSync); + rocshmem_ctx_long_max_to_all(ROCSHMEM_CTX_DEFAULT, dst, src, N, 0, 0, + rocshmem_n_pes(), pWrk, pSync); if (Verbose) { - printf("%d/%d\tdst =", roc_shmem_my_pe(), roc_shmem_n_pes()); + printf("%d/%d\tdst =", rocshmem_my_pe(), rocshmem_n_pes()); for (i = 0; i < N; i += 1) { printf(" %ld", dst[i]); } @@ -98,19 +98,19 @@ int main(int argc, char *argv[]) { } for (i = 0; i < N; i += 1) { - if (dst[i] != roc_shmem_n_pes() - 1 + i) { - 
printf("[%3d] Error: dst[%d] == %ld, expected %ld\n", roc_shmem_my_pe(), - i, dst[i], roc_shmem_n_pes() - 1 + (long)i); - roc_shmem_global_exit(1); + if (dst[i] != rocshmem_n_pes() - 1 + i) { + printf("[%3d] Error: dst[%d] == %ld, expected %ld\n", rocshmem_my_pe(), + i, dst[i], rocshmem_n_pes() - 1 + (long)i); + rocshmem_global_exit(1); } } - roc_shmem_free(dst); - roc_shmem_free(src); - roc_shmem_free(pSync); - roc_shmem_free(pWrk); + rocshmem_free(dst); + rocshmem_free(src); + rocshmem_free(pSync); + rocshmem_free(pWrk); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/micro_unit_shmem.cpp b/tests/sos_tests/micro_unit_shmem.cpp index 267eab8482..0391a93c01 100644 --- a/tests/sos_tests/micro_unit_shmem.cpp +++ b/tests/sos_tests/micro_unit_shmem.cpp @@ -38,7 +38,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -54,7 +54,7 @@ int debug; static inline void wait_until(long *wait_var, int iterations, int pe) { if (debug) printf("PE %d waiting...%ld\n", pe, *wait_var); - roc_shmem_long_wait_until(wait_var, ROC_SHMEM_CMP_EQ, iterations); + rocshmem_long_wait_until(wait_var, ROCSHMEM_CMP_EQ, iterations); if (debug) printf("PE %d wait_until passed\n", pe); } @@ -71,7 +71,7 @@ static inline void post_op_check(const char *op, int check_var, int iterations, if (check_var != iterations) { fprintf(stderr, "%s ERR: PE %d source = %d != %d\n", op, pe, check_var, iterations); - roc_shmem_global_exit(EXIT_FAILURE); + rocshmem_global_exit(EXIT_FAILURE); } } @@ -82,11 +82,11 @@ static inline void putfence(int me, int iterations, int T) { if (me == 0) { for (i = 1; i < iterations; i++) { - roc_shmem_long_p(&target[T], i, 1); - roc_shmem_fence(); + rocshmem_long_p(&target[T], i, 1); + rocshmem_fence(); } - roc_shmem_long_p(&target[T], i, 1); + rocshmem_long_p(&target[T], i, 1); } else wait_until(&target[T], iterations, 1); @@ -101,13 +101,13 @@ static inline void gettest(int me, int iterations, int T, int S, int P) { if (me 
== 1) { pre_op_check(__func__, target[T], iterations, 1); - roc_shmem_long_p(&source[S], iterations, 0); - roc_shmem_fence(); + rocshmem_long_p(&source[S], iterations, 0); + rocshmem_fence(); for (i = 0; i < iterations; i++) - target[T] = roc_shmem_long_g(&source[S], 0); + target[T] = rocshmem_long_g(&source[S], 0); - roc_shmem_long_p(&sync_pes[P], iterations, 0); + rocshmem_long_p(&sync_pes[P], iterations, 0); post_op_check("get", target[T], iterations, 1); @@ -125,14 +125,14 @@ static inline void atomic_inc(int me, int iterations, int T) { if (me == 1) pre_op_check(__func__, target[T], iterations, 1); target[T] = 0; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (me == 0) { for (i = 0; i < iterations; i++) { - roc_shmem_int64_atomic_inc((int64_t *)&target[T], 1); - roc_shmem_fence(); + rocshmem_int64_atomic_inc((int64_t *)&target[T], 1); + rocshmem_fence(); } - roc_shmem_int64_atomic_inc((int64_t *)&target[T], 1); + rocshmem_int64_atomic_inc((int64_t *)&target[T], 1); if (debug) printf("PE 0 done with operation\n"); @@ -150,14 +150,14 @@ static inline void atomic_add(int me, int iterations, int T) { if (me == 0) pre_op_check(__func__, target[T], iterations, 0); target[T] = 0; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (me == 1) { for (i = 0; i < iterations; i++) { - roc_shmem_int64_atomic_add((int64_t *)&target[T], 1, 0); - roc_shmem_fence(); + rocshmem_int64_atomic_add((int64_t *)&target[T], 1, 0); + rocshmem_fence(); } - roc_shmem_int64_atomic_add((int64_t *)&target[T], 1, 0); + rocshmem_int64_atomic_add((int64_t *)&target[T], 1, 0); if (debug) printf("PE 1 done with operation\n"); @@ -178,7 +178,7 @@ static inline void swaptest(int me, int iterations, int T, int S, int P) target[T] = tswap; source[S] = sswap; - roc_shmem_barrier_all(); /* Ensure target/source initialization completed */ + rocshmem_barrier_all(); /* Ensure target/source initialization completed */ /* if (me == 0) @@ -186,9 +186,9 @@ static inline void swaptest(int me, int 
iterations, int T, int S, int P) if (me == 0) { for (i = 0; i < iterations; i++) - source[S] = roc_shmem_long_atomic_swap(&target[T], source[S], 1); + source[S] = rocshmem_long_atomic_swap(&target[T], source[S], 1); - roc_shmem_long_p(&sync_pes[P], i, 1); + rocshmem_long_p(&sync_pes[P], i, 1); if (debug) printf("AFTER flag PE 0 value of source is %d" @@ -199,7 +199,7 @@ static inline void swaptest(int me, int iterations, int T, int S, int P) (source[S] != sswap))) { fprintf(stderr, "swap ERR: PE 0 source = %d\n", source[S]); - roc_shmem_global_exit(EXIT_FAILURE); + rocshmem_global_exit(EXIT_FAILURE); } } else { @@ -210,7 +210,7 @@ static inline void swaptest(int me, int iterations, int T, int S, int P) (target[T] != tswap))) { fprintf(stderr, "swap ERR: PE 0 target = %d \n", target[T]); - roc_shmem_global_exit(EXIT_FAILURE); + rocshmem_global_exit(EXIT_FAILURE); } } @@ -227,16 +227,16 @@ static inline void cswaptest(int me, int iterations, int T, int S, int P) { source[S] = -100; target[T] = 0; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (me == 1) { pre_op_check(__func__, source[S], iterations, 1); for (i = 0; i < iterations; i++) - source[S] = roc_shmem_int64_atomic_compare_swap((int64_t *)&(target[T]), + source[S] = rocshmem_int64_atomic_compare_swap((int64_t *)&(target[T]), i, (i + 1), 0); - roc_shmem_long_p(&sync_pes[P], i, 0); + rocshmem_long_p(&sync_pes[P], i, 0); post_op_check("compare_swap", source[S], (iterations - 1), 1); @@ -246,7 +246,7 @@ static inline void cswaptest(int me, int iterations, int T, int S, int P) { if (target[T] != iterations) { fprintf(stderr, "compare_swap ERR: PE 1 target = %ld != %d\n", target[T], iterations); - roc_shmem_global_exit(EXIT_FAILURE); + rocshmem_global_exit(EXIT_FAILURE); } } @@ -261,7 +261,7 @@ static inline void fetchatomic_add(int me, int iterations, int T, int S) { if (me == 1) pre_op_check(__func__, target[T], iterations, 1); target[T] = 0; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (me == 
0) { if (debug) { @@ -272,10 +272,10 @@ static inline void fetchatomic_add(int me, int iterations, int T, int S) { } for (i = 0; i < iterations; i++) { - source[S] = roc_shmem_int64_atomic_fetch_add((int64_t *)&target[T], 1, 1); - roc_shmem_fence(); + source[S] = rocshmem_int64_atomic_fetch_add((int64_t *)&target[T], 1, 1); + rocshmem_fence(); } - source[S] = roc_shmem_int64_atomic_fetch_add((int64_t *)&target[T], 1, 1); + source[S] = rocshmem_int64_atomic_fetch_add((int64_t *)&target[T], 1, 1); post_op_check("fetch_add", source[S], iterations, 0); @@ -293,7 +293,7 @@ static inline void fetchatomic_inc(int me, int iterations, int T, int S) { if (me == 0) pre_op_check(__func__, target[T], iterations, 0); target[T] = 0; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (me == 1) { if (debug) { @@ -301,8 +301,8 @@ static inline void fetchatomic_inc(int me, int iterations, int T, int S) { } for (i = 0; i < iterations; i++) { - source[S] = roc_shmem_int64_atomic_fetch_inc((int64_t *)&target[T], 0); - roc_shmem_fence(); + source[S] = rocshmem_int64_atomic_fetch_inc((int64_t *)&target[T], 0); + rocshmem_fence(); } post_op_check("fetch_inc", source[S], (iterations - 1), 1); @@ -321,20 +321,20 @@ int main(int argc, char **argv) { const int DEFAULT_ITR = 7; int iterations = DEFAULT_ITR; - roc_shmem_init(); + rocshmem_init(); - me = roc_shmem_my_pe(); - nproc = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + nproc = rocshmem_n_pes(); - target = (long *)roc_shmem_malloc(NUM_WRITE * sizeof(long)); - source = (long *)roc_shmem_malloc(NUM_READ * sizeof(long)); - sync_pes = (long *)roc_shmem_malloc(NUM_SYNC * sizeof(long)); + target = (long *)rocshmem_malloc(NUM_WRITE * sizeof(long)); + source = (long *)rocshmem_malloc(NUM_READ * sizeof(long)); + sync_pes = (long *)rocshmem_malloc(NUM_SYNC * sizeof(long)); memset(target, -1, NUM_WRITE * sizeof(int)); memset(source, -1, NUM_READ * sizeof(int)); memset(sync_pes, -1, NUM_SYNC * sizeof(int)); - roc_shmem_barrier_all(); + 
rocshmem_barrier_all(); if (nproc != 2) { if (me == 0) { @@ -344,7 +344,7 @@ int main(int argc, char **argv) { " are using %d\n", nproc); } - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } @@ -402,7 +402,7 @@ int main(int argc, char **argv) { "to run individual tests: -i , -v" ", -d, -p, -g, -a, -A, -s, -c, -f, -F, -h\n"); } - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } @@ -425,11 +425,11 @@ int main(int argc, char **argv) { printf("PE 0 Successful exit\n"); } - roc_shmem_free(target); - roc_shmem_free(source); - roc_shmem_free(sync_pes); + rocshmem_free(target); + rocshmem_free(source); + rocshmem_free(sync_pes); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/mt_a2a.cpp b/tests/sos_tests/mt_a2a.cpp index 4f87181b23..1990414ec7 100644 --- a/tests/sos_tests/mt_a2a.cpp +++ b/tests/sos_tests/mt_a2a.cpp @@ -35,7 +35,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -57,17 +57,17 @@ static void *thread_main(void *arg) { /* TEST CONCURRENT ATOMICS */ val = me; for (i = 1; i <= npes; i++) - roc_shmem_int64_atomic_add(&dest[tid], val, (me + i) % npes); + rocshmem_int64_atomic_add(&dest[tid], val, (me + i) % npes); /* Ensure that fence does not overlap with communication calls */ pthread_barrier_wait(&fencebar); - if (tid == 0) roc_shmem_fence(); + if (tid == 0) rocshmem_fence(); pthread_barrier_wait(&fencebar); for (i = 1; i <= npes; i++) - roc_shmem_int64_atomic_inc(&flag[tid], (me + i) % npes); + rocshmem_int64_atomic_inc(&flag[tid], (me + i) % npes); - roc_shmem_long_wait_until(&flag[tid], ROC_SHMEM_CMP_EQ, npes); + rocshmem_long_wait_until(&flag[tid], ROCSHMEM_CMP_EQ, npes); expected = (npes - 1) * npes / 2; if (dest[tid] != expected || flag[tid] != npes) { @@ -81,21 +81,21 @@ static void *thread_main(void *arg) { } pthread_barrier_wait(&fencebar); - if (0 == tid) roc_shmem_barrier_all(); + if (0 == tid) rocshmem_barrier_all(); pthread_barrier_wait(&fencebar); /* TEST CONCURRENT 
PUTS */ val = -1; - roc_shmem_long_put(&dest[tid], &val, 1, (me + 1) % npes); + rocshmem_long_put(&dest[tid], &val, 1, (me + 1) % npes); /* Ensure that all puts are issued before the shmem barrier is called. */ pthread_barrier_wait(&fencebar); - if (0 == tid) roc_shmem_barrier_all(); + if (0 == tid) rocshmem_barrier_all(); pthread_barrier_wait(&fencebar); /* TEST CONCURRENT GETS */ for (i = 1; i <= npes; i++) { - roc_shmem_long_get(&val, &dest[tid], 1, (me + i) % npes); + rocshmem_long_get(&val, &dest[tid], 1, (me + i) % npes); expected = -1; if (val != expected) { @@ -110,7 +110,7 @@ static void *thread_main(void *arg) { } pthread_barrier_wait(&fencebar); - if (0 == tid) roc_shmem_barrier_all(); + if (0 == tid) rocshmem_barrier_all(); return NULL; } @@ -120,21 +120,21 @@ int main(int argc, char **argv) { pthread_t threads[T]; int t_arg[T]; - roc_shmem_init_thread(ROC_SHMEM_THREAD_MULTIPLE, &tl); + rocshmem_init_thread(ROCSHMEM_THREAD_MULTIPLE, &tl); - if (tl != ROC_SHMEM_THREAD_MULTIPLE) { + if (tl != ROCSHMEM_THREAD_MULTIPLE) { printf("Init failed (requested thread level %d, got %d)\n", - ROC_SHMEM_THREAD_MULTIPLE, tl); - roc_shmem_global_exit(1); + ROCSHMEM_THREAD_MULTIPLE, tl); + rocshmem_global_exit(1); } - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); pthread_barrier_init(&fencebar, NULL, T); - dest = (long *)roc_shmem_malloc(sizeof(long) * T); - flag = (long *)roc_shmem_malloc(sizeof(long) * T); + dest = (long *)rocshmem_malloc(sizeof(long) * T); + flag = (long *)rocshmem_malloc(sizeof(long) * T); if (me == 0) printf("Starting multithreaded test on %d PEs, %d threads/PE\n", npes, T); @@ -163,9 +163,9 @@ int main(int argc, char **argv) { printf("Success\n"); } - roc_shmem_free(dest); - roc_shmem_free(flag); + rocshmem_free(dest); + rocshmem_free(flag); - roc_shmem_finalize(); + rocshmem_finalize(); return (errors == 0) ? 
0 : 1; } diff --git a/tests/sos_tests/mt_contention.cpp b/tests/sos_tests/mt_contention.cpp index ff14b77946..f3cd0b14db 100644 --- a/tests/sos_tests/mt_contention.cpp +++ b/tests/sos_tests/mt_contention.cpp @@ -33,7 +33,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -53,9 +53,9 @@ static void *thread_main(void *arg) { * with overlapping AMOs behaves correctly. */ for (i = 1; i <= npes; i++) - roc_shmem_int64_atomic_add(dest, tid, (me + i) % npes); + rocshmem_int64_atomic_add(dest, tid, (me + i) % npes); - roc_shmem_quiet(); + rocshmem_quiet(); return NULL; } @@ -65,21 +65,21 @@ int main(int argc, char **argv) { pthread_t threads[T]; int t_arg[T]; - roc_shmem_init_thread(ROC_SHMEM_THREAD_MULTIPLE, &tl); + rocshmem_init_thread(ROCSHMEM_THREAD_MULTIPLE, &tl); - if (tl != ROC_SHMEM_THREAD_MULTIPLE) { + if (tl != ROCSHMEM_THREAD_MULTIPLE) { printf("Init failed (requested thread level %d, got %d)\n", - ROC_SHMEM_THREAD_MULTIPLE, tl); - roc_shmem_global_exit(1); + ROCSHMEM_THREAD_MULTIPLE, tl); + rocshmem_global_exit(1); } - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); if (me == 0) printf("Starting multithreaded test on %d PEs, %d threads/PE\n", npes, T); - dest = (long *)roc_shmem_malloc(sizeof(long)); + dest = (long *)rocshmem_malloc(sizeof(long)); *dest = 0; for (i = 0; i < T; i++) { @@ -95,7 +95,7 @@ int main(int argc, char **argv) { assert(0 == err); } - roc_shmem_sync_all(); + rocshmem_sync_all(); if ((*dest) != ((T - 1) * T / 2) * npes) { printf("%d: dest = %ld, expected %d\n", me, *dest, @@ -103,8 +103,8 @@ int main(int argc, char **argv) { errors++; } - roc_shmem_free(dest); + rocshmem_free(dest); - roc_shmem_finalize(); + rocshmem_finalize(); return (errors == 0) ? 
0 : 1; } diff --git a/tests/sos_tests/pi.cpp b/tests/sos_tests/pi.cpp index 0a777e0f00..ea85e397a4 100644 --- a/tests/sos_tests/pi.cpp +++ b/tests/sos_tests/pi.cpp @@ -31,32 +31,32 @@ #include #include -#include +#include using namespace rocshmem; #define NUM_POINTS 10000 int main(int argc, char *argv[], char *envp[]) { - int me, myroc_shmem_n_pes; + int me, myrocshmem_n_pes; long long *inside, *total; /* ** Starts/Initializes SHMEM/OpenSHMEM */ - roc_shmem_init(); + rocshmem_init(); /* ** Fetch the number or processes ** Some implementations use num_pes(); */ - myroc_shmem_n_pes = roc_shmem_n_pes(); + myrocshmem_n_pes = rocshmem_n_pes(); /* ** Assign my process ID to me */ - me = roc_shmem_my_pe(); + me = rocshmem_my_pe(); - inside = (long long *)roc_shmem_malloc(sizeof(long long)); - total = (long long *)roc_shmem_malloc(sizeof(long long)); + inside = (long long *)rocshmem_malloc(sizeof(long long)); + total = (long long *)rocshmem_malloc(sizeof(long long)); *inside = *total = 0; srand(1 + me); @@ -71,15 +71,15 @@ int main(int argc, char *argv[], char *envp[]) { } } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); int errors = 0; if (me == 0) { - for (int i = 1; i < myroc_shmem_n_pes; ++i) { + for (int i = 1; i < myrocshmem_n_pes; ++i) { long long remoteInside, remoteTotal; - roc_shmem_longlong_get(&remoteInside, inside, 1, i); - roc_shmem_longlong_get(&remoteTotal, total, 1, i); + rocshmem_longlong_get(&remoteInside, inside, 1, i); + rocshmem_longlong_get(&remoteTotal, total, 1, i); (*total) += remoteTotal; (*inside) += remoteInside; } @@ -91,15 +91,15 @@ int main(int argc, char *argv[], char *envp[]) { } if (NULL == getenv("MAKELEVEL")) { - printf("Pi from %llu points on %d PEs: %lf\n", *total, myroc_shmem_n_pes, + printf("Pi from %llu points on %d PEs: %lf\n", *total, myrocshmem_n_pes, approx_pi); } } - roc_shmem_free(inside); - roc_shmem_free(total); + rocshmem_free(inside); + rocshmem_free(total); - roc_shmem_finalize(); + rocshmem_finalize(); return 
errors; } diff --git a/tests/sos_tests/ping.cpp b/tests/sos_tests/ping.cpp index 221b2a5ee6..fe6add096d 100644 --- a/tests/sos_tests/ping.cpp +++ b/tests/sos_tests/ping.cpp @@ -41,19 +41,19 @@ #include #include -#include +#include using namespace rocshmem; #define Rfprintf \ - if (roc_shmem_my_pe() == 0) fprintf + if (rocshmem_my_pe() == 0) fprintf #define Rprintf \ - if (roc_shmem_my_pe() == 0) printf + if (rocshmem_my_pe() == 0) printf #define RDprintf \ - if (Verbose && roc_shmem_my_pe() == 0) printf + if (Verbose && rocshmem_my_pe() == 0) printf #define RDfprintf \ - if (Verbose && roc_shmem_my_pe() == 0) fprintf + if (Verbose && rocshmem_my_pe() == 0) fprintf /* option flags */ #define OUTPUT_MOD 1 // output debug every X loops @@ -75,13 +75,13 @@ int main(int argc, char *argv[]) { char *prog_name; long *Target; - roc_shmem_init(); - proc = roc_shmem_my_pe(); - num_procs = roc_shmem_n_pes(); + rocshmem_init(); + proc = rocshmem_my_pe(); + num_procs = rocshmem_n_pes(); if (num_procs == 1) { Rfprintf(stderr, "ERR - Requires > 1 PEs\n"); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } @@ -106,17 +106,17 @@ int main(int argc, char *argv[]) { "ERR - output modulo arg out of " "bounds '%d'?]\n", output_mod); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } Rfprintf(stderr, "%s: output modulo %d\n", prog_name, output_mod); break; case 'h': Rfprintf(stderr, "usage: %s {nWords-2-put} {Loop-count}\n", prog_name); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; default: - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } @@ -128,7 +128,7 @@ int main(int argc, char *argv[]) { if (nWords <= 0 || nWords > TARGET_SZ) { Rfprintf(stderr, "ERR - nWords arg out of bounds '%d' [1..%d]?\n", nWords, TARGET_SZ); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } @@ -139,7 +139,7 @@ int main(int argc, char *argv[]) { loops = atoi(argv[optind++]); if (loops <= 0 || loops > 1000000) { Rfprintf(stderr, "ERR - loops arg out of bounds 
'%d'?\n", loops); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } @@ -148,33 +148,33 @@ int main(int argc, char *argv[]) { for (j = 0; j < nWords; j++) src[j] = VAL; - Target = (long *)roc_shmem_malloc(TARGET_SZ * sizeof(long)); + Target = (long *)rocshmem_malloc(TARGET_SZ * sizeof(long)); for (j = 0; j < loops; j++) { - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (Verbose && (j == 0 || (j % output_mod) == 0)) - fprintf(stderr, "[%d] +(%d)\n", roc_shmem_my_pe(), j); + fprintf(stderr, "[%d] +(%d)\n", rocshmem_my_pe(), j); if (proc == 0) { int p; for (p = 1; p < num_procs; p++) - roc_shmem_long_put(Target, src, nWords, p); + rocshmem_long_put(Target, src, nWords, p); } else { if (Slow) { /* wait for each put to complete */ for (k = 0; k < nWords; k++) - roc_shmem_long_wait_until(&Target[k], ROC_SHMEM_CMP_NE, proc); + rocshmem_long_wait_until(&Target[k], ROCSHMEM_CMP_NE, proc); } else { /* wait for last word to be written */ - roc_shmem_long_wait_until(&Target[nWords - 1], ROC_SHMEM_CMP_NE, proc); + rocshmem_long_wait_until(&Target[nWords - 1], ROCSHMEM_CMP_NE, proc); } } if (Verbose && (j == 0 || (j % output_mod) == 0)) - fprintf(stderr, "[%d] -(%d)\n", roc_shmem_my_pe(), j); + fprintf(stderr, "[%d] -(%d)\n", rocshmem_my_pe(), j); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (proc != 0) { for (k = 0; k < nWords; k++) { @@ -192,14 +192,14 @@ int main(int argc, char *argv[]) { memset(Target, 0, TARGET_SZ); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (failures || Verbose) Rprintf("%d(%d) Exit(%d)\n", proc, num_procs, failures); - roc_shmem_free(Target); + rocshmem_free(Target); - roc_shmem_finalize(); + rocshmem_finalize(); return failures; } diff --git a/tests/sos_tests/put1.cpp b/tests/sos_tests/put1.cpp index 6780dab65c..8803d1e852 100644 --- a/tests/sos_tests/put1.cpp +++ b/tests/sos_tests/put1.cpp @@ -32,7 +32,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -40,38 +40,38 @@ int main(int argc, 
char *argv[]) { long source[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; long *target; - roc_shmem_init(); + rocshmem_init(); - if (roc_shmem_n_pes() == 1) { + if (rocshmem_n_pes() == 1) { printf("%s: Requires number of PEs > 1\n", argv[0]); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } - target = (long *)roc_shmem_malloc(10 * sizeof(long)); + target = (long *)rocshmem_malloc(10 * sizeof(long)); - if (roc_shmem_my_pe() == 0) { + if (rocshmem_my_pe() == 0) { /* put 10 elements into target on PE 1 */ - roc_shmem_long_put(target, source, 10, 1); + rocshmem_long_put(target, source, 10, 1); } - roc_shmem_barrier_all(); /* sync sender and receiver */ + rocshmem_barrier_all(); /* sync sender and receiver */ - if (roc_shmem_my_pe() == 1) { + if (rocshmem_my_pe() == 1) { if (0 != memcmp(source, target, sizeof(long) * 10)) { int i; - fprintf(stderr, "[%d] Src & Target mismatch?\n", roc_shmem_my_pe()); + fprintf(stderr, "[%d] Src & Target mismatch?\n", rocshmem_my_pe()); for (i = 0; i < 10; ++i) { printf("%ld,%ld ", source[i], target[i]); } printf("\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } } - roc_shmem_free(target); + rocshmem_free(target); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/put_ctx_mbw_mr.cpp b/tests/sos_tests/put_ctx_mbw_mr.cpp index 2cc991dc38..a9f9d816c6 100644 --- a/tests/sos_tests/put_ctx_mbw_mr.cpp +++ b/tests/sos_tests/put_ctx_mbw_mr.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -16,7 +16,7 @@ using namespace rocshmem; #define LARGE_MSG_TH 16384 #define DEF_LARGE_NUM_MESSAGES 64000 -/* An ROC_SHMEM+threads put message-rate +/* An rocSHMEM+threads put message-rate * and bandwidth benchmark. 
* * Always with 2 processes @@ -46,7 +46,7 @@ int run_bench(int rank, int size) { size_t buffer_size, contig_buffer_size; double *t_elapsed; double msg_rate, my_msg_rate, bandwidth, my_bandwidth; - roc_shmem_ctx_t *ctx; + rocshmem_ctx_t *ctx; char *dest_buf, *source_buf; num_messages = WINDOW_SIZE * (num_messages / num_threads / WINDOW_SIZE); @@ -54,7 +54,7 @@ int run_bench(int rank, int size) { t_elapsed = (double *)calloc(num_threads, sizeof(double)); /* Allocate array of ctxs */ - ctx = (roc_shmem_ctx_t *)malloc(sizeof(roc_shmem_ctx_t) * num_threads); + ctx = (rocshmem_ctx_t *)malloc(sizeof(rocshmem_ctx_t) * num_threads); /** * Allocate contiguous buffer for all the threads on the target. @@ -63,17 +63,17 @@ int run_bench(int rank, int size) { buffer_size = (message_size + CACHE_LINE_SIZE) * sizeof(char); contig_buffer_size = buffer_size * num_threads; - dest_buf = (char *)roc_shmem_malloc(contig_buffer_size); + dest_buf = (char *)rocshmem_malloc(contig_buffer_size); memset(dest_buf, 0, sizeof(contig_buffer_size)); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); /* Create windows */ for (i = 0; i < num_threads; i++) { - int err = roc_shmem_ctx_create(0, &ctx[i]); + int err = rocshmem_ctx_create(0, &ctx[i]); if (err) { printf("PE %d: Warning, could not create context %d (%d)\n", rank, i, err); - ctx[i] = ROC_SHMEM_CTX_DEFAULT; + ctx[i] = ROCSHMEM_CTX_DEFAULT; } } @@ -87,7 +87,7 @@ int run_bench(int rank, int size) { int tid; int win_i, win_post_i, win_posts; int my_message_size; - roc_shmem_ctx_t my_ctx; + rocshmem_ctx_t my_ctx; tid = omp_get_thread_num(); my_message_size = message_size; @@ -107,14 +107,14 @@ int run_bench(int rank, int size) { /* Warmup */ for (win_post_i = 0; win_post_i < win_posts; win_post_i++) { for (win_i = 0; win_i < WINDOW_SIZE; win_i++) { - roc_shmem_ctx_putmem_nbi(my_ctx, my_dest_buf, my_source_buf, + rocshmem_ctx_putmem_nbi(my_ctx, my_dest_buf, my_source_buf, my_message_size, rank + 1); } - roc_shmem_ctx_quiet(my_ctx); + 
rocshmem_ctx_quiet(my_ctx); } #pragma omp master - { roc_shmem_barrier_all(); } + { rocshmem_barrier_all(); } #pragma omp barrier /* Benchmark */ @@ -122,10 +122,10 @@ int run_bench(int rank, int size) { for (win_post_i = 0; win_post_i < win_posts; win_post_i++) { for (win_i = 0; win_i < WINDOW_SIZE; win_i++) { - roc_shmem_ctx_putmem_nbi(my_ctx, my_dest_buf, my_source_buf, + rocshmem_ctx_putmem_nbi(my_ctx, my_dest_buf, my_source_buf, my_message_size, rank + 1); } - roc_shmem_ctx_quiet(my_ctx); + rocshmem_ctx_quiet(my_ctx); } t_end = get_time(); @@ -138,14 +138,14 @@ int run_bench(int rank, int size) { /* Warmup */ #pragma omp master - { roc_shmem_barrier_all(); } + { rocshmem_barrier_all(); } #pragma omp barrier /* Benchmark */ } } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (rank % 2 == 0) { int thread_i; @@ -169,9 +169,9 @@ int run_bench(int rank, int size) { printf("%f\n", bandwidth); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - for (i = 0; i < num_threads; i++) roc_shmem_ctx_destroy(ctx[i]); + for (i = 0; i < num_threads; i++) rocshmem_ctx_destroy(ctx[i]); free(ctx); free(t_elapsed); hipFree(source_buf); @@ -227,17 +227,17 @@ int main(int argc, char *argv[]) { if (num_messages == DEF_NUM_MESSAGES) num_messages = DEF_LARGE_NUM_MESSAGES; } - roc_shmem_init(); + rocshmem_init(); - size = roc_shmem_n_pes(); + size = rocshmem_n_pes(); if (size != 2) { printf("Run with only two processes.\n"); - roc_shmem_finalize(); + rocshmem_finalize(); } omp_set_num_threads(num_threads); - rank = roc_shmem_my_pe(); + rank = rocshmem_my_pe(); ret = run_bench(rank, size); if (ret) { @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) { ret = EXIT_FAILURE; } - roc_shmem_finalize(); + rocshmem_finalize(); return ret; } diff --git a/tests/sos_tests/put_nbi.cpp b/tests/sos_tests/put_nbi.cpp index 198cfd5ffd..ac8b8d1a80 100644 --- a/tests/sos_tests/put_nbi.cpp +++ b/tests/sos_tests/put_nbi.cpp @@ -36,7 +36,7 @@ #include #include -#include +#include using 
namespace rocshmem; @@ -47,49 +47,49 @@ int main(int argc, char *argv[]) { int i, num_pes; int failed = 0; - roc_shmem_init(); + rocshmem_init(); - target = (long *)roc_shmem_malloc(sizeof(long) * 10); - flag = (int *)roc_shmem_malloc(sizeof(int)); + target = (long *)rocshmem_malloc(sizeof(long) * 10); + flag = (int *)rocshmem_malloc(sizeof(int)); if (!flag) { fprintf(stderr, "ERR - null flag pointer\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } *flag = 0; - num_pes = roc_shmem_n_pes(); + num_pes = rocshmem_n_pes(); if (target) { memset(target, 0, sizeof(long) * 10); } else { fprintf(stderr, "ERR - null target pointer\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - if (roc_shmem_my_pe() == 0) { + if (rocshmem_my_pe() == 0) { for (i = 0; i < num_pes; i++) { - roc_shmem_long_put_nbi(target, source, 10, i); - roc_shmem_fence(); - roc_shmem_int64_atomic_inc((int64_t *)flag, i); + rocshmem_long_put_nbi(target, source, 10, i); + rocshmem_fence(); + rocshmem_int64_atomic_inc((int64_t *)flag, i); } } - roc_shmem_int_wait_until(flag, ROC_SHMEM_CMP_EQ, 1); + rocshmem_int_wait_until(flag, ROCSHMEM_CMP_EQ, 1); for (i = 0; i < 10; i++) { if (target[i] != source[i]) { fprintf(stderr, "[%d] target[%d] = %ld, expected %ld\n", - roc_shmem_my_pe(), i, target[i], source[i]); + rocshmem_my_pe(), i, target[i], source[i]); failed = 1; } } - roc_shmem_free(target); - roc_shmem_free(flag); + rocshmem_free(target); + rocshmem_free(flag); - roc_shmem_finalize(); + rocshmem_finalize(); return failed; } diff --git a/tests/sos_tests/query_thread.cpp b/tests/sos_tests/query_thread.cpp index 1387318f7f..fea6288a10 100644 --- a/tests/sos_tests/query_thread.cpp +++ b/tests/sos_tests/query_thread.cpp @@ -27,7 +27,7 @@ #include -#include +#include using namespace rocshmem; @@ -35,22 +35,22 @@ int main(int argc, char* argv[]) { int provided; int tl, ret; - roc_shmem_init_thread(ROC_SHMEM_THREAD_FUNNELED, &tl); + 
rocshmem_init_thread(ROCSHMEM_THREAD_FUNNELED, &tl); - if (tl < ROC_SHMEM_THREAD_FUNNELED || ret != 0) { + if (tl < ROCSHMEM_THREAD_FUNNELED || ret != 0) { printf("Init failed (requested thread level %d, got %d)\n", - ROC_SHMEM_THREAD_FUNNELED, tl); - roc_shmem_global_exit(1); + ROCSHMEM_THREAD_FUNNELED, tl); + rocshmem_global_exit(1); } - roc_shmem_query_thread(&provided); - printf("%d: Query result for thread level %d\n", roc_shmem_my_pe(), provided); + rocshmem_query_thread(&provided); + printf("%d: Query result for thread level %d\n", rocshmem_my_pe(), provided); - if (provided < ROC_SHMEM_THREAD_FUNNELED) { + if (provided < ROCSHMEM_THREAD_FUNNELED) { printf("Error: thread support changed to an invalid level after init\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/reduce_active_set.cpp b/tests/sos_tests/reduce_active_set.cpp index f86f706377..e2a5506128 100644 --- a/tests/sos_tests/reduce_active_set.cpp +++ b/tests/sos_tests/reduce_active_set.cpp @@ -27,7 +27,7 @@ #include -#include +#include using namespace rocshmem; @@ -40,14 +40,14 @@ int main(void) { long *min_psync, *max_psync; long *min_pwrk, *max_pwrk; - roc_shmem_init(); + rocshmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); - src = (long *)roc_shmem_malloc(NELEM * sizeof(long)); - dst_max = (long *)roc_shmem_malloc(NELEM * sizeof(long)); - dst_min = (long *)roc_shmem_malloc(NELEM * sizeof(long)); + src = (long *)rocshmem_malloc(NELEM * sizeof(long)); + dst_max = (long *)rocshmem_malloc(NELEM * sizeof(long)); + dst_min = (long *)rocshmem_malloc(NELEM * sizeof(long)); for (i = 0; i < NELEM; i++) { src[i] = me; @@ -56,22 +56,22 @@ int main(void) { } max_psync = - (long *)roc_shmem_malloc(ROC_SHMEM_REDUCE_SYNC_SIZE * sizeof(long)); + (long *)rocshmem_malloc(ROCSHMEM_REDUCE_SYNC_SIZE * sizeof(long)); min_psync = - (long 
*)roc_shmem_malloc(ROC_SHMEM_REDUCE_SYNC_SIZE * sizeof(long)); - for (i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) { - max_psync[i] = ROC_SHMEM_SYNC_VALUE; - min_psync[i] = ROC_SHMEM_SYNC_VALUE; + (long *)rocshmem_malloc(ROCSHMEM_REDUCE_SYNC_SIZE * sizeof(long)); + for (i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) { + max_psync[i] = ROCSHMEM_SYNC_VALUE; + min_psync[i] = ROCSHMEM_SYNC_VALUE; } - max_pwrk = (long *)roc_shmem_malloc( - (NELEM / 2 + ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE) * sizeof(long)); - min_pwrk = (long *)roc_shmem_malloc( - (NELEM / 2 + ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE) * sizeof(long)); + max_pwrk = (long *)rocshmem_malloc( + (NELEM / 2 + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE) * sizeof(long)); + min_pwrk = (long *)rocshmem_malloc( + (NELEM / 2 + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE) * sizeof(long)); if (me == 0) printf("Shrinking active set test\n"); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); /* A total of npes tests are performed, where the active set in each test * includes PEs i..npes-1 */ @@ -81,7 +81,7 @@ int main(void) { if (me == i) printf(" + PE_start=%d, logPE_stride=0, PE_size=%d\n", i, npes - i); - roc_shmem_ctx_long_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst_max, src, NELEM, i, + rocshmem_ctx_long_max_to_all(ROCSHMEM_CTX_DEFAULT, dst_max, src, NELEM, i, 0, npes - i, max_pwrk, max_psync); /* Validate reduced data */ @@ -96,7 +96,7 @@ int main(void) { } } - roc_shmem_ctx_long_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst_min, src, NELEM, i, + rocshmem_ctx_long_min_to_all(ROCSHMEM_CTX_DEFAULT, dst_min, src, NELEM, i, 0, npes - i, min_pwrk, min_psync); /* Validate reduced data */ @@ -112,17 +112,17 @@ int main(void) { } } - roc_shmem_free(src); - roc_shmem_free(dst_max); - roc_shmem_free(dst_min); + rocshmem_free(src); + rocshmem_free(dst_max); + rocshmem_free(dst_min); - roc_shmem_free(max_psync); - roc_shmem_free(min_psync); + rocshmem_free(max_psync); + rocshmem_free(min_psync); - roc_shmem_free(max_pwrk); - roc_shmem_free(min_pwrk); + 
rocshmem_free(max_pwrk); + rocshmem_free(min_pwrk); - roc_shmem_finalize(); + rocshmem_finalize(); return errors != 0; } diff --git a/tests/sos_tests/reduce_in_place.cpp b/tests/sos_tests/reduce_in_place.cpp index 891dd9e97a..625b0262a2 100644 --- a/tests/sos_tests/reduce_in_place.cpp +++ b/tests/sos_tests/reduce_in_place.cpp @@ -27,7 +27,7 @@ #include -#include +#include using namespace rocshmem; @@ -38,24 +38,24 @@ int main(void) { int errors = 0; long *psync, *pwrk, *src; - roc_shmem_init(); + rocshmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); - src = (long *)roc_shmem_malloc(NELEM * sizeof(long)); + src = (long *)rocshmem_malloc(NELEM * sizeof(long)); for (int i = 0; i < NELEM; i++) src[i] = me; - psync = (long *)roc_shmem_malloc(ROC_SHMEM_REDUCE_SYNC_SIZE * sizeof(long)); - for (int i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) - psync[i] = ROC_SHMEM_SYNC_VALUE; + psync = (long *)rocshmem_malloc(ROCSHMEM_REDUCE_SYNC_SIZE * sizeof(long)); + for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) + psync[i] = ROCSHMEM_SYNC_VALUE; - pwrk = (long *)roc_shmem_malloc( - (NELEM / 2 + ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE) * sizeof(long)); + pwrk = (long *)rocshmem_malloc( + (NELEM / 2 + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE) * sizeof(long)); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_long_max_to_all(ROC_SHMEM_CTX_DEFAULT, src, src, NELEM, 0, 0, + rocshmem_ctx_long_max_to_all(ROCSHMEM_CTX_DEFAULT, src, src, NELEM, 0, 0, npes, pwrk, psync); /* Validate reduced data */ @@ -68,11 +68,11 @@ int main(void) { } } - roc_shmem_free(src); - roc_shmem_free(psync); - roc_shmem_free(pwrk); + rocshmem_free(src); + rocshmem_free(psync); + rocshmem_free(pwrk); - roc_shmem_finalize(); + rocshmem_finalize(); return errors != 0; } diff --git a/tests/sos_tests/shmalloc.cpp b/tests/sos_tests/shmalloc.cpp index 461dc4a8e4..90a9b35be3 100644 --- a/tests/sos_tests/shmalloc.cpp +++ 
b/tests/sos_tests/shmalloc.cpp @@ -33,20 +33,20 @@ /* * usage: shmalloc [-p] [nWords] [loops] [incWords-per-loop] * where: -p == power-of-two allocation bump per loop - * [nWords] # of longs to roc_shmem_malloc()\n" + * [nWords] # of longs to rocshmem_malloc()\n" * [loops(1)] # of loops\n" * [incWords(2)] nWords += incWords per loop\n"); * Loop: - * PE* roc_shmem_malloc(nWords) + * PE* rocshmem_malloc(nWords) * set *DataType = 1 - * PE* roc_shmem_malloc(nWords) + * PE* rocshmem_malloc(nWords) * set *DataType = 2 - * PE* roc_shmem_malloc(nWords) + * PE* rocshmem_malloc(nWords) * set *DataType = 3 * * for(1...3) allocated ranges * verify - * roc_shmem_free() + * rocshmem_free() * end-loop */ @@ -56,7 +56,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -80,17 +80,17 @@ void usage(void); int getSize(char *); void usage(void) { - if (roc_shmem_my_pe() == 0) { + if (rocshmem_my_pe() == 0) { fprintf(stderr, "Usage: %s [-p] [nWords(%d)] [loops(%d)] [incWords(%d)]\n", pgm, DFLT_NWORDS, DFLT_LOOPS, DFLT_INCR); fprintf(stderr, - " -p == (2**0 ... 2**22) roc_shmem_malloc(), other args ignored\n" + " -p == (2**0 ... 
2**22) rocshmem_malloc(), other args ignored\n" " -v == Verbose output\n" - " [nWords] # of longs to roc_shmem_malloc()\n" + " [nWords] # of longs to rocshmem_malloc()\n" " [loops] # of loops\n" " [incWords] nWords += incWords per loop\n"); } - roc_shmem_finalize(); + rocshmem_finalize(); exit(1); } @@ -133,9 +133,9 @@ int main(int argc, char **argv) { else pgm = argv[0]; - roc_shmem_init(); - me = roc_shmem_my_pe(); - nProcs = roc_shmem_n_pes(); + rocshmem_init(); + me = rocshmem_my_pe(); + nProcs = rocshmem_n_pes(); while ((c = getopt(argc, argv, "hpv")) != -1) switch (c) { case 'p': @@ -181,61 +181,61 @@ int main(int argc, char **argv) { for (l = 0; l < loops; l++) { /* - result = (DataType *)roc_shmem_malloc(0); + result = (DataType *)rocshmem_malloc(0); if (result != NULL) { perror ("Zero-length memory allocation has non-null result"); - roc_shmem_finalize(); + rocshmem_finalize(); exit (1); } */ result_sz = nProcs * (nWords * sizeof(DataType)); - result = (DataType *)roc_shmem_malloc(result_sz); + result = (DataType *)rocshmem_malloc(result_sz); if (!result) { perror("Failed result memory allocation"); - roc_shmem_finalize(); + rocshmem_finalize(); exit(1); } for (dp = result; dp < &result[(result_sz / sizeof(DataType))];) *dp++ = 1; target_sz = nWords * sizeof(DataType); - if (!(target = (DataType *)roc_shmem_malloc(target_sz))) { + if (!(target = (DataType *)rocshmem_malloc(target_sz))) { perror("Failed target memory allocation"); - roc_shmem_finalize(); + rocshmem_finalize(); exit(1); } for (dp = target; dp < &target[(target_sz / sizeof(DataType))];) *dp++ = 2; source_sz = 2 * nWords * sizeof(DataType); - if (!(source = (DataType *)roc_shmem_malloc(source_sz))) { + if (!(source = (DataType *)rocshmem_malloc(source_sz))) { perror("Failed source memory allocation"); - roc_shmem_finalize(); + rocshmem_finalize(); exit(1); } for (dp = source; dp < &source[(source_sz / sizeof(DataType))];) *dp++ = 3; - roc_shmem_barrier_all(); /* sync sender and receiver */ + 
rocshmem_barrier_all(); /* sync sender and receiver */ for (dp = source; dp < &source[(source_sz / sizeof(DataType))]; dp++) if (*dp != 3) { printf("source not consistent @ 3?\n"); break; } - roc_shmem_free(source); + rocshmem_free(source); for (dp = target; dp < &target[(target_sz / sizeof(DataType))]; dp++) if (*dp != 2) { printf("target not consistent @ 2?\n"); break; } - roc_shmem_free(target); + rocshmem_free(target); for (dp = result; dp < &result[(result_sz / sizeof(DataType))]; dp++) if (*dp != 1) { printf("result not consistent @ 1?\n"); break; } - roc_shmem_free(result); + rocshmem_free(result); if (loops > 1) { if (Verbose && me == 0) { @@ -249,7 +249,7 @@ int main(int argc, char **argv) { } } - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/shmem_team_b2b_collectives.cpp b/tests/sos_tests/shmem_team_b2b_collectives.cpp index b19b25352f..8ca9b3641a 100644 --- a/tests/sos_tests/shmem_team_b2b_collectives.cpp +++ b/tests/sos_tests/shmem_team_b2b_collectives.cpp @@ -27,7 +27,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -68,33 +68,33 @@ static void error_check(int *errors, int *total_errors, char *routine, int me) { int main(void) { int errors = 0, total_errors = 0; - roc_shmem_init(); - int me = roc_shmem_my_pe(); + rocshmem_init(); + int me = rocshmem_my_pe(); - long *dest = (long *)roc_shmem_malloc(NELEMS * sizeof(long)); - long *src = (long *)roc_shmem_malloc(NELEMS * sizeof(long)); + long *dest = (long *)rocshmem_malloc(NELEMS * sizeof(long)); + long *src = (long *)rocshmem_malloc(NELEMS * sizeof(long)); size_t i; for (i = 0; i < NELEMS; i++) { src[i] = me; } - TEST_B2B_COLLECTIVE("broadcast", roc_shmem_ctx_long_broadcast, - ROC_SHMEM_CTX_DEFAULT, ROC_SHMEM_TEAM_WORLD, dest, src, + TEST_B2B_COLLECTIVE("broadcast", rocshmem_ctx_long_broadcast, + ROCSHMEM_CTX_DEFAULT, ROCSHMEM_TEAM_WORLD, dest, src, NELEMS, 0); - TEST_B2B_COLLECTIVE("reduce", roc_shmem_ctx_long_sum_to_all, - 
ROC_SHMEM_CTX_DEFAULT, ROC_SHMEM_TEAM_WORLD, dest, src, + TEST_B2B_COLLECTIVE("reduce", rocshmem_ctx_long_sum_to_all, + ROCSHMEM_CTX_DEFAULT, ROCSHMEM_TEAM_WORLD, dest, src, NELEMS); - // TEST_B2B_COLLECTIVE("collect", roc_shmem_long_collect, SHMEM_TEAM_WORLD, - // dest, src, NELEMS); TEST_B2B_COLLECTIVE("fcollect", roc_shmem_long_fcollect, + // TEST_B2B_COLLECTIVE("collect", rocshmem_long_collect, SHMEM_TEAM_WORLD, + // dest, src, NELEMS); TEST_B2B_COLLECTIVE("fcollect", rocshmem_long_fcollect, // SHMEM_TEAM_WORLD, dest, src, NELEMS); TEST_B2B_COLLECTIVE("alltoall", - // roc_shmem_long_alltoall, SHMEM_TEAM_WORLD, dest, src, NELEMS); - // TEST_B2B_COLLECTIVE("alltoalls", roc_shmem_long_alltoalls, + // rocshmem_long_alltoall, SHMEM_TEAM_WORLD, dest, src, NELEMS); + // TEST_B2B_COLLECTIVE("alltoalls", rocshmem_long_alltoalls, // SHMEM_TEAM_WORLD, dest, src, 1, 1, NELEMS); - roc_shmem_free(dest); - roc_shmem_free(src); + rocshmem_free(dest); + rocshmem_free(src); - roc_shmem_finalize(); + rocshmem_finalize(); return total_errors; } diff --git a/tests/sos_tests/shmem_team_reduce.cpp b/tests/sos_tests/shmem_team_reduce.cpp index efbfd2a38d..0adc2b844d 100644 --- a/tests/sos_tests/shmem_team_reduce.cpp +++ b/tests/sos_tests/shmem_team_reduce.cpp @@ -37,7 +37,7 @@ //#include #include -#include +#include using namespace rocshmem; @@ -45,82 +45,82 @@ using namespace rocshmem; #define STRINGIFY(x) #x -#define REDUCTION(OP, TYPE) \ - do { \ - roc_shmem_ctx_##TYPE##_##OP##_to_all( \ - ROC_SHMEM_CTX_DEFAULT, ROC_SHMEM_TEAM_WORLD, dest, src, npes); \ +#define REDUCTION(OP, TYPE) \ + do { \ + rocshmem_ctx_##TYPE##_##OP##_to_all( \ + ROCSHMEM_CTX_DEFAULT, ROCSHMEM_TEAM_WORLD, dest, src, npes); \ } while (0) -#define INIT_SRC_BUFFER(TYPE) \ - do { \ - for (int i = 0; i < MAX_NPES; i++) { \ - src[i] = (TYPE)1ULL; \ - } \ +#define INIT_SRC_BUFFER(TYPE) \ + do { \ + for (int i = 0; i < MAX_NPES; i++) { \ + src[i] = (TYPE)1ULL; \ + } \ } while (0) -#define CHECK_DEST_BUFFER(OP, TYPE, 
CORRECT_VAL) \ - do { \ - for (int i = 0; i < npes; i++) { \ - if (dest[i] != (TYPE)CORRECT_VAL) { \ - printf( \ - "PE %i received incorrect value with " \ - "TEST_SHMEM_REDUCE(%s, %s)\n", \ - mype, #OP, #TYPE); \ - rc = EXIT_FAILURE; \ - } \ - } \ +#define CHECK_DEST_BUFFER(OP, TYPE, CORRECT_VAL) \ + do { \ + for (int i = 0; i < npes; i++) { \ + if (dest[i] != (TYPE)CORRECT_VAL) { \ + printf( \ + "PE %i received incorrect value with " \ + "TEST_SHMEM_REDUCE(%s, %s)\n", \ + mype, #OP, #TYPE); \ + rc = EXIT_FAILURE; \ + } \ + } \ } while (0) -#define TEST_SHMEM_REDUCE(OP, TYPENAME, TYPE) \ - do { \ - TYPE *src, *dest; \ - src = dest = nullptr; \ - src = (TYPE *)roc_shmem_malloc(sizeof(TYPE) * MAX_NPES); \ - dest = (TYPE *)roc_shmem_malloc(sizeof(TYPE) * MAX_NPES); \ - \ - INIT_SRC_BUFFER(TYPE); \ - \ - REDUCTION(OP, TYPENAME); \ - \ - roc_shmem_barrier_all(); \ - \ - std::string op = STRINGIFY(OP); \ - if (op.compare("and") == 0) { \ - CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ - } else if (op.compare("or") == 0) { \ - CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ - } else if (op.compare("xor") == 0) { \ - CHECK_DEST_BUFFER(OP, TYPE, (TYPE)(npes % 2 ? 
1ULL : 0ULL)); \ - } else if (op.compare("max") == 0) { \ - CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ - } else if (op.compare("min") == 0) { \ - CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ - } else if (op.compare("sum") == 0) { \ - CHECK_DEST_BUFFER(OP, TYPE, npes); \ - } else if (op.compare("prod") == 0) { \ - CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ - } else { \ - printf("Invalid operation (%s)\n", STRINGIFY(OP)); \ - roc_shmem_global_exit(1); \ - } \ - \ - roc_shmem_free(src); \ - roc_shmem_free(dest); \ - \ +#define TEST_SHMEM_REDUCE(OP, TYPENAME, TYPE) \ + do { \ + TYPE *src, *dest; \ + src = dest = nullptr; \ + src = (TYPE *)rocshmem_malloc(sizeof(TYPE) * MAX_NPES); \ + dest = (TYPE *)rocshmem_malloc(sizeof(TYPE) * MAX_NPES); \ + \ + INIT_SRC_BUFFER(TYPE); \ + \ + REDUCTION(OP, TYPENAME); \ + \ + rocshmem_barrier_all(); \ + \ + std::string op = STRINGIFY(OP); \ + if (op.compare("and") == 0) { \ + CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ + } else if (op.compare("or") == 0) { \ + CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ + } else if (op.compare("xor") == 0) { \ + CHECK_DEST_BUFFER(OP, TYPE, (TYPE)(npes % 2 ? 
1ULL : 0ULL)); \ + } else if (op.compare("max") == 0) { \ + CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ + } else if (op.compare("min") == 0) { \ + CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ + } else if (op.compare("sum") == 0) { \ + CHECK_DEST_BUFFER(OP, TYPE, npes); \ + } else if (op.compare("prod") == 0) { \ + CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \ + } else { \ + printf("Invalid operation (%s)\n", STRINGIFY(OP)); \ + rocshmem_global_exit(1); \ + } \ + \ + rocshmem_free(src); \ + rocshmem_free(dest); \ + \ } while (0) int main(void) { - roc_shmem_init(); + rocshmem_init(); int rc = EXIT_SUCCESS; - const int mype = roc_shmem_my_pe(); - const int npes = roc_shmem_n_pes(); + const int mype = rocshmem_my_pe(); + const int npes = rocshmem_n_pes(); if (npes > MAX_NPES) { if (mype == 0) fprintf(stderr, "ERR - Requires less than %d PEs\n", MAX_NPES); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } // TEST_SHMEM_REDUCE(and, uchar, unsigned char); @@ -284,6 +284,6 @@ int main(void) { // TEST_SHMEM_REDUCE(prod, complexd, double _Complex); // TEST_SHMEM_REDUCE(prod, complexf, float _Complex); - roc_shmem_finalize(); + rocshmem_finalize(); return rc; } diff --git a/tests/sos_tests/shmem_team_reuse_teams.cpp b/tests/sos_tests/shmem_team_reuse_teams.cpp index 4a8d1538ae..54c7611685 100644 --- a/tests/sos_tests/shmem_team_reuse_teams.cpp +++ b/tests/sos_tests/shmem_team_reuse_teams.cpp @@ -27,7 +27,7 @@ #include -#include +#include using namespace rocshmem; @@ -35,15 +35,15 @@ int main(void) { int i, me, npes; int ret = 0, errors = 0; - roc_shmem_init(); + rocshmem_init(); - me = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + me = rocshmem_my_pe(); + npes = rocshmem_n_pes(); if (me == 0) printf("Reuse teams test\n"); - roc_shmem_team_t old_team, new_team; - ret = roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, NULL, 0, + rocshmem_team_t old_team, new_team; + ret = rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, NULL, 0, &old_team); if (ret) ++errors; @@ 
-53,20 +53,20 @@ int main(void) { for (i = 1; i < npes; i++) { if (me == i) { printf("%3d: creating new team (start, stride, size): %3d, %3d, %3d\n", - me, roc_shmem_team_translate_pe(old_team, 1, ROC_SHMEM_TEAM_WORLD), - 1, roc_shmem_team_n_pes(old_team) - 1); + me, rocshmem_team_translate_pe(old_team, 1, ROCSHMEM_TEAM_WORLD), + 1, rocshmem_team_n_pes(old_team) - 1); } - ret = roc_shmem_team_split_strided( - old_team, 1, 1, roc_shmem_team_n_pes(old_team) - 1, NULL, 0, &new_team); - if (old_team != ROC_SHMEM_TEAM_INVALID && ret) ++errors; + ret = rocshmem_team_split_strided( + old_team, 1, 1, rocshmem_team_n_pes(old_team) - 1, NULL, 0, &new_team); + if (old_team != ROCSHMEM_TEAM_INVALID && ret) ++errors; - roc_shmem_team_destroy(old_team); + rocshmem_team_destroy(old_team); old_team = new_team; } - roc_shmem_team_destroy(old_team); - roc_shmem_finalize(); + rocshmem_team_destroy(old_team); + rocshmem_finalize(); return errors != 0; } diff --git a/tests/sos_tests/shmem_team_translate.cpp b/tests/sos_tests/shmem_team_translate.cpp index 1c3cbc0144..6f4f256a61 100644 --- a/tests/sos_tests/shmem_team_translate.cpp +++ b/tests/sos_tests/shmem_team_translate.cpp @@ -21,38 +21,38 @@ *****************************************************************************/ /* - * ROC_SHMEM roc_shmem_team_translate example to verify the team formed by even - * ranked PEs from ROC_SHMEM_TEAM_WORLD using the team created from - * roc_shmem_team_split_stride operation + * rocSHMEM rocshmem_team_translate example to verify the team formed by even + * ranked PEs from ROCSHMEM_TEAM_WORLD using the team created from + * rocshmem_team_split_stride operation */ #include -#include +#include using namespace rocshmem; int main(void) { int my_pe, npes, errors = 0; int t_pe_2, t_pe_3, t_pe_2_to_3, t_pe_3_to_2; - roc_shmem_team_t team_2s; - roc_shmem_team_t team_3s; - roc_shmem_team_config_t *config; + rocshmem_team_t team_2s; + rocshmem_team_t team_3s; + rocshmem_team_config_t *config; - 
roc_shmem_init(); + rocshmem_init(); config = NULL; - my_pe = roc_shmem_my_pe(); - npes = roc_shmem_n_pes(); + my_pe = rocshmem_my_pe(); + npes = rocshmem_n_pes(); - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 2, ((npes - 1) / 2) + 1, + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 2, ((npes - 1) / 2) + 1, config, 0, &team_2s); - roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 3, ((npes - 1) / 3) + 1, + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 3, ((npes - 1) / 3) + 1, config, 0, &team_3s); - t_pe_3 = roc_shmem_team_my_pe(team_3s); - t_pe_2 = roc_shmem_team_my_pe(team_2s); - t_pe_3_to_2 = roc_shmem_team_translate_pe(team_3s, t_pe_3, team_2s); - t_pe_2_to_3 = roc_shmem_team_translate_pe(team_2s, t_pe_2, team_3s); + t_pe_3 = rocshmem_team_my_pe(team_3s); + t_pe_2 = rocshmem_team_my_pe(team_2s); + t_pe_3_to_2 = rocshmem_team_translate_pe(team_3s, t_pe_3, team_2s); + t_pe_2_to_3 = rocshmem_team_translate_pe(team_2s, t_pe_2, team_3s); if (my_pe % 2 == 0 && my_pe % 3 == 0) { if (t_pe_2 == -1 || t_pe_3 == -1 || t_pe_2_to_3 == -1 || @@ -92,6 +92,6 @@ int main(void) { } } - roc_shmem_finalize(); + rocshmem_finalize(); return errors != 0; } diff --git a/tests/sos_tests/shmem_test.cpp b/tests/sos_tests/shmem_test.cpp index b3f7b9535d..cc460a825d 100644 --- a/tests/sos_tests/shmem_test.cpp +++ b/tests/sos_tests/shmem_test.cpp @@ -27,41 +27,41 @@ */ /* Each PE sends a message to every PE. PEs wait for all messages to - * arrive using roc_shmem_test to poll the array. */ + * arrive using rocshmem_test to poll the array. */ #include -#include +#include using namespace rocshmem; /* Wait for any entry in the given ivar array to match the wait criteria and * return the index of the entry that satisfied the test. 
*/ -static int wait_any(long *ivar, int count, roc_shmem_cmps cmp, long value) { +static int wait_any(long *ivar, int count, rocshmem_cmps cmp, long value) { int idx = 0; - while (!roc_shmem_long_test(&ivar[idx], cmp, value)) idx = (idx + 1) % count; + while (!rocshmem_long_test(&ivar[idx], cmp, value)) idx = (idx + 1) % count; return idx; } int main(void) { - roc_shmem_init(); - const int mype = roc_shmem_my_pe(); - const int npes = roc_shmem_n_pes(); + rocshmem_init(); + const int mype = rocshmem_my_pe(); + const int npes = rocshmem_n_pes(); - long *wait_vars = (long *)roc_shmem_malloc(npes * sizeof(long)); + long *wait_vars = (long *)rocshmem_malloc(npes * sizeof(long)); for (int i = 0; i < npes; i++) { wait_vars[i] = 0; } /* Put mype+1 to every PE */ for (int i = 0; i < npes; i++) - roc_shmem_long_p(&wait_vars[mype], mype + 1, i); + rocshmem_long_p(&wait_vars[mype], mype + 1, i); int nrecv = 0, errors = 0; /* Wait for all messages to arrive */ while (nrecv < npes) { - int who = wait_any(wait_vars, npes, ROC_SHMEM_CMP_NE, 0); + int who = wait_any(wait_vars, npes, ROCSHMEM_CMP_NE, 0); if (wait_vars[who] != who + 1) { printf("%d: wait_vars[%d] = %ld, expected %d\n", mype, who, wait_vars[who], who + 1); @@ -71,7 +71,7 @@ int main(void) { nrecv++; } - roc_shmem_free(wait_vars); - roc_shmem_finalize(); + rocshmem_free(wait_vars); + rocshmem_finalize(); return errors; } diff --git a/tests/sos_tests/sping.cpp b/tests/sos_tests/sping.cpp index 46acf0ca5d..055c00c104 100644 --- a/tests/sos_tests/sping.cpp +++ b/tests/sos_tests/sping.cpp @@ -53,7 +53,7 @@ void printStats(int, int, int, int, double); int Verbose = 0; -#include +#include using namespace rocshmem; @@ -99,7 +99,7 @@ void usage(char *name) { } void help(char *name) { - if (roc_shmem_my_pe() == 0) { + if (rocshmem_my_pe() == 0) { printf("Usage: %s [flags] nwords [maxWords] [incWords]\n\n", name); printf(" Flags may be any of\n"); printf(" -n number repititions\n"); @@ -107,7 +107,7 @@ void help(char *name) { 
printf(" -h print this info\n\n"); printf(" Numbers may be postfixed with 'k' or 'm'\n\n"); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); exit(0); } @@ -128,12 +128,12 @@ int main(int argc, char *argv[]) { long *rbuf; /* remote buffer - sink */ long *tbuf; /* transmit buffer - src */ - roc_shmem_init(); - proc = roc_shmem_my_pe(); - nproc = roc_shmem_n_pes(); + rocshmem_init(); + proc = rocshmem_my_pe(); + nproc = rocshmem_n_pes(); if (nproc == 1) { fprintf(stderr, "ERR - Requires > 1 Processing Elements\n"); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } @@ -172,13 +172,13 @@ int main(int argc, char *argv[]) { else if ((incWords = getSize(argv[optind++])) < 0) usage(progName); - if (!(rbuf = (long *)roc_shmem_malloc(maxWords * sizeof(long)))) { + if (!(rbuf = (long *)rocshmem_malloc(maxWords * sizeof(long)))) { perror("Failed memory allocation"); exit(1); } memset(rbuf, 0, maxWords * sizeof(long)); - if (!(tbuf = (long *)roc_shmem_malloc(maxWords * sizeof(long)))) { + if (!(tbuf = (long *)rocshmem_malloc(maxWords * sizeof(long)))) { perror("Failed memory allocation"); exit(1); } @@ -193,7 +193,7 @@ int main(int argc, char *argv[]) { dprint("[%d] rbuf: %ld\n", proc, (unsigned long)rbuf); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); peer = proc ^ 1; if (peer >= nproc) doprint = 0; @@ -201,37 +201,37 @@ int main(int argc, char *argv[]) { for (nwords = minWords; nwords <= maxWords; nwords = incWords ? nwords + incWords : nwords ? 
2 * nwords : 1) { r = reps; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); tv[0] = gettime(); if (peer < nproc) { if (proc & 1) { r--; - roc_shmem_long_wait_until(&rbuf[nwords - 1], ROC_SHMEM_CMP_NE, 0); + rocshmem_long_wait_until(&rbuf[nwords - 1], ROCSHMEM_CMP_NE, 0); rbuf[nwords - 1] = 0; } while (r-- > 0) { - roc_shmem_long_put(rbuf, tbuf, nwords, peer); - roc_shmem_long_wait_until(&rbuf[nwords - 1], ROC_SHMEM_CMP_NE, 0); + rocshmem_long_put(rbuf, tbuf, nwords, peer); + rocshmem_long_wait_until(&rbuf[nwords - 1], ROCSHMEM_CMP_NE, 0); rbuf[nwords - 1] = 0; } if (proc & 1) { - roc_shmem_long_put(rbuf, tbuf, nwords, peer); + rocshmem_long_put(rbuf, tbuf, nwords, peer); } } tv[1] = gettime(); t = dt(&tv[1], &tv[0]) / (2 * reps); - roc_shmem_barrier_all(); + rocshmem_barrier_all(); printStats(proc, peer, doprint, nwords, t); } - roc_shmem_free(rbuf); - roc_shmem_free(tbuf); + rocshmem_free(rbuf); + rocshmem_free(tbuf); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/thread_wait.cpp b/tests/sos_tests/thread_wait.cpp index 063dd80cb1..e8a39cbb56 100644 --- a/tests/sos_tests/thread_wait.cpp +++ b/tests/sos_tests/thread_wait.cpp @@ -26,13 +26,13 @@ */ /* Thread wait test: Test whether a store performed by one thead will wake up a - * second thread from a call to roc_shmem_wait. */ + * second thread from a call to rocshmem_wait. 
*/ #include #include #include -#include +#include using namespace rocshmem; @@ -46,12 +46,12 @@ static void *src_thread_fn(void *arg) { *shr_var = 1; /* Quiet should provide a store fence */ - roc_shmem_quiet(); + rocshmem_quiet(); return NULL; } static void *dst_thread_fn(void *arg) { - roc_shmem_long_wait_until(shr_var, ROC_SHMEM_CMP_NE, 0); + rocshmem_long_wait_until(shr_var, ROCSHMEM_CMP_NE, 0); printf("shr_var is now %ld\n", *shr_var); return NULL; } @@ -60,15 +60,15 @@ int main(int argc, char *argv[]) { int tl, ret; pthread_t src_thread, dst_thread; - roc_shmem_init_thread(ROC_SHMEM_THREAD_MULTIPLE, &tl); + rocshmem_init_thread(ROCSHMEM_THREAD_MULTIPLE, &tl); - if (tl != ROC_SHMEM_THREAD_MULTIPLE) { + if (tl != ROCSHMEM_THREAD_MULTIPLE) { printf("Init failed (requested thread level %d, got %d)\n", - ROC_SHMEM_THREAD_MULTIPLE, tl); - roc_shmem_global_exit(1); + ROCSHMEM_THREAD_MULTIPLE, tl); + rocshmem_global_exit(1); } - shr_var = (long *)roc_shmem_malloc(sizeof(long)); + shr_var = (long *)rocshmem_malloc(sizeof(long)); *shr_var = 0; pthread_create(&dst_thread, NULL, &dst_thread_fn, NULL); @@ -77,9 +77,9 @@ int main(int argc, char *argv[]) { pthread_join(dst_thread, NULL); pthread_join(src_thread, NULL); - roc_shmem_free(shr_var); + rocshmem_free(shr_var); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/threading.cpp b/tests/sos_tests/threading.cpp index 73acc59ed8..5734bd893d 100644 --- a/tests/sos_tests/threading.cpp +++ b/tests/sos_tests/threading.cpp @@ -29,7 +29,7 @@ #include #include -#include +#include using namespace rocshmem; @@ -48,27 +48,27 @@ static void *roundrobin(void *tparam) { int offset = tid * N_ELEMS; /* fprintf(stderr,"Starting thread %lu with offset %d\n",tid,offset); */ - int nextpe = (roc_shmem_my_pe() + 1) % roc_shmem_n_pes(); - int prevpe = (roc_shmem_my_pe() - 1 + roc_shmem_n_pes()) % roc_shmem_n_pes(); - roc_shmem_long_put(target + offset, source + offset, N_ELEMS, nextpe); + int nextpe = 
(rocshmem_my_pe() + 1) % rocshmem_n_pes(); + int prevpe = (rocshmem_my_pe() - 1 + rocshmem_n_pes()) % rocshmem_n_pes(); + rocshmem_long_put(target + offset, source + offset, N_ELEMS, nextpe); /* fprintf(stderr,"Thread %lu done first put\n",tid); */ pthread_barrier_wait(&fencebar); - if (tid == 0) roc_shmem_barrier_all(); + if (tid == 0) rocshmem_barrier_all(); pthread_barrier_wait(&fencebar); - roc_shmem_long_get(source + offset, target + offset, N_ELEMS, prevpe); + rocshmem_long_get(source + offset, target + offset, N_ELEMS, prevpe); /* fprintf(stderr,"Thread %lu done first get\n",tid); */ pthread_barrier_wait(&fencebar); - if (tid == 0) roc_shmem_barrier_all(); + if (tid == 0) rocshmem_barrier_all(); pthread_barrier_wait(&fencebar); - roc_shmem_long_get(target + offset, source + offset, N_ELEMS, nextpe); + rocshmem_long_get(target + offset, source + offset, N_ELEMS, nextpe); /* fprintf(stderr,"Thread %lu done second get\n",tid); */ pthread_barrier_wait(&fencebar); - if (tid == 0) roc_shmem_barrier_all(); + if (tid == 0) rocshmem_barrier_all(); pthread_barrier_wait(&fencebar); /* fprintf(stderr,"Done thread %lu\n",tid); */ @@ -79,22 +79,22 @@ int main(int argc, char *argv[]) { int i; int tl; - roc_shmem_init_thread(ROC_SHMEM_THREAD_MULTIPLE, &tl); + rocshmem_init_thread(ROCSHMEM_THREAD_MULTIPLE, &tl); - if (tl != ROC_SHMEM_THREAD_MULTIPLE) { + if (tl != ROCSHMEM_THREAD_MULTIPLE) { printf("Init failed (requested thread level %d, got %d)\n", - ROC_SHMEM_THREAD_MULTIPLE, tl); - roc_shmem_global_exit(1); + ROCSHMEM_THREAD_MULTIPLE, tl); + rocshmem_global_exit(1); } - if (roc_shmem_n_pes() == 1) { + if (rocshmem_n_pes() == 1) { printf("%s: Requires number of PEs > 1\n", argv[0]); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } - source = (long *)roc_shmem_malloc(N_THREADS * N_ELEMS * sizeof(long)); - target = (long *)roc_shmem_malloc(N_THREADS * N_ELEMS * sizeof(long)); + source = (long *)rocshmem_malloc(N_THREADS * N_ELEMS * sizeof(long)); + target = (long 
*)rocshmem_malloc(N_THREADS * N_ELEMS * sizeof(long)); for (i = 0; i < N_THREADS * N_ELEMS; ++i) { source[i] = i + 1; @@ -117,18 +117,18 @@ int main(int argc, char *argv[]) { pthread_barrier_destroy(&fencebar); if (0 != memcmp(source, target, sizeof(long) * N_THREADS * N_ELEMS)) { - fprintf(stderr, "[%d] Src & Target mismatch?\n", roc_shmem_my_pe()); + fprintf(stderr, "[%d] Src & Target mismatch?\n", rocshmem_my_pe()); for (i = 0; i < 10; ++i) { printf("%ld,%ld ", source[i], target[i]); } printf("\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } - roc_shmem_free(source); - roc_shmem_free(target); + rocshmem_free(source); + rocshmem_free(target); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } diff --git a/tests/sos_tests/to_all.cpp b/tests/sos_tests/to_all.cpp index 7556ec5244..3b506df757 100644 --- a/tests/sos_tests/to_all.cpp +++ b/tests/sos_tests/to_all.cpp @@ -65,14 +65,14 @@ #include #include -#include +#include using namespace rocshmem; #define Rprintf \ - if (roc_shmem_my_pe() == 0) printf + if (rocshmem_my_pe() == 0) printf #define Rfprintf \ - if (roc_shmem_my_pe() == 0) fprintf + if (rocshmem_my_pe() == 0) fprintf #define Vprintf \ if (Verbose > 1) printf @@ -95,7 +95,7 @@ long *pSync1; #define N 128 #define MAX(a, b) ((a) > (b)) ? 
(a) : (b) -#define WRK_SIZE MAX(N / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE) +#define WRK_SIZE MAX(N / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE) short *src0, *dst0, *pWrk0; int *src1, *dst1, *pWrk1; @@ -124,21 +124,21 @@ int max_to_all(int me, int npes) { src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_short_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, + rocshmem_ctx_short_max_to_all(ROCSHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, npes, pWrk0, pSync); - roc_shmem_ctx_int_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, + rocshmem_ctx_int_max_to_all(ROCSHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, pWrk1, pSync1); - roc_shmem_ctx_long_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, + rocshmem_ctx_long_max_to_all(ROCSHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, npes, pWrk2, pSync); - roc_shmem_ctx_float_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, + rocshmem_ctx_float_max_to_all(ROCSHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, npes, pWrk3, pSync1); - roc_shmem_ctx_double_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, + rocshmem_ctx_double_max_to_all(ROCSHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, npes, pWrk4, pSync); - // roc_shmem_ctx_longdouble_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst5, src5, N, + // rocshmem_ctx_longdouble_max_to_all(ROCSHMEM_CTX_DEFAULT, dst5, src5, N, // 0, 0, npes, pWrk5, pSync1); - roc_shmem_ctx_longlong_max_to_all(ROC_SHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, + rocshmem_ctx_longlong_max_to_all(ROCSHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, npes, pWrk6, pSync); if (me == 0) { @@ -153,54 +153,54 @@ int max_to_all(int me, int npes) { } if (ok[0] == 1) { - printf("Reduction operation roc_shmem_short_max_to_all: Failed\n"); + printf("Reduction operation rocshmem_short_max_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_short_max_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_short_max_to_all: 
Passed\n"); pass++; } if (ok[1] == 1) { - printf("Reduction operation roc_shmem_int_max_to_all: Failed\n"); + printf("Reduction operation rocshmem_int_max_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_int_max_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_int_max_to_all: Passed\n"); pass++; } if (ok[2] == 1) { - printf("Reduction operation roc_shmem_long_max_to_all: Failed\n"); + printf("Reduction operation rocshmem_long_max_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_long_max_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_long_max_to_all: Passed\n"); pass++; } if (ok[3] == 1) { - printf("Reduction operation roc_shmem_float_max_to_all: Failed\n"); + printf("Reduction operation rocshmem_float_max_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_float_max_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_float_max_to_all: Passed\n"); pass++; } if (ok[4] == 1) { - printf("Reduction operation roc_shmem_double_max_to_all: Failed\n"); + printf("Reduction operation rocshmem_double_max_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_double_max_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_double_max_to_all: Passed\n"); pass++; } /* if(ok[5]==1){ - printf("Reduction operation roc_shmem_longdouble_max_to_all: Failed\n"); + printf("Reduction operation rocshmem_longdouble_max_to_all: Failed\n"); } else{ - Vprintf("Reduction operation roc_shmem_longdouble_max_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longdouble_max_to_all: Passed\n"); pass++; } */ pass++; if (ok[6] == 1) { - printf("Reduction operation roc_shmem_longlong_max_to_all: Failed\n"); + printf("Reduction operation rocshmem_longlong_max_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_longlong_max_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longlong_max_to_all: Passed\n"); pass++; } Vprintf("\n"); } - if (Serialize) 
roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); return (pass == 7 ? 1 : 0); } @@ -222,21 +222,21 @@ int min_to_all(int me, int npes) { dst6[i] = -9; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_short_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, + rocshmem_ctx_short_min_to_all(ROCSHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, npes, pWrk0, pSync); - roc_shmem_ctx_int_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, + rocshmem_ctx_int_min_to_all(ROCSHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, pWrk1, pSync1); - roc_shmem_ctx_long_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, + rocshmem_ctx_long_min_to_all(ROCSHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, npes, pWrk2, pSync); - roc_shmem_ctx_float_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, + rocshmem_ctx_float_min_to_all(ROCSHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, npes, pWrk3, pSync1); - roc_shmem_ctx_double_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, + rocshmem_ctx_double_min_to_all(ROCSHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, npes, pWrk4, pSync); - // roc_shmem_ctx_longdouble_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst5, src5, N, + // rocshmem_ctx_longdouble_min_to_all(ROCSHMEM_CTX_DEFAULT, dst5, src5, N, // 0, 0, npes, pWrk5, pSync1); - roc_shmem_ctx_longlong_min_to_all(ROC_SHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, + rocshmem_ctx_longlong_min_to_all(ROCSHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, npes, pWrk6, pSync); if (me == 0) { @@ -250,54 +250,54 @@ int min_to_all(int me, int npes) { if (dst6[i] != i) ok[6] = 1; } if (ok[0] == 1) { - printf("Reduction operation roc_shmem_short_min_to_all: Failed\n"); + printf("Reduction operation rocshmem_short_min_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_short_min_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_short_min_to_all: Passed\n"); pass++; } if (ok[1] == 1) { - printf("Reduction operation roc_shmem_int_min_to_all: Failed\n"); + printf("Reduction operation 
rocshmem_int_min_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_int_min_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_int_min_to_all: Passed\n"); pass++; } if (ok[2] == 1) { - printf("Reduction operation roc_shmem_long_min_to_all: Failed\n"); + printf("Reduction operation rocshmem_long_min_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_long_min_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_long_min_to_all: Passed\n"); pass++; } if (ok[3] == 1) { - printf("Reduction operation roc_shmem_float_min_to_all: Failed\n"); + printf("Reduction operation rocshmem_float_min_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_float_min_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_float_min_to_all: Passed\n"); pass++; } if (ok[4] == 1) { - printf("Reduction operation roc_shmem_double_min_to_all: Failed\n"); + printf("Reduction operation rocshmem_double_min_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_double_min_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_double_min_to_all: Passed\n"); pass++; } /* if(ok[5]==1){ - printf("Reduction operation roc_shmem_longdouble_min_to_all: Failed\n"); + printf("Reduction operation rocshmem_longdouble_min_to_all: Failed\n"); } else{ - Vprintf("Reduction operation roc_shmem_longdouble_min_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longdouble_min_to_all: Passed\n"); pass++; } */ pass++; if (ok[6] == 1) { - printf("Reduction operation roc_shmem_longlong_min_to_all: Failed\n"); + printf("Reduction operation rocshmem_longlong_min_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_longlong_min_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longlong_min_to_all: Passed\n"); pass++; } Vprintf("\n"); } - if (Serialize) roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); return (pass == 7 ? 
1 : 0); } @@ -318,21 +318,21 @@ int sum_to_all(int me, int npes) { dst6[i] = -9; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_short_sum_to_all(ROC_SHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, + rocshmem_ctx_short_sum_to_all(ROCSHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, npes, pWrk0, pSync); - roc_shmem_ctx_int_sum_to_all(ROC_SHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, + rocshmem_ctx_int_sum_to_all(ROCSHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, pWrk1, pSync1); - roc_shmem_ctx_long_sum_to_all(ROC_SHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, + rocshmem_ctx_long_sum_to_all(ROCSHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, npes, pWrk2, pSync); - roc_shmem_ctx_float_sum_to_all(ROC_SHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, + rocshmem_ctx_float_sum_to_all(ROCSHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, npes, pWrk3, pSync1); - roc_shmem_ctx_double_sum_to_all(ROC_SHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, + rocshmem_ctx_double_sum_to_all(ROCSHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, npes, pWrk4, pSync); - // roc_shmem_ctx_longdouble_sum_to_all(ROC_SHMEM_CTX_DEFAULT, dst5, src5, N, + // rocshmem_ctx_longdouble_sum_to_all(ROCSHMEM_CTX_DEFAULT, dst5, src5, N, // 0, 0, npes, pWrk5, pSync1); - roc_shmem_ctx_longlong_sum_to_all(ROC_SHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, + rocshmem_ctx_longlong_sum_to_all(ROCSHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, npes, pWrk6, pSync); if (me == 0) { @@ -346,55 +346,55 @@ int sum_to_all(int me, int npes) { if (dst6[i] != (long long)(npes * (npes - 1) / 2)) ok[6] = 1; } if (ok[0] == 1) { - printf("Reduction operation roc_shmem_short_sum_to_all: Failed\n"); + printf("Reduction operation rocshmem_short_sum_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_short_sum_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_short_sum_to_all: Passed\n"); pass++; } if (ok[1] == 1) { - printf("Reduction operation roc_shmem_int_sum_to_all: Failed\n"); + printf("Reduction operation rocshmem_int_sum_to_all: Failed\n"); } else { - 
Vprintf("Reduction operation roc_shmem_int_sum_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_int_sum_to_all: Passed\n"); pass++; } if (ok[2] == 1) { - printf("Reduction operation roc_shmem_long_sum_to_all: Failed\n"); + printf("Reduction operation rocshmem_long_sum_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_long_sum_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_long_sum_to_all: Passed\n"); pass++; } if (ok[3] == 1) { - printf("Reduction operation roc_shmem_float_sum_to_all: Failed\n"); + printf("Reduction operation rocshmem_float_sum_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_float_sum_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_float_sum_to_all: Passed\n"); pass++; } if (ok[4] == 1) { - printf("Reduction operation roc_shmem_double_sum_to_all: Failed\n"); + printf("Reduction operation rocshmem_double_sum_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_double_sum_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_double_sum_to_all: Passed\n"); pass++; } /* if(ok[5]==1){ - printf("Reduction operation roc_shmem_longdouble_sum_to_all: Failed\n"); + printf("Reduction operation rocshmem_longdouble_sum_to_all: Failed\n"); } else{ - Vprintf("Reduction operation roc_shmem_longdouble_sum_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longdouble_sum_to_all: Passed\n"); pass++; } */ pass++; if (ok[6] == 1) { - printf("Reduction operation roc_shmem_longlong_sum_to_all: Failed\n"); + printf("Reduction operation rocshmem_longlong_sum_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_longlong_sum_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longlong_sum_to_all: Passed\n"); pass++; } Vprintf("\n"); fflush(stdout); } - if (Serialize) roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); return (pass == 7 ? 
1 : 0); } @@ -409,15 +409,15 @@ int and_to_all(int me, int num_pes) { dst0[i] = dst1[i] = dst2[i] = dst6[i] = -9; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_short_and_to_all(ROC_SHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, + rocshmem_ctx_short_and_to_all(ROCSHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, num_pes, pWrk0, pSync); - roc_shmem_ctx_int_and_to_all(ROC_SHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, + rocshmem_ctx_int_and_to_all(ROCSHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, num_pes, pWrk1, pSync1); - roc_shmem_ctx_long_and_to_all(ROC_SHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, + rocshmem_ctx_long_and_to_all(ROCSHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, num_pes, pWrk2, pSync); - roc_shmem_ctx_longlong_and_to_all(ROC_SHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, + rocshmem_ctx_longlong_and_to_all(ROCSHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, num_pes, pWrk6, pSync1); if (me == 0) { @@ -429,33 +429,33 @@ int and_to_all(int me, int num_pes) { } if (ok[0] == 1) { - printf("Reduction operation roc_shmem_short_and_to_all: Failed\n"); + printf("Reduction operation rocshmem_short_and_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_short_and_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_short_and_to_all: Passed\n"); pass++; } if (ok[1] == 1) { - printf("Reduction operation roc_shmem_int_and_to_all: Failed\n"); + printf("Reduction operation rocshmem_int_and_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_int_and_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_int_and_to_all: Passed\n"); pass++; } if (ok[2] == 1) { - printf("Reduction operation roc_shmem_long_and_to_all: Failed\n"); + printf("Reduction operation rocshmem_long_and_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_long_and_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_long_and_to_all: Passed\n"); pass++; } if (ok[3] == 1) { - printf("Reduction operation roc_shmem_longlong_and_to_all: Failed\n"); + 
printf("Reduction operation rocshmem_longlong_and_to_all: Failed\n"); } else { - Vprintf("Reduction operation roc_shmem_longlong_and_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longlong_and_to_all: Passed\n"); pass++; } Vprintf("\n"); fflush(stdout); } - if (Serialize) roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); return (pass == 4 ? 1 : 0); } @@ -503,21 +503,21 @@ int prod_to_all(int me, int npes) { expected_result6 *= i; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_short_prod_to_all(ROC_SHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, + rocshmem_ctx_short_prod_to_all(ROCSHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, npes, pWrk0, pSync); - roc_shmem_ctx_int_prod_to_all(ROC_SHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, + rocshmem_ctx_int_prod_to_all(ROCSHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, pWrk1, pSync1); - roc_shmem_ctx_long_prod_to_all(ROC_SHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, + rocshmem_ctx_long_prod_to_all(ROCSHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, npes, pWrk2, pSync); - roc_shmem_ctx_float_prod_to_all(ROC_SHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, + rocshmem_ctx_float_prod_to_all(ROCSHMEM_CTX_DEFAULT, dst3, src3, N, 0, 0, npes, pWrk3, pSync1); - roc_shmem_ctx_double_prod_to_all(ROC_SHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, + rocshmem_ctx_double_prod_to_all(ROCSHMEM_CTX_DEFAULT, dst4, src4, N, 0, 0, npes, pWrk4, pSync); - // roc_shmem_ctx_longdouble_prod_to_all(ROC_SHMEM_CTX_DEFAULT, dst5, src5, N, + // rocshmem_ctx_longdouble_prod_to_all(ROCSHMEM_CTX_DEFAULT, dst5, src5, N, // 0, 0, npes, pWrk5, pSync1); - roc_shmem_ctx_longlong_prod_to_all(ROC_SHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, + rocshmem_ctx_longlong_prod_to_all(ROCSHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, npes, pWrk6, pSync); if (me == 0) { @@ -547,62 +547,62 @@ int prod_to_all(int me, int npes) { } if (ok[0] == 1) - printf("Reduction operation roc_shmem_short_prod_to_all: Failed\n"); + printf("Reduction operation rocshmem_short_prod_to_all: Failed\n"); else { 
- Vprintf("Reduction operation roc_shmem_short_prod_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_short_prod_to_all: Passed\n"); pass++; } if (ok[1] == 1) - printf("Reduction operation roc_shmem_int_prod_to_all: Failed\n"); + printf("Reduction operation rocshmem_int_prod_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_int_prod_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_int_prod_to_all: Passed\n"); pass++; } if (ok[2] == 1) - printf("Reduction operation roc_shmem_long_prod_to_all: Failed\n"); + printf("Reduction operation rocshmem_long_prod_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_long_prod_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_long_prod_to_all: Passed\n"); pass++; } if (ok[3] == 1) - printf("Reduction operation roc_shmem_float_prod_to_all: Failed\n"); + printf("Reduction operation rocshmem_float_prod_to_all: Failed\n"); else { if (float_rounding_err) { Vprintf( - "Reduction operation roc_shmem_float_prod_to_all: skipped due to " + "Reduction operation rocshmem_float_prod_to_all: skipped due to " "float rounding error\n"); } else { - Vprintf("Reduction operation roc_shmem_float_prod_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_float_prod_to_all: Passed\n"); } pass++; } if (ok[4] == 1) - printf("Reduction operation roc_shmem_double_prod_to_all: Failed\n"); + printf("Reduction operation rocshmem_double_prod_to_all: Failed\n"); else { if (double_rounding_err) { Vprintf( - "Reduction operation roc_shmem_double_prod_to_all: skipped due to " + "Reduction operation rocshmem_double_prod_to_all: skipped due to " "double rounding error\n"); } else { - Vprintf("Reduction operation roc_shmem_double_prod_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_double_prod_to_all: Passed\n"); } pass++; } /* if(ok[5]==1) - printf("Reduction operation roc_shmem_longdouble_prod_to_all: Failed\n"); + printf("Reduction operation rocshmem_longdouble_prod_to_all: 
Failed\n"); else { if (double_rounding_err) { - Vprintf("Reduction operation roc_shmem_longdouble_prod_to_all: skipped + Vprintf("Reduction operation rocshmem_longdouble_prod_to_all: skipped due to long double rounding error\n"); } else { - Vprintf("Reduction operation roc_shmem_longdouble_prod_to_all: + Vprintf("Reduction operation rocshmem_longdouble_prod_to_all: Passed\n"); } pass++; @@ -611,14 +611,14 @@ int prod_to_all(int me, int npes) { pass++; if (ok[6] == 1) - printf("Reduction operation roc_shmem_longlong_prod_to_all: Failed\n"); + printf("Reduction operation rocshmem_longlong_prod_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_longlong_prod_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longlong_prod_to_all: Passed\n"); pass++; } Vprintf("\n"); } - if (Serialize) roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); return (pass == 7 ? 1 : 0); } @@ -636,15 +636,15 @@ int or_to_all(int me, int npes) { dst6[i] = -9; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_short_or_to_all(ROC_SHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, + rocshmem_ctx_short_or_to_all(ROCSHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, npes, pWrk0, pSync); - roc_shmem_ctx_int_or_to_all(ROC_SHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, + rocshmem_ctx_int_or_to_all(ROCSHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, pWrk1, pSync1); - roc_shmem_ctx_long_or_to_all(ROC_SHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, npes, + rocshmem_ctx_long_or_to_all(ROCSHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, npes, pWrk2, pSync); - roc_shmem_ctx_longlong_or_to_all(ROC_SHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, + rocshmem_ctx_longlong_or_to_all(ROCSHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, npes, pWrk6, pSync1); if (me == 0) { @@ -658,35 +658,35 @@ int or_to_all(int me, int npes) { } if (ok[0] == 1) - printf("Reduction operation roc_shmem_short_or_to_all: Failed\n"); + printf("Reduction operation rocshmem_short_or_to_all: Failed\n"); else { - Vprintf("Reduction 
operation roc_shmem_short_or_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_short_or_to_all: Passed\n"); pass++; } if (ok[1] == 1) - printf("Reduction operation roc_shmem_int_or_to_all: Failed\n"); + printf("Reduction operation rocshmem_int_or_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_int_or_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_int_or_to_all: Passed\n"); pass++; } if (ok[2] == 1) - printf("Reduction operation roc_shmem_long_or_to_all: Failed\n"); + printf("Reduction operation rocshmem_long_or_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_long_or_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_long_or_to_all: Passed\n"); pass++; } if (ok[6] == 1) - printf("Reduction operation roc_shmem_longlong_or_to_all: Failed\n"); + printf("Reduction operation rocshmem_longlong_or_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_longlong_or_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longlong_or_to_all: Passed\n"); pass++; } Vprintf("\n"); } - if (Serialize) roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); return (pass == 4 ? 
1 : 0); } @@ -705,15 +705,15 @@ int xor_to_all(int me, int npes) { dst6[i] = -9; } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); - roc_shmem_ctx_short_xor_to_all(ROC_SHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, + rocshmem_ctx_short_xor_to_all(ROCSHMEM_CTX_DEFAULT, dst0, src0, N, 0, 0, npes, pWrk0, pSync); - roc_shmem_ctx_int_xor_to_all(ROC_SHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, + rocshmem_ctx_int_xor_to_all(ROCSHMEM_CTX_DEFAULT, dst1, src1, N, 0, 0, npes, pWrk1, pSync1); - roc_shmem_ctx_long_xor_to_all(ROC_SHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, + rocshmem_ctx_long_xor_to_all(ROCSHMEM_CTX_DEFAULT, dst2, src2, N, 0, 0, npes, pWrk2, pSync); - roc_shmem_ctx_longlong_xor_to_all(ROC_SHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, + rocshmem_ctx_longlong_xor_to_all(ROCSHMEM_CTX_DEFAULT, dst6, src6, N, 0, 0, npes, pWrk6, pSync1); if (me == 0) { @@ -725,36 +725,36 @@ int xor_to_all(int me, int npes) { } if (ok[0] == 1) - printf("Reduction operation roc_shmem_short_xor_to_all: Failed\n"); + printf("Reduction operation rocshmem_short_xor_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_short_xor_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_short_xor_to_all: Passed\n"); pass++; } if (ok[1] == 1) - printf("Reduction operation roc_shmem_int_xor_to_all: Failed\n"); + printf("Reduction operation rocshmem_int_xor_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_int_xor_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_int_xor_to_all: Passed\n"); pass++; } if (ok[2] == 1) - printf("Reduction operation roc_shmem_long_xor_to_all: Failed\n"); + printf("Reduction operation rocshmem_long_xor_to_all: Failed\n"); else { - Vprintf("Reduction operation roc_shmem_long_xor_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_long_xor_to_all: Passed\n"); pass++; } if (ok[6] == 1) - printf("Reduction operation roc_shmem_longlong_xor_to_all: Failed\n"); + printf("Reduction operation rocshmem_longlong_xor_to_all: Failed\n"); 
else { - Vprintf("Reduction operation roc_shmem_longlong_xor_to_all: Passed\n"); + Vprintf("Reduction operation rocshmem_longlong_xor_to_all: Passed\n"); pass++; } Vprintf("\n"); } - if (Serialize) roc_shmem_barrier_all(); + if (Serialize) rocshmem_barrier_all(); return (pass == 4 ? 1 : 0); } @@ -763,9 +763,9 @@ int main(int argc, char *argv[]) { int c, i, mype, num_pes, tests, passed; char *pgm; - roc_shmem_init(); - mype = roc_shmem_my_pe(); - num_pes = roc_shmem_n_pes(); + rocshmem_init(); + mype = rocshmem_my_pe(); + num_pes = rocshmem_n_pes(); if ((pgm = strrchr(argv[0], '/'))) { pgm++; @@ -802,58 +802,58 @@ int main(int argc, char *argv[]) { case 'h': default: Rfprintf(stderr, "usage: %s {-v(verbose)|h(help)}\n", pgm); - roc_shmem_finalize(); + rocshmem_finalize(); return 1; } } tests = passed = 0; - pSync = (long *)roc_shmem_malloc(ROC_SHMEM_BCAST_SYNC_SIZE * sizeof(long)); - pSync1 = (long *)roc_shmem_malloc(ROC_SHMEM_BCAST_SYNC_SIZE * sizeof(long)); + pSync = (long *)rocshmem_malloc(ROCSHMEM_BCAST_SYNC_SIZE * sizeof(long)); + pSync1 = (long *)rocshmem_malloc(ROCSHMEM_BCAST_SYNC_SIZE * sizeof(long)); if (!pSync || !pSync1) { fprintf(stderr, "ERR: cannot allocate one of the pSync arrays\n"); } - for (i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; - pSync1[i] = ROC_SHMEM_SYNC_VALUE; + for (i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) { + pSync[i] = ROCSHMEM_SYNC_VALUE; + pSync1[i] = ROCSHMEM_SYNC_VALUE; } - pWrk0 = (short *)roc_shmem_malloc(WRK_SIZE * sizeof(short)); - pWrk1 = (int *)roc_shmem_malloc(WRK_SIZE * sizeof(int)); - pWrk2 = (long *)roc_shmem_malloc(WRK_SIZE * sizeof(long)); - pWrk3 = (float *)roc_shmem_malloc(WRK_SIZE * sizeof(float)); - pWrk4 = (double *)roc_shmem_malloc(WRK_SIZE * sizeof(double)); - pWrk5 = (long double *)roc_shmem_malloc(WRK_SIZE * sizeof(long double)); - pWrk6 = (long long *)roc_shmem_malloc(WRK_SIZE * sizeof(long long)); + pWrk0 = (short *)rocshmem_malloc(WRK_SIZE * sizeof(short)); + pWrk1 = 
(int *)rocshmem_malloc(WRK_SIZE * sizeof(int)); + pWrk2 = (long *)rocshmem_malloc(WRK_SIZE * sizeof(long)); + pWrk3 = (float *)rocshmem_malloc(WRK_SIZE * sizeof(float)); + pWrk4 = (double *)rocshmem_malloc(WRK_SIZE * sizeof(double)); + pWrk5 = (long double *)rocshmem_malloc(WRK_SIZE * sizeof(long double)); + pWrk6 = (long long *)rocshmem_malloc(WRK_SIZE * sizeof(long long)); if (!pWrk0 || !pWrk1 || !pWrk2 || !pWrk3 || !pWrk4 || !pWrk5 || !pWrk6) { fprintf(stderr, "ERR: cannot allocate one of the pWrk arrays\n"); } - src0 = (short *)roc_shmem_malloc(N * sizeof(short)); - src1 = (int *)roc_shmem_malloc(N * sizeof(int)); - src2 = (long *)roc_shmem_malloc(N * sizeof(long)); - src3 = (float *)roc_shmem_malloc(N * sizeof(float)); - src4 = (double *)roc_shmem_malloc(N * sizeof(double)); - src5 = (long double *)roc_shmem_malloc(N * sizeof(long double)); - src6 = (long long *)roc_shmem_malloc(N * sizeof(long long)); + src0 = (short *)rocshmem_malloc(N * sizeof(short)); + src1 = (int *)rocshmem_malloc(N * sizeof(int)); + src2 = (long *)rocshmem_malloc(N * sizeof(long)); + src3 = (float *)rocshmem_malloc(N * sizeof(float)); + src4 = (double *)rocshmem_malloc(N * sizeof(double)); + src5 = (long double *)rocshmem_malloc(N * sizeof(long double)); + src6 = (long long *)rocshmem_malloc(N * sizeof(long long)); if (!src0 || !src1 || !src2 || !src3 || !src4 || !src5 || !src6) { fprintf(stderr, "ERR: cannot allocate one of the src arrays\n"); } - dst0 = (short *)roc_shmem_malloc(N * sizeof(short)); - dst1 = (int *)roc_shmem_malloc(N * sizeof(int)); - dst2 = (long *)roc_shmem_malloc(N * sizeof(long)); - dst3 = (float *)roc_shmem_malloc(N * sizeof(float)); - dst4 = (double *)roc_shmem_malloc(N * sizeof(double)); - dst5 = (long double *)roc_shmem_malloc(N * sizeof(long double)); - dst6 = (long long *)roc_shmem_malloc(N * sizeof(long long)); + dst0 = (short *)rocshmem_malloc(N * sizeof(short)); + dst1 = (int *)rocshmem_malloc(N * sizeof(int)); + dst2 = (long *)rocshmem_malloc(N * 
sizeof(long)); + dst3 = (float *)rocshmem_malloc(N * sizeof(float)); + dst4 = (double *)rocshmem_malloc(N * sizeof(double)); + dst5 = (long double *)rocshmem_malloc(N * sizeof(long double)); + dst6 = (long long *)rocshmem_malloc(N * sizeof(long long)); if (!dst0 || !dst1 || !dst2 || !dst3 || !dst4 || !dst5 || !dst6) { fprintf(stderr, "ERR: cannot allocate one of the dst arrays\n"); } - roc_shmem_barrier_all(); + rocshmem_barrier_all(); passed += max_to_all(mype, num_pes); tests++; @@ -896,34 +896,34 @@ int main(int argc, char *argv[]) { c = (tests == passed ? 0 : 1); } - roc_shmem_free(pSync); - roc_shmem_free(pSync1); + rocshmem_free(pSync); + rocshmem_free(pSync1); - roc_shmem_free(pWrk0); - roc_shmem_free(pWrk1); - roc_shmem_free(pWrk2); - roc_shmem_free(pWrk3); - roc_shmem_free(pWrk4); - roc_shmem_free(pWrk5); - roc_shmem_free(pWrk6); + rocshmem_free(pWrk0); + rocshmem_free(pWrk1); + rocshmem_free(pWrk2); + rocshmem_free(pWrk3); + rocshmem_free(pWrk4); + rocshmem_free(pWrk5); + rocshmem_free(pWrk6); - roc_shmem_free(src0); - roc_shmem_free(src1); - roc_shmem_free(src2); - roc_shmem_free(src3); - roc_shmem_free(src4); - roc_shmem_free(src5); - roc_shmem_free(src6); + rocshmem_free(src0); + rocshmem_free(src1); + rocshmem_free(src2); + rocshmem_free(src3); + rocshmem_free(src4); + rocshmem_free(src5); + rocshmem_free(src6); - roc_shmem_free(dst0); - roc_shmem_free(dst1); - roc_shmem_free(dst2); - roc_shmem_free(dst3); - roc_shmem_free(dst4); - roc_shmem_free(dst5); - roc_shmem_free(dst6); + rocshmem_free(dst0); + rocshmem_free(dst1); + rocshmem_free(dst2); + rocshmem_free(dst3); + rocshmem_free(dst4); + rocshmem_free(dst5); + rocshmem_free(dst6); - roc_shmem_finalize(); + rocshmem_finalize(); return c; } diff --git a/tests/sos_tests/waituntil.cpp b/tests/sos_tests/waituntil.cpp index ff837e008c..2b18655ab9 100644 --- a/tests/sos_tests/waituntil.cpp +++ b/tests/sos_tests/waituntil.cpp @@ -30,7 +30,7 @@ */ /* - * exercise roc_shmem_short_wait() and 
roc_shmem_short_wait_until() functions. + * exercise rocshmem_short_wait() and rocshmem_short_wait_until() functions. */ #include @@ -38,17 +38,17 @@ #include #include -#include +#include using namespace rocshmem; #define DataType long -#define SHM_PUT roc_shmem_long_put -#define SHM_PUTP roc_shmem_long_p -#define SHM_GETP roc_shmem_long_g +#define SHM_PUT rocshmem_long_put +#define SHM_PUTP rocshmem_long_p +#define SHM_GETP rocshmem_long_g -#define SHM_WAITU roc_shmem_long_wait_until +#define SHM_WAITU rocshmem_long_wait_until #define PF "%ld" #define Vprintf \ @@ -65,29 +65,29 @@ int main(int argc, char *argv[]) { Verbose++; } - roc_shmem_init(); - me = roc_shmem_my_pe(); - num_pes = roc_shmem_n_pes(); + rocshmem_init(); + me = rocshmem_my_pe(); + num_pes = rocshmem_n_pes(); if (num_pes == 1) { printf("%s: Requires number of PEs > 1\n", argv[0]); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; } - target = (DataType *)roc_shmem_malloc(10 * sizeof(DataType)); + target = (DataType *)rocshmem_malloc(10 * sizeof(DataType)); - pong = (DataType *)roc_shmem_malloc(sizeof(DataType)); + pong = (DataType *)rocshmem_malloc(sizeof(DataType)); *pong = 666; - t2 = (DataType *)roc_shmem_malloc(10 * sizeof(DataType)); + t2 = (DataType *)rocshmem_malloc(10 * sizeof(DataType)); if (!t2) { - if (me == 0) printf("roc_shmem_malloc() failed?\n"); - roc_shmem_global_exit(1); + if (me == 0) printf("rocshmem_malloc() failed?\n"); + rocshmem_global_exit(1); } t2[9] = target[9] = 0xFF; - roc_shmem_barrier_all(); + rocshmem_barrier_all(); if (me == 0) { memset(target, 0, 10 * sizeof(DataType)); @@ -96,17 +96,17 @@ int main(int argc, char *argv[]) { for (pe = 1; pe < num_pes; pe++) /* put 10 elements into target on PE 1 */ SHM_PUT(target, source, 10, pe); - SHM_WAITU(pong, ROC_SHMEM_CMP_GT, 666); + SHM_WAITU(pong, ROCSHMEM_CMP_GT, 666); Vprintf("PE[%d] pong now " PF "\n", me, *pong); for (pe = 1; pe < num_pes; pe++) /* put 1 element into t2 on PE 1 */ SHM_PUTP(&t2[9], 0xDD, pe); } 
else { /* wait for 10th element write of 'target' */ - SHM_WAITU(&target[9], ROC_SHMEM_CMP_NE, 0xFF); + SHM_WAITU(&target[9], ROCSHMEM_CMP_NE, 0xFF); Vprintf("PE[%d] target[9] was 255 now " PF ", success.\n", me, target[9]); - SHM_WAITU(&target[9], ROC_SHMEM_CMP_EQ, 10); + SHM_WAITU(&target[9], ROCSHMEM_CMP_EQ, 10); Vprintf("PE[%d] expected target[9] == 10 now " PF "\n", me, target[9]); if (me == 1) { @@ -117,10 +117,10 @@ int main(int argc, char *argv[]) { SHM_PUTP(pong, 999, 0); } - SHM_WAITU(&t2[9], ROC_SHMEM_CMP_NE, 0xFF); + SHM_WAITU(&t2[9], ROCSHMEM_CMP_NE, 0xFF); } - // roc_shmem_barrier_all(); /* sync sender and receiver */ + // rocshmem_barrier_all(); /* sync sender and receiver */ if (me != 0) { if (memcmp(source, target, sizeof(DataType) * 10) != 0) { @@ -130,13 +130,13 @@ int main(int argc, char *argv[]) { printf(PF "," PF " ", source[i], target[i]); } printf("\n"); - roc_shmem_global_exit(1); + rocshmem_global_exit(1); } } - roc_shmem_free(t2); + rocshmem_free(t2); - if (Verbose) fprintf(stderr, "[%d] exit\n", roc_shmem_my_pe()); + if (Verbose) fprintf(stderr, "[%d] exit\n", rocshmem_my_pe()); - roc_shmem_finalize(); + rocshmem_finalize(); return 0; }