Use new naming scheme

This commit is contained in:
Brandon Potter
2024-11-25 14:12:15 -06:00
parent 308816bc5e
commit fd8dbc7fb6
179 changed files with 5250 additions and 5251 deletions
+2 -2
View File
@@ -1,4 +1,4 @@
# This is the list of ROCSHMEM's significant contributors.
# This is the list of rocSHMEM's significant contributors.
#
# This does not necessarily list everyone who has contributed code,
# especially since many employees of one corporation may be contributing.
@@ -10,4 +10,4 @@ Michael LeBeane
Rohit Zambre
Kishore Punniyamurthy
Ruchi Shah
Muhammad A. Awad
Muhammad A. Awad
+3 -3
View File
@@ -137,8 +137,8 @@ set(
rocshmem
VERSION ${ROCSHMEM_VERSION}
LANGUAGES CXX
DESCRIPTION "ROCSHMEM"
HOMEPAGE_URL "https://github.com/ROCm-Developer-Tools/ROC_SHMEM")
DESCRIPTION "rocSHMEM"
HOMEPAGE_URL "https://github.com/ROCm-Developer-Tools/rocSHMEM")
###############################################################################
# DEFAULT BUILD TYPE
@@ -190,7 +190,7 @@ set_target_properties(
${PROJECT_NAME}
PROPERTIES
PUBLIC_HEADER
"${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/roc_shmem/roc_shmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/roc_shmem/debug.hpp"
"${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp"
)
###############################################################################
+8 -8
View File
@@ -1,11 +1,11 @@
## How to fork from us
To keep our development fast and conflict free, we recommend you to [fork](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/fork) our repository and start your work from our `dev` branch in your private repository.
To keep our development fast and conflict free, we recommend you to [fork](https://github.com/ROCm-Developer-Tools/rocSHMEM/fork) our repository and start your work from our `dev` branch in your private repository.
Afterwards, git clone your repository to your local machine. But that is not it! To keep track of the original develop repository, add it as another remote.
```
git remote add mainline https://github.com/ROCm-Developer-Tools/ROC_SHMEM.git
git remote add mainline https://github.com/ROCm-Developer-Tools/rocSHMEM.git
git checkout dev
```
@@ -17,23 +17,23 @@ git checkout -b topic-<yourFeatureName>
and apply your changes there.
## How to contribute to ROCSHMEM
## How to contribute to rocSHMEM
### Did you find a bug?
- Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/issues).
- Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/ROCm-Developer-Tools/rocSHMEM/issues).
- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/issues/new).
- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ROCm-Developer-Tools/rocSHMEM/issues/new).
### Did you write a patch that fixes a bug?
- Open a new GitHub [pull request](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/compare) with the patch.
- Open a new GitHub [pull request](https://github.com/ROCm-Developer-Tools/rocSHMEM/compare) with the patch.
- Ensure the PR description clearly describes the problem and solution. If there is an existing GitHub issue open describing this bug, please include it in the description so we can close it.
- Ensure the PR is based on the `dev` branch of the ROCSHMEM GitHub repository.
- Ensure the PR is based on the `dev` branch of the rocSHMEM GitHub repository.
- ROCSHMEM requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the projects's [Developer's Certificate of Origin](https://developercertificate.org/) and compatability with the project [LICENSE](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/blob/main/LICENSE):
- rocSHMEM requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the projects's [Developer's Certificate of Origin](https://developercertificate.org/) and compatability with the project [LICENSE](https://github.com/ROCm-Developer-Tools/rocSHMEM/blob/main/LICENSE):
> (a) The contribution was created in whole or in part by me and I
+34 -34
View File
@@ -1,19 +1,19 @@
# ROCm OpenSHMEM (ROC_SHMEM)
# ROCm OpenSHMEM (rocSHMEM)
The ROCm OpenSHMEM (ROC_SHMEM) runtime is part of an AMD Research
The ROCm OpenSHMEM (rocSHMEM) runtime is part of an AMD Research
initiative to provide a unified runtime for heterogeneous systems.
ROC_SHMEM supports both host-centric (a traditional host-driven
rocSHMEM supports both host-centric (a traditional host-driven
OpenSHMEM runtime) and GPU-centric networking (provided a GPU kernel
the ability to perform network operations) through an
OpenSHMEM-like interface. This intra-kernel networking simplifies application
code complexity and enables more fine-grained communication/computation
overlap than traditional host-driven networking.
ROC_SHMEM's primary target is heterogeneous computing; hence, for both
CPU-centric and GPU-centric communications, ROC_SHMEM uses a single
rocSHMEM's primary target is heterogeneous computing; hence, for both
CPU-centric and GPU-centric communications, rocSHMEM uses a single
symmetric heap (SHEAP) that is allocated on GPU memories.
ROC_SHMEM's GPU-centric communication has two different backend designs.
rocSHMEM's GPU-centric communication has two different backend designs.
The backends primarily differ in their implementations of
intra-kernel networking.
@@ -24,23 +24,23 @@ the doorbell on the NIC to send network commands. GPU-IB is the default and
preferred backend design that offers the best performance.
The second design will be referred to as the Reverse Offload (RO) backend. With
the RO backend, the GPU runtime forwards ROC_SHMEM networking operations to the
the RO backend, the GPU runtime forwards rocSHMEM networking operations to the
host-side runtime, which calls into a traditional MPI or OpenSHMEM
implementation. This forwarding of requests is transparent to the
programmer, who only sees the GPU-side interface.
Both designs of the GPU-centric interface coexist seamlessly with the
CPU-centric interface of the unified runtime. ROC_SHMEM ensures that CPU-centric
CPU-centric interface of the unified runtime. rocSHMEM ensures that CPU-centric
updates to the SHEAP are consistent and visible to a GPU kernel that is executing
in parallel to host-initiated communication.
## Limitations
ROC_SHMEM is an experimental prototype from AMD Research and not an official
rocSHMEM is an experimental prototype from AMD Research and not an official
ROCm product. The software is provided as-is with no guarantees of support
from AMD or AMD Research.
ROC_SHMEM base requirements:
rocSHMEM base requirements:
* ROCm version 4.3.1 onwards
* May work with other versions, but not tested
* AMD GFX9 GPUs (e.g.: MI25, Vega 56, Vega 64, MI50, MI60, MI100, Radeon VII)
@@ -51,16 +51,16 @@ ROC_SHMEM base requirements:
* InfiniBand adaptor compatable with ROCm RDMA technology
* UCX 1.6 or greater with ROCm support
ROC_SHMEM optional requirements
rocSHMEM optional requirements
* For Documentation:
* Doxygen
ROC_SHMEM only supports HIP applications. There are no plans to port to
rocSHMEM only supports HIP applications. There are no plans to port to
OpenCL.
## Building and Installation
ROC_SHMEM uses the CMake build system. The CMakeLists file contains
rocSHMEM uses the CMake build system. The CMakeLists file contains
additional details about library options.
To create an out-of-source build:
@@ -84,34 +84,34 @@ custom install path by supplying it as an argument. For example:
../scripts/build_configs/rc_single /path/to/install
## Compiling/linking and Running with ROC_SHMEM
## Compiling/linking and Running with rocSHMEM
ROC_SHMEM is built as a host and device side library that can be statically
rocSHMEM is built as a host and device side library that can be statically
linked to your application during compilation using hipcc.
During the compilation of your application, include the ROC_SHMEM header files
and the ROC_SHMEM library when using hipcc:
During the compilation of your application, include the rocSHMEM header files
and the rocSHMEM library when using hipcc:
-I/path/to/rocshmem/install/include
-L/path/to/rocshmem/install/lib -lrocshmem
NOTE: ROC_SHMEM depends on MPI for its host code. So, you will need to link
NOTE: rocSHMEM depends on MPI for its host code. So, you will need to link
to an MPI library. Since you must use the hipcc compiler, the arguments for
MPI linkage must be added manually as opposed to using mpicc. Similary,
ROC_SHMEM depends on Verbs for its device code. So, you will need to link
rocSHMEM depends on Verbs for its device code. So, you will need to link
to a Verbs library.
When using hipcc directly (as opposed to through a build system), we
recommend performing the compilation and linking steps separately.
Here are the steps to build a standalone program, say
roc_shmem_hello.cpp.
rocshmem_hello.cpp.
```
# Compile
/opt/rocm/bin/hipcc ./roc_shmem_hello.cpp -I/path/to/rocshmem/install/include -fgpu-rdc -o ./roc_shmem_hello.o -c
/opt/rocm/bin/hipcc ./rocshmem_hello.cpp -I/path/to/rocshmem/install/include -fgpu-rdc -o ./rocshmem_hello.o -c
# Link
/opt/rocm/bin/hipcc ./roc_shmem_hello.o /path/to/rocshmem/install/lib/librocshmem.a -lmpi -lmlx5 -libverbs -lhsa-runtime64 -fgpu-rdc -o roc_shmem_hello
/opt/rocm/bin/hipcc ./rocshmem_hello.o /path/to/rocshmem/install/lib/librocshmem.a -lmpi -lmlx5 -libverbs -lhsa-runtime64 -fgpu-rdc -o rocshmem_hello
```
@@ -122,20 +122,20 @@ page useful.
## Runtime Parameters
ROC_SHMEM_HEAP_SIZE (default : 1 GB)
ROCSHMEM_HEAP_SIZE (default : 1 GB)
Defines the size of the OpenSHMEM symmetric heap
Note the heap is on the GPU memory.
ROC_SHMEM_SQ_SIZE (default 1024)
ROCSHMEM_SQ_SIZE (default 1024)
Defines the size of the SQ as number of network
packet (WQE). Each WQE is 64B. This only for
GPU-IB conduit
ROC_SHMEM_USE_CQ_GPU_MEM (default : 1)
ROCSHMEM_USE_CQ_GPU_MEM (default : 1)
Set the placement of CQ on GPU memory (1)
or CPU memory (0)
ROC_SHMEM_USE_SQ_GPU_MEM (default : 1)
ROCSHMEM_USE_SQ_GPU_MEM (default : 1)
Set the placement of SQ on GPU memory (1)
or CPU memory (0)
@@ -143,13 +143,13 @@ page useful.
Force producer/consumer queues between CPU and GPU to
be in CPU memory. RO backend only.
ROC_SHMEM also requires the following environment variable be set for ROCm:
rocSHMEM also requires the following environment variable be set for ROCm:
export HSA_FORCE_FINE_GRAIN_PCIE=1
## Documentation
To generate doxygen documentation for ROC_SHMEM's API, run the following
To generate doxygen documentation for rocSHMEM's API, run the following
from the library's build directory:
make docs
@@ -158,13 +158,13 @@ The doxygen output will be in the `docs` folder of the build directory.
## Examples
ROC_SHMEM is similar to OpenSHMEM and should be familiar to programmers who
rocSHMEM is similar to OpenSHMEM and should be familiar to programmers who
have experience with OpenSHMEM or other PGAS network programming APIs in the
context of CPUs. The best way to learn how to use ROC_SHMEM is to read the
context of CPUs. The best way to learn how to use rocSHMEM is to read the
autogenerated doxygen documentation for functions described in
`roc_shmem/roc_shmem.hpp`, or to look at the provided sample applications in the
`tests/` folder. ROC_SHMEM is shipped with a basic test suite for the
supported ROC_SHMEM API. The examples test Puts, Gets, nonblocking Puts,
`rocshmem/rocshmem.hpp`, or to look at the provided sample applications in the
`tests/` folder. rocSHMEM is shipped with a basic test suite for the
supported rocSHMEM API. The examples test Puts, Gets, nonblocking Puts,
nonblocking Gets, Quiets, Atomics, Tests, Wai-untils, Broadcasts, and
Reductions.
@@ -178,7 +178,7 @@ Here are some example uses of the driver script:
## Building the Dependencies
ROC_SHMEM requires an MPI runtime on the host that supports ROCm-Aware MPI.
rocSHMEM requires an MPI runtime on the host that supports ROCm-Aware MPI.
Currently all ROCm-Aware MPI runtimes require the usage of ROCm-Aware UCX.
To build and configure ROCm-Aware UCX, you need to:
+2 -2
View File
@@ -1,6 +1,6 @@
local help_message = [[
ROC_SHMEM is an open-source GPU initiated networking library
rocSHMEM is an open-source GPU initiated networking library
for High Performance Computing and Machine Learning workloads.
Version @ROCSHMEM_FULL_VERSION@
@@ -12,7 +12,7 @@ whatis("Name: rocshmem")
whatis("Version: @ROCSHMEM_FULL_VERSION@")
whatis("Keywords: GPU, PGAS, RMA, HPC")
whatis("Description: tool for GPU initiated networking")
whatis("URL: https://github.com/ROCm-Developer-Tools/ROC_SHMEM")
whatis("URL: https://github.com/ROCm-Developer-Tools/rocSHMEM")
-- Export environmental variables
local topDir="@CMAKE_INSTALL_PREFIX@"
+1 -1
View File
@@ -33,7 +33,7 @@ NIC (this does occur during MPI_Win_create), we are good.
UCX claims to not support GPU-aware communication because they have
not added in support for the different types of scenarios that could
exist in a system (eg, when a system does not have GPU-direct). The
scope of ROC_SHMEM is currenlty limited to configurations that UCX
scope of rocSHMEM is currently limited to configurations that UCX
already supports.
## But the main branch of MPICH does support HIP now?
+23 -23
View File
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_allreduce_test.o -o rocshmem_allreduce_test
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test
*/
#include <iostream>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#define CHECK_HIP(condition) { \
hipError_t error = condition; \
@@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test
using namespace rocshmem;
__global__ void allreduce_test(int *source, int *dest, size_t nelem,
roc_shmem_team_t team) {
__shared__ roc_shmem_ctx_t ctx;
rocshmem_team_t team) {
__shared__ rocshmem_ctx_t ctx;
int64_t ctx_type = 0;
roc_shmem_wg_init();
roc_shmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = roc_shmem_ctx_n_pes(ctx);
rocshmem_wg_init();
rocshmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = rocshmem_ctx_n_pes(ctx);
roc_shmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
roc_shmem_ctx_quiet(ctx);
rocshmem_ctx_quiet(ctx);
__syncthreads();
roc_shmem_wg_ctx_destroy(&ctx);
roc_shmem_wg_finalize();
rocshmem_wg_ctx_destroy(&ctx);
rocshmem_wg_finalize();
}
static void init_sendbuf (int *source, int nelem, int my_pe)
@@ -82,23 +82,23 @@ int main (int argc, char **argv)
nelem = atoi(argv[1]);
}
int my_pe = roc_shmem_my_pe();
int npes = roc_shmem_n_pes();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int ndevices, my_device = 0;
CHECK_HIP(hipGetDeviceCount(&ndevices));
my_device = my_pe % ndevices;
CHECK_HIP(hipSetDevice(my_device));
roc_shmem_init();
rocshmem_init();
int *source = (int *)roc_shmem_malloc(nelem * sizeof(int));
int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int));
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
if (NULL == source || NULL == dest) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << source << ", dest: " << dest << ", size: "
<< sizeof(int) * nelem << std::endl;
roc_shmem_global_exit(1);
rocshmem_global_exit(1);
}
init_sendbuf(source, nelem, my_pe);
@@ -106,9 +106,9 @@ int main (int argc, char **argv)
dest[i] = -1;
}
roc_shmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID;
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
rocshmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
&team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
@@ -121,9 +121,9 @@ int main (int argc, char **argv)
bool pass = check_recvbuf(dest, nelem, my_pe, npes);
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
roc_shmem_free(source);
roc_shmem_free(dest);
rocshmem_free(source);
rocshmem_free(dest);
roc_shmem_finalize();
rocshmem_finalize();
return 0;
}
+23 -23
View File
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_alltoall_test.o -o rocshmem_alltoall_test \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test
*/
#include <iostream>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#define CHECK_HIP(condition) { \
hipError_t error = condition; \
@@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test
using namespace rocshmem;
__global__ void alltoall_test(int *source, int *dest, size_t nelem,
roc_shmem_team_t team) {
__shared__ roc_shmem_ctx_t ctx;
rocshmem_team_t team) {
__shared__ rocshmem_ctx_t ctx;
int64_t ctx_type = 0;
roc_shmem_wg_init();
roc_shmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = roc_shmem_ctx_n_pes(ctx);
rocshmem_wg_init();
rocshmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = rocshmem_ctx_n_pes(ctx);
roc_shmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
roc_shmem_ctx_quiet(ctx);
rocshmem_ctx_quiet(ctx);
__syncthreads();
roc_shmem_wg_ctx_destroy(&ctx);
roc_shmem_wg_finalize();
rocshmem_wg_ctx_destroy(&ctx);
rocshmem_wg_finalize();
}
static void init_sendbuf (int *source, int nelem, int my_pe, int npes)
@@ -87,23 +87,23 @@ int main (int argc, char **argv)
nelem = atoi(argv[1]);
}
int my_pe = roc_shmem_my_pe();
int npes = roc_shmem_n_pes();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int ndevices, my_device = 0;
CHECK_HIP(hipGetDeviceCount(&ndevices));
my_device = my_pe % ndevices;
CHECK_HIP(hipSetDevice(my_device));
roc_shmem_init();
rocshmem_init();
int *source = (int *)roc_shmem_malloc(nelem * npes * sizeof(int));
int *dest = (int *)roc_shmem_malloc(nelem * npes * sizeof(int));
int *source = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
int *dest = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
if (NULL == source || NULL == dest) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << source << ", dest: " << dest << ", size: "
<< sizeof(int) * nelem * npes << std::endl;
roc_shmem_global_exit(1);
rocshmem_global_exit(1);
}
init_sendbuf(source, nelem, my_pe, npes);
@@ -111,9 +111,9 @@ int main (int argc, char **argv)
dest[i] = -1;
}
roc_shmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID;
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
rocshmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
&team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
@@ -127,9 +127,9 @@ int main (int argc, char **argv)
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
roc_shmem_free(source);
roc_shmem_free(dest);
rocshmem_free(source);
rocshmem_free(dest);
roc_shmem_finalize();
rocshmem_finalize();
return 0;
}
+23 -23
View File
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_broadcast_test.o -o rocshmem_broadcast_test
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test
*/
#include <iostream>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#define CHECK_HIP(condition) { \
hipError_t error = condition; \
@@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test
using namespace rocshmem;
__global__ void broadcast_test(int *source, int *dest, size_t nelem,
int root, roc_shmem_team_t team) {
__shared__ roc_shmem_ctx_t ctx;
int root, rocshmem_team_t team) {
__shared__ rocshmem_ctx_t ctx;
int64_t ctx_type = 0;
roc_shmem_wg_init();
roc_shmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = roc_shmem_ctx_n_pes(ctx);
rocshmem_wg_init();
rocshmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = rocshmem_ctx_n_pes(ctx);
roc_shmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
roc_shmem_ctx_quiet(ctx);
rocshmem_ctx_quiet(ctx);
__syncthreads();
roc_shmem_wg_ctx_destroy(&ctx);
roc_shmem_wg_finalize();
rocshmem_wg_ctx_destroy(&ctx);
rocshmem_wg_finalize();
}
static void init_sendbuf(int *source, int nelem, int my_pe)
@@ -80,23 +80,23 @@ int main(int argc, char **argv)
nelem = atoi(argv[1]);
}
int my_pe = roc_shmem_my_pe();
int npes = roc_shmem_n_pes();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int ndevices, my_device = 0;
CHECK_HIP(hipGetDeviceCount(&ndevices));
my_device = my_pe % ndevices;
CHECK_HIP(hipSetDevice(my_device));
roc_shmem_init();
rocshmem_init();
int *source = (int *)roc_shmem_malloc(nelem * sizeof(int));
int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int));
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
if (NULL == source || NULL == dest) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << source << ", dest: " << dest << ", size: "
<< sizeof(int) * nelem << std::endl;
roc_shmem_global_exit(1);
rocshmem_global_exit(1);
}
init_sendbuf(source, nelem, my_pe);
@@ -105,9 +105,9 @@ int main(int argc, char **argv)
}
int root = 0;
roc_shmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID;
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
rocshmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
&team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
@@ -122,9 +122,9 @@ int main(int argc, char **argv)
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
}
roc_shmem_free(source);
roc_shmem_free(dest);
rocshmem_free(source);
rocshmem_free(dest);
roc_shmem_finalize();
rocshmem_finalize();
return 0;
}
+17 -17
View File
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_getmem_test.o -o rocshmem_getmem_test \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 ./rocshmem_getmem_test
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 ./rocshmem_getmem_test
*/
#include <iostream>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#define CHECK_HIP(condition) { \
hipError_t error = condition; \
@@ -31,25 +31,25 @@ using namespace rocshmem;
__global__ void simple_getmem_test(int *src, int *dst, size_t nelem)
{
roc_shmem_wg_init();
rocshmem_wg_init();
int threadId = blockIdx.x * blockDim.x + threadIdx.x;
if (threadId == 0) {
int rank = roc_shmem_my_pe();
int rank = rocshmem_my_pe();
int peer = rank ? 0 : 1;
roc_shmem_getmem(dst, src, nelem * sizeof(int), peer);
roc_shmem_quiet();
rocshmem_getmem(dst, src, nelem * sizeof(int), peer);
rocshmem_quiet();
}
__syncthreads();
roc_shmem_wg_finalize();
rocshmem_wg_finalize();
}
#define MAX_ELEM 256
int main (int argc, char **argv)
{
int rank = roc_shmem_my_pe();
int rank = rocshmem_my_pe();
int ndevices, my_device = 0;
CHECK_HIP(hipGetDeviceCount(&ndevices));
my_device = rank % ndevices;
@@ -60,15 +60,15 @@ int main (int argc, char **argv)
nelem = atoi(argv[1]);
}
roc_shmem_init();
int npes = roc_shmem_n_pes();
int *src = (int *)roc_shmem_malloc(nelem * sizeof(int));
int *dst = (int *)roc_shmem_malloc(nelem * sizeof(int));
rocshmem_init();
int npes = rocshmem_n_pes();
int *src = (int *)rocshmem_malloc(nelem * sizeof(int));
int *dst = (int *)rocshmem_malloc(nelem * sizeof(int));
if (NULL == src || NULL == dst) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << src << ", dest: " << dst << ", size: "
<< sizeof(int) * nelem << std::endl;
roc_shmem_global_exit(1);
rocshmem_global_exit(1);
}
for (int i=0; i<nelem; i++) {
@@ -79,7 +79,7 @@ int main (int argc, char **argv)
int threadsPerBlock=256;
simple_getmem_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(src, dst, nelem);
roc_shmem_barrier_all();
rocshmem_barrier_all();
CHECK_HIP(hipDeviceSynchronize());
bool pass = true;
@@ -93,8 +93,8 @@ int main (int argc, char **argv)
}
printf("Test %s \t %s\n", argv[0], pass ? "[PASS]" : "[FAIL]");
roc_shmem_free(src);
roc_shmem_free(dst);
roc_shmem_finalize();
rocshmem_free(src);
rocshmem_free(dst);
rocshmem_finalize();
return 0;
}
File diff suppressed because it is too large Load Diff
@@ -22,7 +22,7 @@
#include "primitive_tester.hpp"
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#include <debug.hpp>
#include <unistd.h>
@@ -41,26 +41,26 @@ PrimitiveTest(int loop,
int my_pe,
ShmemContextType ctx_type)
{
__shared__ roc_shmem_ctx_t ctx;
roc_shmem_wg_init();
roc_shmem_wg_ctx_create(ctx_type, &ctx);
__shared__ rocshmem_ctx_t ctx;
rocshmem_wg_init();
rocshmem_wg_ctx_create(ctx_type, &ctx);
int block_id = hipBlockIdx_x;
for(int i =0; i< loop; i++){
roc_shmem_ctx_putmem_nbi_wg(ctx, &r_buf[my_pe*size], &s_buf[block_id * size], size, block_id);
rocshmem_ctx_putmem_nbi_wg(ctx, &r_buf[my_pe*size], &s_buf[block_id * size], size, block_id);
if(hipThreadIdx_x==0){
//roc_shmem_ctx_quiet(ctx);
//roc_shmem_ctx_threadfence_system(ctx);
roc_shmem_ctx_int_p(ctx, &flag[my_pe], i+1, block_id);
//roc_shmem_ctx_quiet(ctx);
roc_shmem_int_wait_until(&flag[block_id], ROC_SHMEM_CMP_EQ, i+1);
//rocshmem_ctx_quiet(ctx);
//rocshmem_ctx_threadfence_system(ctx);
rocshmem_ctx_int_p(ctx, &flag[my_pe], i+1, block_id);
//rocshmem_ctx_quiet(ctx);
rocshmem_int_wait_until(&flag[block_id], ROCSHMEM_CMP_EQ, i+1);
}
__syncthreads();
}
roc_shmem_wg_ctx_destroy(ctx);
roc_shmem_wg_finalize();
rocshmem_wg_ctx_destroy(ctx);
rocshmem_wg_finalize();
}
/******************************************************************************
@@ -69,16 +69,16 @@ PrimitiveTest(int loop,
PrimitiveTester::PrimitiveTester(TesterArguments args)
: Tester(args)
{
flag = (int*) roc_shmem_malloc(args.numprocs);
flag = (int*) rocshmem_malloc(args.numprocs);
memset(flag, 0, args.numprocs*sizeof(int));
// s_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size);
// r_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size);
// s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size);
// r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size);
}
PrimitiveTester::~PrimitiveTester()
{
roc_shmem_free(s_buf);
roc_shmem_free(r_buf);
rocshmem_free(s_buf);
rocshmem_free(r_buf);
}
void
@@ -99,8 +99,8 @@ PrimitiveTester::launchKernel(dim3 gridSize,
void* sendBuf = malloc(64);
void* recvBuf = malloc(64 * nproc);
s_buf = (char *)roc_shmem_malloc(size * nproc);
r_buf = (char *)roc_shmem_malloc(size * nproc);
s_buf = (char *)rocshmem_malloc(size * nproc);
r_buf = (char *)rocshmem_malloc(size * nproc);
resetBuffers(size);
MPI_Allgather(sendBuf, 64, MPI_CHAR,
@@ -108,7 +108,7 @@ PrimitiveTester::launchKernel(dim3 gridSize,
MPI_COMM_WORLD);
size_t shared_bytes;
roc_shmem_dynamic_shared(&shared_bytes);
rocshmem_dynamic_shared(&shared_bytes);
hipLaunchKernelGGL(PrimitiveTest,
gridSize,
+4 -4
View File
@@ -22,7 +22,7 @@
#include <vector>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#include "tester.hpp"
#include "tester_arguments.hpp"
@@ -39,7 +39,7 @@ int main(int argc, char * argv[])
/***
* Select a GPU
*/
int rank = roc_shmem_my_pe();
int rank = rocshmem_my_pe();
int ndevices, my_device=0;
hipGetDeviceCount (&ndevices);
my_device = rank % ndevices;
@@ -48,7 +48,7 @@ int main(int argc, char * argv[])
/**
* Must initialize rocshmem to access arguments needed by the tester.
*/
roc_shmem_init(args.num_wgs);
rocshmem_init(args.num_wgs);
/**
* Now grab the arguments from rocshmem.
@@ -78,7 +78,7 @@ int main(int argc, char * argv[])
* The rocshmem library needs to be cleaned up with this call. It pairs
* with the init function above.
*/
roc_shmem_finalize();
rocshmem_finalize();
return 0;
}
+3 -3
View File
@@ -27,7 +27,7 @@
#include <iostream>
#include <hip/hip_runtime.h>
#include <mpi.h>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
//#include "broadcast_tester.hpp"
#include "primitive_tester.hpp"
@@ -125,8 +125,8 @@ Tester::execute()
printf("error = %d \n", err);
}
// roc_shmem_dump_stats();
// roc_shmem_reset_stats();
// rocshmem_dump_stats();
// rocshmem_reset_stats();
+1 -1
View File
@@ -25,7 +25,7 @@
#include <vector>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#include "tester_arguments.hpp"
@@ -26,7 +26,7 @@
#include <cstdlib>
#include <iostream>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
using namespace rocshmem;
@@ -64,7 +64,7 @@ void
TesterArguments::show_usage(std::string executable_name)
{
std::cout << "Usage: " << executable_name << std::endl;
std::cout << "\t-t <number of roc_shmem service threads>\n";
std::cout << "\t-t <number of rocshmem service threads>\n";
std::cout << "\t-w <number of workgroups>\n";
std::cout << "\t-s <maximum message size (in bytes)>\n";
std::cout << "\t-a <algorithm number to test>\n";
@@ -78,7 +78,7 @@ TesterArguments::show_usage(std::string executable_name)
void
TesterArguments::get_rocshmem_arguments()
{
numprocs = roc_shmem_n_pes();
myid = roc_shmem_my_pe();
numprocs = rocshmem_n_pes();
myid = rocshmem_my_pe();
}
@@ -35,7 +35,7 @@ class TesterArguments
/**
* Initialize rocshmem members
* Valid after roc_shmem_init function called.
* Valid after rocshmem_init function called.
*/
void get_rocshmem_arguments();
@@ -54,7 +54,7 @@ class TesterArguments
uint64_t min_msg_size = 1;
uint64_t max_msg_size = 1 << 20;
unsigned wg_size = 64;
unsigned shmem_context = 8; // ROC_SHMEM_CTX_WG_PRIVATE
unsigned shmem_context = 8; // ROCSHMEM_CTX_WG_PRIVATE
/**
* Arguments obtained from rocshmem
+5 -5
View File
@@ -55,7 +55,7 @@ project(spts VERSION 1.1.0 LANGUAGES CXX)
# CONFIGURATION OPTIONS
###############################################################################
option(USE_HIP "Build HIP version of the solver" OFF)
option(USE_ROCSHMEM "Build ROC_SHMEM enabled version of the solver" OFF)
option(USE_ROCSHMEM "Build rocSHMEM enabled version of the solver" OFF)
option(ALL_ANALYZE "Build analyze and solve algorithm" OFF)
option(USE_DOUBLE "Use double precision floats for the data" OFF)
option(ALL_LEVELSET "Build levelset algorithm" OFF)
@@ -84,7 +84,7 @@ target_sources(
)
###############################################################################
# HIP / HIP + ROC_SHMEM
# HIP / HIP + rocSHMEM
###############################################################################
if(USE_HIP)
find_package(hip REQUIRED)
@@ -95,7 +95,7 @@ if(USE_HIP)
HIPHelper.cpp
)
if(USE_ROC_SHMEM)
if(USE_ROCSHMEM)
find_package(rocshmem CONFIG REQUIRED)
target_include_directories(
@@ -118,8 +118,8 @@ if(USE_HIP)
###############################################################################
else()
if(USE_ROC_SHMEM)
message(FATAL_ERROR "Cannot use ROC_SHMEM without USE_HIP")
if(USE_ROCSHMEM)
message(FATAL_ERROR "Cannot use rocSHMEM without USE_HIP")
endif()
target_sources(
+1 -1
View File
@@ -34,7 +34,7 @@
static int SPTS_BLOCK_SIZE = 0;
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
#define WF_PER_WG 1
#else
#define WF_PER_WG 16
+1 -1
View File
@@ -179,7 +179,7 @@ int main(int argc, char *argv[])
else
printf("%lf )", ((double)ns_per_levelsync_iter/1000000.));
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
MPI_Allreduce(MPI_IN_PLACE, (void *) &ns_per_analysis_iter, 1,
MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
+42 -42
View File
@@ -40,8 +40,8 @@
#include <unistd.h>
#ifdef USE_ROC_SHMEM
#include "roc_shmem.hpp"
#ifdef USE_ROCSHMEM
#include "rocshmem.hpp"
#include "mpi.h"
#endif
@@ -100,8 +100,8 @@ class SparseTriangularSolve :
int nCols;
int numBlocks;
/*
#ifdef USE_ROC_SHMEM
roc_shmem_t* handle;
#ifdef USE_ROCSHMEM
rocshmem_t* handle;
#endif
*/
std::unordered_map<int, FloatType> *observed_errors;
@@ -114,15 +114,15 @@ class SparseTriangularSolve :
x = NULL; y = NULL; y_zero = NULL, yref = NULL, observed_errors = NULL, errors_seen = NULL;
xDev = yDev = completedRowsDev = remoteInProgressArrayDev = rowBlocksDev = doneArrayDev = shadowDoneArrayDev = numRowsAtLevelDev = maxDepthDev = rowMapDev = totalSpinDev = oneBufDev = 0;
#ifdef USE_ROC_SHMEM
int roc_shmem_queues = (2560 / WF_PER_WG);
#ifdef USE_ROCSHMEM
int rocshmem_queues = (2560 / WF_PER_WG);
if (2560 % WF_PER_WG)
roc_shmem_queues++;
printf("roc_shmem_queues %d WF_PER_WG %d \n",roc_shmem_queues, WF_PER_WG);
roc_shmem_init(roc_shmem_queues);
rocshmem_queues++;
printf("rocshmem_queues %d WF_PER_WG %d \n",rocshmem_queues, WF_PER_WG);
rocshmem_init(rocshmem_queues);
this->Set_total_pes(roc_shmem_n_pes());
this->Set_this_pe(roc_shmem_my_pe());
this->Set_total_pes(rocshmem_n_pes());
this->Set_this_pe(rocshmem_my_pe());
#else
this->Set_total_pes(1);
this->Set_this_pe(0);
@@ -173,7 +173,7 @@ class SparseTriangularSolve :
if (remoteInProgressArrayDev != 0)
this->GPU->FreeMem(remoteInProgressArrayDev);
#ifndef USE_ROC_SHMEM
#ifndef USE_ROCSHMEM
if (yDev != 0)
this->GPU->FreeMem(yDev);
if (doneArrayDev != 0)
@@ -184,14 +184,14 @@ class SparseTriangularSolve :
this->GPU->FreeMem(shadowDoneArrayDev);
#else
if (yDev != 0)
roc_shmem_free(yDev);
rocshmem_free(yDev);
if (doneArrayDev != 0)
roc_shmem_free(doneArrayDev);
rocshmem_free(doneArrayDev);
if (reqUpdateArrayDev != 0)
roc_shmem_free(reqUpdateArrayDev);
rocshmem_free(reqUpdateArrayDev);
if (shadowDoneArrayDev != 0)
roc_shmem_free(shadowDoneArrayDev);
roc_shmem_finalize();
rocshmem_free(shadowDoneArrayDev);
rocshmem_finalize();
#endif
}
};
@@ -207,8 +207,8 @@ void SparseTriangularSolve<FloatType>::AddDerivedInputFlags()
AddInputFlag("non_symmetric", 'n', "false", "Force the program to work on non-symmetric matrices. This will ignore the upper triangular entirely. (Default=false)", "bool");
AddInputFlag("levelsync_size", 'l', "0", "Number of rows to launch in a level-sync kernel invocation (Default = auto-tune)", "int");
AddInputFlag("verify", 'v', "false", "Verify results", "bool");
AddInputFlag("rocshmem_algorithm", 'a', "0", "ROC_SHMEM algorithm type", "int");
AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for ROC_SHMEM", "int");
AddInputFlag("rocshmem_algorithm", 'a', "0", "rocSHMEM algorithm type", "int");
AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for rocSHMEM", "int");
AddInputFlag("put_block_size", 'p', "1024", "Block size for puts", "int");
AddInputFlag("get_backoff_factor", 'g', "128", "Backoff factor for gets", "int");
}
@@ -241,10 +241,10 @@ void SparseTriangularSolve<FloatType>::AllocateVectors(
}
xDev = this->GPU->AllocateMem("xDev", nCols*sizeof(FloatType), GPU_MEM_READ_ONLY, NULL);
#ifndef USE_ROC_SHMEM
#ifndef USE_ROCSHMEM
yDev = this->GPU->AllocateMem("yDev", nRows*sizeof(FloatType), GPU_MEM_READ_WRITE, NULL);
#else
yDev = (memPointer) roc_shmem_malloc(nRows*sizeof(FloatType));
yDev = (memPointer) rocshmem_malloc(nRows*sizeof(FloatType));
#endif
}
@@ -742,10 +742,10 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
/****** SpTS Meta-Data Setup Code ******/
/* Set up the OpenCL buffers for the SpTS meta-data */
// TODO -- is this +1 in doneArray nRows+1 required? Why?
#ifdef USE_ROC_SHMEM
doneArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t));
reqUpdateArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t));
shadowDoneArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t));
#ifdef USE_ROCSHMEM
doneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
reqUpdateArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
shadowDoneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
#else
doneArrayDev = this->GPU->AllocateMem("doneArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
reqUpdateArrayDev = this->GPU->AllocateMem("reqUpdateArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
@@ -835,7 +835,7 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
bool syncfree_better = false;
int total_workitems_per_workgroup = WF_SIZE * WF_PER_WG;
//bool roc_shmem_initialized = false;
//bool rocshmem_initialized = false;
/*********************** Actual work of the benchmark *********************/
for(int i = 0; i < iter; i++)
@@ -883,18 +883,18 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
#else
int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
/ total_workitems_per_workgroup;
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
global_work_size = this->nRows_p * WF_SIZE;
num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
/ total_workitems_per_workgroup;
/*
int roc_shmem_queues = (2560 / WF_PER_WG);
int rocshmem_queues = (2560 / WF_PER_WG);
if (2560 % WF_PER_WG)
roc_shmem_queues++;
if (!roc_shmem_initialized) {
rocshmem_queues++;
if (!rocshmem_initialized) {
int num_threads = InputFlags::GetValueInt("num_roshmem_threads");
roc_shmem_init(&handle, roc_shmem_queues);
roc_shmem_initialized = true;
rocshmem_init(&handle, rocshmem_queues);
rocshmem_initialized = true;
}
*/
int rocshmem_algorithm = InputFlags::GetValueInt("rocshmem_algorithm");
@@ -915,11 +915,11 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
printf("Using put/get hybrid intra-kernel algorithm\n");
break;
default:
printf("Unknown ROC_SHMEM algoirthm\n");
printf("Unknown rocSHMEM algorithm\n");
exit(-1);
}
size_t LDS_size;
roc_shmem_dynamic_shared(&LDS_size);
rocshmem_dynamic_shared(&LDS_size);
printf("Work size %zu, wg size %d num workgroups %d LDS %zu thisPE %d Global %d \n", global_work_size, total_workitems_per_workgroup, num_of_workgroups, LDS_size, this->Get_this_pe(), this->Get_total_pes());
MPI_Barrier(MPI_COMM_WORLD);
hipEventRecord(event_array[0], NULL);
@@ -969,7 +969,7 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
hipEventRecord(event_array[1], NULL);
hipEventSynchronize(event_array[1]);
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
// Wait for any outstanding network messages to finish up. We
// can have straggler updates to the doneArray that we don't
// have any dependencies for but we still eed it to finish so
@@ -1000,7 +1000,7 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
this->GPU->CopyToHost(numRowsAtLevelDev, numRowsAtLevel, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL);
this->GPU->Flush();
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
// Combine global statistics
MPI_Allreduce(MPI_IN_PLACE, (void *) &maxDepth, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(MPI_IN_PLACE, (void *) &totalSpin, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
@@ -1114,8 +1114,8 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
uint32_t current_iteration = 0;
#ifdef USE_ROC_SHMEM
fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n");
#ifdef USE_ROCSHMEM
fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
exit(-1);
#endif
@@ -1215,8 +1215,8 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
level_sync_cutoff = 81920;
}
#ifdef USE_ROC_SHMEM
fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n");
#ifdef USE_ROCSHMEM
fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
exit(-1);
#endif
@@ -1346,8 +1346,8 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
// Number of levels is maxDepth. */
levelset_iter++;
#ifdef USE_ROC_SHMEM
fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n");
#ifdef USE_ROCSHMEM
fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
exit(-1);
#endif
+2 -2
View File
@@ -77,8 +77,8 @@ class SparseMatrix
d_vals = NULL;
d_row_ptrs = NULL;
this_pe = -1;//roc_shmem_my_pe(handle); // this pe
total_pes = -1;//roc_shmem_n_pes(handle); // total number of pes
this_pe = -1;//rocshmem_my_pe(handle); // this pe
total_pes = -1;//rocshmem_n_pes(handle); // total number of pes
}
void AllocateSparseMatrix(MatrixMarketReader<FloatType> &mm_reader,
@@ -5,7 +5,7 @@ src_path=$(dirname "$(realpath $0)")/..
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DUSE_ROC_SHMEM=OFF \
-DUSE_ROCSHMEM=OFF \
-DUSE_HIP=ON \
-DALL_ANALYZE=ON \
-DUSE_DOUBLE=OFF \
@@ -5,7 +5,7 @@ src_path=$(dirname "$(realpath $0)")/..
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DUSE_ROC_SHMEM=OFF \
-DUSE_ROCSHMEM=OFF \
-DUSE_HIP=OFF \
-DALL_ANALYZE=ON \
-DUSE_DOUBLE=OFF \
@@ -12,7 +12,7 @@ src_path=$(dirname "$(realpath $0)")/..
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DUSE_ROC_SHMEM=ON \
-DUSE_ROCSHMEM=ON \
-DUSE_HIP=ON \
-DALL_ANALYZE=ON \
-DUSE_DOUBLE=OFF \
+1 -1
View File
@@ -1,4 +1,4 @@
#cmakedefine USE_ROC_SHMEM
#cmakedefine USE_ROCSHMEM
#cmakedefine USE_HIP
#cmakedefine ALL_ANALYZE
#cmakedefine USE_DOUBLE
+43 -43
View File
@@ -26,8 +26,8 @@
#include <hip/math_functions.h>
#include <hip/device_functions.h>
#ifdef USE_ROC_SHMEM
#include "roc_shmem.hpp"
#ifdef USE_ROCSHMEM
#include "rocshmem.hpp"
using namespace rocshmem;
#endif
@@ -991,7 +991,7 @@ inline FPTYPE cross_lane_reduction_three(FPTYPE temp_sum, unsigned int *row_max_
__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
amd_spts_analyze_and_solve(
const size_t global_work_size,
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
const int this_pe,
const int total_pes,
unsigned int * __restrict__ shadowDoneArray,
@@ -1002,9 +1002,9 @@ amd_spts_analyze_and_solve(
// 1: Naive gets
// 2: blocked puts
// 3: put/get hybrid
int roc_shmem_algorithm,
int roc_shmem_put_block_size,
int roc_shmem_get_backoff_factor,
int rocshmem_algorithm,
int rocshmem_put_block_size,
int rocshmem_get_backoff_factor,
int spts_block_size,
#endif
const FPTYPE * __restrict__ vals,
@@ -1043,13 +1043,13 @@ amd_spts_analyze_and_solve(
const unsigned int wg_lid = hipThreadIdx_x;
const unsigned int lid = wg_lid % WF_SIZE;
#ifdef USE_ROC_SHMEM
__shared__ roc_shmem_ctx_t ctx;
#ifdef USE_ROCSHMEM
__shared__ rocshmem_ctx_t ctx;
//if (wg_lid == OUTPUT_THREAD) {
roc_shmem_wg_init();
roc_shmem_wg_ctx_create(ROC_SHMEM_CTX_WG_PRIVATE, &ctx);
rocshmem_wg_init();
rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx);
__syncthreads();
#endif
@@ -1061,7 +1061,7 @@ amd_spts_analyze_and_solve(
// Actual row this wavefront will work on.
const unsigned int local_row = local_first_row + local_offset;
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
// Get the global row for this wavefront assuming a row-cyclic
// decomposition. Basically we need to account for other PEs here.
int local_block_id = local_row / spts_block_size;
@@ -1144,7 +1144,7 @@ amd_spts_analyze_and_solve(
// While there are threads in this workgroup that have been unable to
// get their input, loop and wait for the flag to exist.
__asm__ volatile ("s_setprio 0");
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
int target_pe = (local_col / spts_block_size) % total_pes;
int backoff_counter = 0;
bool need_remote_notify = true;
@@ -1179,8 +1179,8 @@ amd_spts_analyze_and_solve(
spin_times++;
#ifdef USE_ROC_SHMEM
if ((total_pes > 1) && (target_pe != this_pe) && (roc_shmem_algorithm == 1)) {
#ifdef USE_ROCSHMEM
if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 1)) {
if (first_time) {
if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0)
need_comm = false;
@@ -1188,12 +1188,12 @@ amd_spts_analyze_and_solve(
first_time = false;
if (need_comm)
{
for (int i = 0; i < (backoff_counter * roc_shmem_get_backoff_factor); i++)
for (int i = 0; i < (backoff_counter * rocshmem_get_backoff_factor); i++)
__asm__ volatile("s_sleep 127");
roc_shmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
//roc_shmem_ctx_quiet(ctx);
rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
//rocshmem_ctx_quiet(ctx);
__asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n"
"s_waitcnt vmcnt(0)"
@@ -1203,7 +1203,7 @@ amd_spts_analyze_and_solve(
if (local_done)
{
roc_shmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
__asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP
:
@@ -1217,19 +1217,19 @@ amd_spts_analyze_and_solve(
}
}
if ((total_pes > 1) && (target_pe != this_pe) && (roc_shmem_algorithm == 3)) {
if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 3)) {
if (need_remote_notify) {
need_remote_notify = false;
//if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0)
//if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) == 0)
{
roc_shmem_ctx_putmem_nbi(ctx, &reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe);
rocshmem_ctx_putmem_nbi(ctx, &reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe);
//printf("Put 111 blockIDx %d threadID %d target_pe %d local_col %d oneBuf[0]= %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, oneBuf[0]);
roc_shmem_ctx_fence(ctx);
rocshmem_ctx_fence(ctx);
//printf("fence 222 blockIDx %d threadID %d target_pe %d local_col %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col);
roc_shmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
roc_shmem_ctx_quiet(ctx);
rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
rocshmem_ctx_quiet(ctx);
//printf("Get 333 blockIDx %d threadID %d target_pe %d local_col %d shadowDone %d \n \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, shadowDoneArray[local_col]);
__asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n"
@@ -1239,8 +1239,8 @@ amd_spts_analyze_and_solve(
if (local_done)
{
roc_shmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
roc_shmem_ctx_quiet(ctx);
rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
rocshmem_ctx_quiet(ctx);
__asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP
:
: "v"(&doneArray[local_col]),
@@ -1313,34 +1313,34 @@ amd_spts_analyze_and_solve(
__asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(row_max_depth));
asm volatile ("s_waitcnt vmcnt(0)\n\t");
#ifdef USE_ROC_SHMEM
if (roc_shmem_algorithm == 2 && total_pes > 1) {
int CHUNK = roc_shmem_put_block_size;
#ifdef USE_ROCSHMEM
if (rocshmem_algorithm == 2 && total_pes > 1) {
int CHUNK = rocshmem_put_block_size;
bool sendTime = true;
int row_base = (row / CHUNK) * CHUNK;
int num_done = atomicAdd(&shadowDoneArray[row_base], 1);
sendTime = (num_done == (CHUNK - 1));
for(int p=0; p<total_pes; p++){
if(p != this_pe && sendTime){
roc_shmem_ctx_putmem_nbi(ctx, &out_y[row_base], &out_y[row_base], sizeof(FPTYPE) * CHUNK, p);
roc_shmem_ctx_fence(ctx);
roc_shmem_ctx_putmem_nbi(ctx, &doneArray[row_base], &doneArray[row_base], sizeof(int) * CHUNK, p);
roc_shmem_ctx_quiet(ctx);
rocshmem_ctx_putmem_nbi(ctx, &out_y[row_base], &out_y[row_base], sizeof(FPTYPE) * CHUNK, p);
rocshmem_ctx_fence(ctx);
rocshmem_ctx_putmem_nbi(ctx, &doneArray[row_base], &doneArray[row_base], sizeof(int) * CHUNK, p);
rocshmem_ctx_quiet(ctx);
}
}
}
if (roc_shmem_algorithm == 0) {
if (rocshmem_algorithm == 0) {
for(int p=0; p<total_pes; p++){
if(p != this_pe){
roc_shmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
roc_shmem_ctx_fence(ctx);
roc_shmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
rocshmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
rocshmem_ctx_fence(ctx);
rocshmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
}
}
}
if (roc_shmem_algorithm == 3) {
if (rocshmem_algorithm == 3) {
// Only broadcast update if another node explicitly registered for this row. TODO:
// Make 2D array to scale
unsigned int need_broadcast;
@@ -1349,9 +1349,9 @@ amd_spts_analyze_and_solve(
if (need_broadcast == 1) {
for(int p=0; p<total_pes; p++) {
if (p != this_pe) {
roc_shmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
roc_shmem_ctx_fence(ctx);
roc_shmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
rocshmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
rocshmem_ctx_fence(ctx);
rocshmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
}
}
}
@@ -1373,11 +1373,11 @@ amd_spts_analyze_and_solve(
}
}
#ifdef USE_ROC_SHMEM
#ifdef USE_ROCSHMEM
__syncthreads();
//if (wg_lid == OUTPUT_THREAD)
roc_shmem_wg_ctx_destroy(ctx);
roc_shmem_wg_finalize();
rocshmem_wg_ctx_destroy(ctx);
rocshmem_wg_finalize();
#endif
}
+2 -2
View File
@@ -184,11 +184,11 @@ pipeline {
steps {
dir("clients/functional_tests") {
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC true'
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC true'
}
//dir("internal/clients/spts") {
// sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
// sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
// sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
//}
}
}
+3 -3
View File
@@ -396,14 +396,14 @@ pipeline {
steps {
dir("clients/functional_tests") {
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
}
dir("clients/sos_tests") {
sh 'ROC_SHMEM_RO=1 ./driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI'
sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI'
}
dir("internal/clients/spts") {
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
}
}
}
+2 -2
View File
@@ -321,11 +321,11 @@ pipeline {
steps {
dir("clients/functional_tests") {
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
}
dir("internal/clients/spts") {
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
}
}
}
+6 -6
View File
@@ -15,9 +15,9 @@ pipeline {
UCX_WARN_UNUSED_ENV_VARS="n"
HSA_FORCE_FINE_GRAIN_PCIE=1
UCX_TLS="rc"
ROC_SHMEM_USE_SQ_GPU_MEM=0
ROC_SHMEM_USE_CQ_GPU_MEM=0
ROC_SHMEM_NUM_BLOCKS=128
ROCSHMEM_USE_SQ_GPU_MEM=0
ROCSHMEM_USE_CQ_GPU_MEM=0
ROCSHMEM_NUM_BLOCKS=128
}
stages {
stage('Synchronize Source Code') {
@@ -244,13 +244,13 @@ pipeline {
stage('RO_NET') {
steps {
dir("clients/functional_tests") {
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/rocshmem_example_driver ro ${build_dir}/RO_NET'
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/rocshmem_example_driver ro ${build_dir}/RO_NET'
}
dir("clients/sos_tests") {
sh 'ROC_SHMEM_RO=1 ./driver.sh ${build_dir}/RO_NET short ${build_dir}/RO_NET'
sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RO_NET short ${build_dir}/RO_NET'
}
//dir("internal/clients/spts") {
// sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/spts multi_thread ${build_dir}/RO_NET'
// sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/spts multi_thread ${build_dir}/RO_NET'
//}
}
}
+1 -1
View File
@@ -141,7 +141,7 @@ pipeline {
steps {
dir("clients/functional_tests") {
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
}
}
}
+1 -1
View File
@@ -33,7 +33,7 @@ ${BUILD}/sort_mpi: ${BUILD}/sort_mpi.o
${BUILD}/sort_mpi.o: ${SRC}/sort_mpi.cu
${HIPCC} $^ -I${MPI_HOME}/include -fgpu-rdc -o $@ -c
RO_FLAGS=ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1
RO_FLAGS=ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1
ITERS?=0 1 2 3 4 5 6 7 8 9
TIMEOUT=1m
HOSTS=sv-pdp-0,sv-pdp-1,sv-pdp-2,sv-pdp-3
+18 -18
View File
@@ -9,13 +9,13 @@ __device__ uint64_t timers[TIMERS] = {0};
__device__ uint64_t time_start;
#define TIMERS_START() \
if(threadIdx.x == 0) {\
time_start = roc_shmem_timer();\
time_start = rocshmem_timer();\
}
#define TIME(TIMER_NUM) \
if(threadIdx.x == 0) {\
timers[TIMER_NUM] = roc_shmem_timer() - time_start;\
time_start = roc_shmem_timer();\
timers[TIMER_NUM] = rocshmem_timer() - time_start;\
time_start = rocshmem_timer();\
}
#define OUTPUT_TIME() \
@@ -268,14 +268,14 @@ void initGPU()
MPI_Barrier(MPI_COMM_WORLD);
}
void *roc_shmem_malloc(size_t size)
void *rocshmem_malloc(size_t size)
{
void *v;
hipMalloc((void **)&v, size);
return v;
}
int roc_shmem_free(void *v)
int rocshmem_free(void *v)
{
return hipFree(v);
}
@@ -316,14 +316,14 @@ int main(int argc, char *argv[])
// Init buffers
int *keyBuffer1, *keyBuffer2;
keyBuffer1 = (int*)roc_shmem_malloc(sizeof(int) * size);
keyBuffer2 = (int*)roc_shmem_malloc(sizeof(int) * size * 4);
keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0;
sendCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
recvCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
sendOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
recvOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
printf("Begin untimed run\n");
// Untimed run
@@ -369,12 +369,12 @@ int main(int argc, char *argv[])
// Clean up
hipFree(keys);
hipFree(outputKeys);
roc_shmem_free(keyBuffer1);
roc_shmem_free(keyBuffer2);
roc_shmem_free(sendCount);
roc_shmem_free(recvCount);
roc_shmem_free(sendOffset);
roc_shmem_free(recvOffset);
rocshmem_free(keyBuffer1);
rocshmem_free(keyBuffer2);
rocshmem_free(sendCount);
rocshmem_free(recvCount);
rocshmem_free(sendOffset);
rocshmem_free(recvOffset);
MPI_Finalize();
return 0;
}
}
+18 -18
View File
@@ -9,13 +9,13 @@ __device__ uint64_t timers[TIMERS] = {0};
__device__ uint64_t time_start;
#define TIMERS_START() \
if(threadIdx.x == 0) {\
time_start = roc_shmem_timer();\
time_start = rocshmem_timer();\
}
#define TIME(TIMER_NUM) \
if(threadIdx.x == 0) {\
timers[TIMER_NUM] = roc_shmem_timer() - time_start;\
time_start = roc_shmem_timer();\
timers[TIMER_NUM] = rocshmem_timer() - time_start;\
time_start = rocshmem_timer();\
}
#define OUTPUT_TIME() \
@@ -282,14 +282,14 @@ void initGPU(ncclComm_t &comms)
MPI_Barrier(MPI_COMM_WORLD);
}
void *roc_shmem_malloc(size_t size)
void *rocshmem_malloc(size_t size)
{
void *v;
hipMalloc((void **)&v, size);
return v;
}
int roc_shmem_free(void *v)
int rocshmem_free(void *v)
{
return hipFree(v);
}
@@ -331,14 +331,14 @@ int main(int argc, char *argv[])
// Init buffers
int *keyBuffer1, *keyBuffer2;
keyBuffer1 = (int*)roc_shmem_malloc(sizeof(int) * size);
keyBuffer2 = (int*)roc_shmem_malloc(sizeof(int) * size * 4);
keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0;
sendCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
recvCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
sendOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
recvOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
printf("Begin untimed run\n");
// Untimed run
@@ -382,13 +382,13 @@ int main(int argc, char *argv[])
// Clean up
hipFree(keys);
hipFree(outputKeys);
roc_shmem_free(keyBuffer1);
roc_shmem_free(keyBuffer2);
roc_shmem_free(sendCount);
roc_shmem_free(recvCount);
roc_shmem_free(sendOffset);
roc_shmem_free(recvOffset);
rocshmem_free(keyBuffer1);
rocshmem_free(keyBuffer2);
rocshmem_free(sendCount);
rocshmem_free(recvCount);
rocshmem_free(sendOffset);
rocshmem_free(recvOffset);
ncclCommDestroy(comms);
MPI_Finalize();
return 0;
}
}
+46 -46
View File
@@ -1,7 +1,7 @@
#include <iostream>
#include <stdio.h>
#include <mpi.h>
#include <roc_shmem/roc_shmem.hpp>
#include <rocshmem/rocshmem.hpp>
#include <unistd.h>
using namespace std;
using namespace rocshmem;
@@ -16,13 +16,13 @@ __device__ uint64_t timers[TIMERS] = {0};
__device__ uint64_t time_start;
#define TIMERS_START() \
if(threadIdx.x == 0) {\
time_start = roc_shmem_timer();\
time_start = rocshmem_timer();\
}
#define TIME(TIMER_NUM) \
if(threadIdx.x == 0) {\
timers[TIMER_NUM] = roc_shmem_timer() - time_start;\
time_start = roc_shmem_timer();\
timers[TIMER_NUM] = rocshmem_timer() - time_start;\
time_start = rocshmem_timer();\
}
#define OUTPUT_TIME() \
@@ -41,11 +41,11 @@ __device__ uint64_t time_start;
#define OUTPUT_TIME()
#endif
__device__ __inline__ void alltoall(roc_shmem_ctx_t &ctx,
roc_shmem_team_t team,
__device__ __inline__ void alltoall(rocshmem_ctx_t &ctx,
rocshmem_team_t team,
int *dst, int *src) {
// Perform alltoall
roc_shmem_ctx_int_wg_alltoall(ctx,
rocshmem_ctx_int_wg_alltoall(ctx,
team,
dst, // T* dest
src, // const T* source
@@ -56,18 +56,18 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
int *keyBuffer2, int *sendCount,
int *recvCount, int *sendOffset,
int *recvOffset, int *outputKeys,
size_t size, roc_shmem_team_t team,
size_t size, rocshmem_team_t team,
int max_iters) {
__shared__ roc_shmem_ctx_t ctx;
__shared__ rocshmem_ctx_t ctx;
__shared__ int bucketCounter[MAX_PES];
__shared__ int bucketPtr[MAX_PES];
__shared__ int total_size;
roc_shmem_wg_init();
roc_shmem_wg_ctx_create(ROC_SHMEM_CTX_WG_PRIVATE, &ctx);
rocshmem_wg_init();
rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx);
int n_pes = roc_shmem_ctx_n_pes(ctx);
int my_pe = roc_shmem_my_pe();
int n_pes = rocshmem_ctx_n_pes(ctx);
int my_pe = rocshmem_my_pe();
int buckets = n_pes;
int tid = threadIdx.x; // + blockDim.x * blockIdx.x;
@@ -116,9 +116,9 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1;
keyBuffer1[loc] = keys[i];
}
roc_shmem_ctx_threadfence_system(ctx);
rocshmem_ctx_threadfence_system(ctx);
// Force sync to wait for all PEs to update bucket sizes
roc_shmem_ctx_wg_team_sync(ctx, team);
rocshmem_ctx_wg_team_sync(ctx, team);
TIME(3)
// Let all PEs know how many keys you wish to send
alltoall(ctx, team, recvCount, sendCount);
@@ -129,11 +129,11 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
if(threadIdx.x == 0) {
total_size = 0;
for(int i = 0; i < buckets; ++i) {
roc_shmem_int_get_nbi(&keyBuffer2[total_size],
rocshmem_int_get_nbi(&keyBuffer2[total_size],
&keyBuffer1[recvOffset[i]], recvCount[i], i);
total_size += recvCount[i];
}
roc_shmem_quiet();
rocshmem_quiet();
}
for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x)
outputKeys[i] = 0;
@@ -163,14 +163,14 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
TIME(7)
}
OUTPUT_TIME()
roc_shmem_wg_ctx_destroy(ctx);
roc_shmem_wg_finalize();
rocshmem_wg_ctx_destroy(ctx);
rocshmem_wg_finalize();
}
bool verify(int *outputKeys, int *keyBuffer2, size_t size)
{
int num_pes = roc_shmem_n_pes();
int my_pe = roc_shmem_my_pe();
int num_pes = rocshmem_n_pes();
int my_pe = rocshmem_my_pe();
MPI_Status status;
MPI_Request request;
@@ -228,8 +228,8 @@ void initGPU()
{
// Calculation for local rank, taken from rccl-tests
int localRank = 0;
int proc = roc_shmem_my_pe();
int nProcs = roc_shmem_n_pes();
int proc = rocshmem_my_pe();
int nProcs = rocshmem_n_pes();
char hostname[1024];
gethostname(hostname, 1024);
for (int i=0; i< 1024; i++) {
@@ -261,12 +261,12 @@ void initGPU()
int main(int argc, char *argv[])
{
// Init roc_shmem stuff
// Init rocshmem stuff
initGPU();
roc_shmem_init(NUM_WGS);
int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD);
roc_shmem_team_t team_world_dup = ROC_SHMEM_TEAM_INVALID;
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD,
rocshmem_init(NUM_WGS);
int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD);
rocshmem_team_t team_world_dup = ROCSHMEM_TEAM_INVALID;
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD,
0,
1,
n_pes,
@@ -278,8 +278,8 @@ int main(int argc, char *argv[])
if(argc > 1)
iterations = atoi(argv[1]);
int num_pes = roc_shmem_n_pes();
int my_pe = roc_shmem_my_pe();
int num_pes = rocshmem_n_pes();
int my_pe = rocshmem_my_pe();
// Configure input and outputs
size_t size = 1024; //atoi(argv[2]);
@@ -298,17 +298,17 @@ int main(int argc, char *argv[])
// Init buffers
int *keyBuffer1, *keyBuffer2;
keyBuffer1 = (int*)roc_shmem_malloc(sizeof(int) * size);
keyBuffer2 = (int*)roc_shmem_malloc(sizeof(int) * size * 4);
keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
int *sendCount, *recvCount, *sendOffset, *recvOffset;
sendCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
recvCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
sendOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
recvOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
// Untimed run
roc_shmem_barrier_all();
rocshmem_barrier_all();
sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2,
sendCount, recvCount, sendOffset, recvOffset,
outputKeys, size, team_world_dup, 1);
@@ -321,7 +321,7 @@ int main(int argc, char *argv[])
}
// Timed run
roc_shmem_barrier_all();
rocshmem_barrier_all();
auto time_start = TIME_NOW;
sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2,
sendCount, recvCount, sendOffset, recvOffset,
@@ -347,12 +347,12 @@ int main(int argc, char *argv[])
// Clean up
hipFree(keys);
hipFree(outputKeys);
roc_shmem_free(keyBuffer1);
roc_shmem_free(keyBuffer2);
roc_shmem_free(sendCount);
roc_shmem_free(recvCount);
roc_shmem_free(sendOffset);
roc_shmem_free(recvOffset);
roc_shmem_finalize();
rocshmem_free(keyBuffer1);
rocshmem_free(keyBuffer2);
rocshmem_free(sendCount);
rocshmem_free(recvCount);
rocshmem_free(sendOffset);
rocshmem_free(recvOffset);
rocshmem_finalize();
return 0;
}
}
+130 -130
View File
@@ -47,88 +47,88 @@ case $2 in
###########################################################################
*"serial")
echo "get_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
check get_n2_w1_z1_1MB
echo "getnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
check getnbi_n2_w1_z1_1MB
echo "put_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
check put_n2_w1_z1_1MB
echo "putnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
check putnbi_n2_w1_z1_1MB
echo "wg_get_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 28 > $3/wg_get_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 28 > $3/wg_get_n2_w1_z64_1MB.log
check wg_get_n2_w1_z1_1MB
echo "wg_getnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_n2_w1_z64_1MB.log
check wg_getnbi_n2_w1_z1_1MB
echo "wg_put_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 30 > $3/wg_put_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 30 > $3/wg_put_n2_w1_z64_1MB.log
check wg_put_n2_w1_z1_1MB
echo "wg_putnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_n2_w1_z64_1MB.log
check wg_putnbi_n2_w1_z1_1MB
echo "wg_get_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 28 > $3/wg_get_tiled_n2_w2_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 28 > $3/wg_get_tiled_n2_w2_z64_1MB.log
check wg_get_tiled_n2_w1_z1_1MB
echo "wg_getnbi_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_tiled_n2_w2_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_tiled_n2_w2_z64_1MB.log
check wg_getnbi_tiled_n2_w1_z1_1MB
echo "wg_put_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 30 > $3/wg_put_tiled_n2_w2_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 30 > $3/wg_put_tiled_n2_w2_z64_1MB.log
check wg_put_tiled_n2_w1_z1_1MB
echo "wg_putnbi_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_tiled_n2_w2_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_tiled_n2_w2_z64_1MB.log
check wg_putnbi_tiled_n2_w1_z1_1MB
echo "wave_get_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 32 > $3/wave_get_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 32 > $3/wave_get_n2_w1_z64_1MB.log
check wave_get_n2_w1_z1_1MB
echo "wave_getnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 33 > $3/wave_getnbi_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 33 > $3/wave_getnbi_n2_w1_z64_1MB.log
check wave_getnbi_n2_w1_z1_1MB
echo "wave_put_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 34 > $3/wave_put_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 34 > $3/wave_put_n2_w1_z64_1MB.log
check wave_put_n2_w1_z1_1MB
echo "wave_putnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 35 > $3/wave_putnbi_n2_w1_z64_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 35 > $3/wave_putnbi_n2_w1_z64_1MB.log
check wave_putnbi_n2_w1_z1_1MB
echo "wave_get_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 32 > $3/wave_get_tiled_n2_w2_z128_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 32 > $3/wave_get_tiled_n2_w2_z128_1MB.log
check wave_get_tiled_n2_w1_z1_1MB
echo "wave_getnbi_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 33 > $3/wave_getnbi_tiled_n2_w2_z128_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 33 > $3/wave_getnbi_tiled_n2_w2_z128_1MB.log
check wave_getnbi_tiled_n2_w1_z1_1MB
echo "wave_put_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 34 > $3/wave_put_tiled_n2_w2_z128_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 34 > $3/wave_put_tiled_n2_w2_z128_1MB.log
check wave_put_tiled_n2_w1_z1_1MB
echo "wave_putnbi_tiled_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 35 > $3/wave_putnbi_tiled_n2_w2_z128_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 35 > $3/wave_putnbi_tiled_n2_w2_z128_1MB.log
check wave_putnbi_tiled_n2_w1_z1_1MB
echo "amofadd_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
check amofadd_n2_w1_z1
echo "amofinc_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
check amofinc_n2_w1_z1
echo "amofetch_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
check amofetch_n2_w1_z1
echo "amofcswap_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
check amofcswap_n2_w1_z1
echo "amoadd_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
check amoadd_n2_w1_z1
echo "amoinc_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
check amoinc_n2_w1_z1
# echo "pingpong_n2_w1"
# ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
# ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
# check pingpong_n2_w1
echo "amoset_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
check amoset_n2_w1_z1
;;
@@ -137,88 +137,88 @@ case $2 in
###########################################################################
*"short")
echo "get_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
check get_n2_w16_z128_8B
echo "getnbi_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
check getnbi_n2_w16_z128_8B
echo "put_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
check put_n2_w16_z128_8B
echo "putnbi_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
check putnbi_n2_w16_z128_8B
echo "wg_get_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 28 > $3/wg_get_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 28 > $3/wg_get_n2_w1_z64_8B.log
check wg_get_n2_w1_z64_8B
echo "wg_getnbi_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 29 > $3/wg_getnbi_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 29 > $3/wg_getnbi_n2_w1_z64_8B.log
check wg_getnbi_n2_w1_z64_8B
echo "wg_put_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 30 > $3/wg_put_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 30 > $3/wg_put_n2_w1_z64_8B.log
check wg_put_n2_w1_z64_8B
echo "wg_putnbi_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 31 > $3/wg_putnbi_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 31 > $3/wg_putnbi_n2_w1_z64_8B.log
check wg_putnbi_n2_w1_z64_8B
echo "wg_get_tiled_n2_w16_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 28 > $3/wg_get_tiled_n2_w16_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 28 > $3/wg_get_tiled_n2_w16_z64_8B.log
check wg_get_tiled_n2_w16_z64_8B
echo "wg_getnbi_tiled_n2_w16_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 29 > $3/wg_getnbi_tiled_n2_w16_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 29 > $3/wg_getnbi_tiled_n2_w16_z64_8B.log
check wg_getnbi_tiled_n2_w16_z64_8B
echo "wg_put_tiled_n2_w16_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 30 > $3/wg_put_tiled_n2_w16_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 30 > $3/wg_put_tiled_n2_w16_z64_8B.log
check wg_put_tiled_n2_w16_z64_8B
echo "wg_putnbi_tiled_n2_w16_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 31 > $3/wg_putnbi_tiled_n2_w16_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 31 > $3/wg_putnbi_tiled_n2_w16_z64_8B.log
check wg_putnbi_tiled_n2_w16_z64_8B
echo "wave_get_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 32 > $3/wave_get_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 32 > $3/wave_get_n2_w1_z64_8B.log
check wave_get_n2_w1_z64_8B
echo "wave_getnbi_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 33 > $3/wave_getnbi_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 33 > $3/wave_getnbi_n2_w1_z64_8B.log
check wave_getnbi_n2_w1_z64_8B
echo "wave_put_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 34 > $3/wave_put_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 34 > $3/wave_put_n2_w1_z64_8B.log
check wave_put_n2_w1_z64_8B
echo "wave_putnbi_n2_w1_z64_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 35 > $3/wave_putnbi_n2_w1_z64_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 35 > $3/wave_putnbi_n2_w1_z64_8B.log
check wave_putnbi_n2_w1_z64_8B
echo "wave_get_tiled_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 32 > $3/wave_get_tiled_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 32 > $3/wave_get_tiled_n2_w16_z128_8B.log
check wave_get_tiled_n2_w16_z128_8B
echo "wave_getnbi_tiled_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 33 > $3/wave_getnbi_tiled_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 33 > $3/wave_getnbi_tiled_n2_w16_z128_8B.log
check wave_getnbi_tiled_n2_w16_z128_8B
echo "wave_put_tiled_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 34 > $3/wave_put_tiled_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 34 > $3/wave_put_tiled_n2_w16_z128_8B.log
check wave_put_tiled_n2_w16_z128_8B
echo "wave_putnbi_tiled_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 35 > $3/wave_putnbi_tiled_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 35 > $3/wave_putnbi_tiled_n2_w16_z128_8B.log
check wave_putnbi_tiled_n2_w16_z128_8B
echo "amofadd_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
check amofadd_n2_w8_z1
echo "amofinc_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
check amofinc_n2_w8_z1
echo "amofetch_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
check amofetch_n2_w8_z1
echo "amofcswap_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
check amofcswap_n2_w8_z1
echo "amoadd_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
check amoadd_n2_w8_z1
echo "amoinc_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
check amoinc_n2_w8_z1
# echo "pingpong_n2_w1"
# ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
# ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
# check pingpong_n2_w1
echo "amoset_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
check amoset_n2_w8_z1
;;
@@ -228,246 +228,246 @@ case $2 in
*"exhaustive")
############################### GET ###################################
echo "get_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
check get_n2_w1_z1_1MB
echo "get_n2_w1_z1024_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 0 > $3/get_n2_w1_z1024_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 0 > $3/get_n2_w1_z1024_512B.log
check get_n2_w1_z1024_512B
echo "get_n2_w8_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 0 > $3/get_n2_w8_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 0 > $3/get_n2_w8_z1_1MB.log
check get_n2_w8_z1_1MB
echo "get_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
check get_n2_w16_z128_8B
echo "get_n2_w32_z256_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 0 > $3/get_n2_w32_z256_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 0 > $3/get_n2_w32_z256_512B.log
check get_n2_w32_z256_512B
echo "get_n2_w64_z1024_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 0 > $3/get_n2_w64_z1024_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 0 > $3/get_n2_w64_z1024_8B.log
check get_n2_w64_z1024_8B
############################### GETNBI ################################
echo "getnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
check getnbi_n2_w1_z1_1MB
echo "getnbi_n2_w1_z1024_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 1 > $3/getnbi_n2_w1_z1024_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 1 > $3/getnbi_n2_w1_z1024_512B.log
check getnbi_n2_w1_z1024_512B
echo "getnbi_n2_w8_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w8_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w8_z1_1MB.log
check getnbi_n2_w8_z1_1MB
echo "getnbi_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
check getnbi_n2_w16_z128_8B
echo "getnbi_n2_w32_z256_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 1 > $3/getnbi_n2_w32_z256_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 1 > $3/getnbi_n2_w32_z256_512B.log
check getnbi_n2_w32_z256_512B
echo "getnbi_n2_w64_z1024_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 1 > $3/getnbi_n2_w64_z1024_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 1 > $3/getnbi_n2_w64_z1024_8B.log
check getnbi_n2_w64_z1024_8B
############################### PUT ###################################
echo "put_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
check put_n2_w1_z1_1MB
echo "put_n2_w1_z1024_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 2 > $3/put_n2_w1_z1024_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 2 > $3/put_n2_w1_z1024_512B.log
check put_n2_w1_z1024_512B
echo "put_n2_w8_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 2 > $3/put_n2_w8_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 2 > $3/put_n2_w8_z1_1MB.log
check put_n2_w8_z1_1MB
echo "put_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
check put_n2_w16_z128_8B
echo "put_n2_w32_z256_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 2 > $3/put_n2_w32_z256_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 2 > $3/put_n2_w32_z256_512B.log
check put_n2_w32_z256_512B
echo "put_n2_w64_z1024_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 2 > $3/put_n2_w64_z1024_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 2 > $3/put_n2_w64_z1024_8B.log
check put_n2_w64_z1024_8B
############################### PUTNBI ################################
echo "putnbi_n2_w1_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
check putnbi_n2_w1_z1_1MB
echo "putnbi_n2_w1_z1024_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 3 > $3/putnbi_n2_w1_z1024_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 3 > $3/putnbi_n2_w1_z1024_512B.log
check putnbi_n2_w1_z1024_512B
echo "putnbi_n2_w8_z1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w8_z1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w8_z1_1MB.log
check putnbi_n2_w8_z1_1MB
echo "putnbi_n2_w16_z128_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
check putnbi_n2_w16_z128_8B
echo "putnbi_n2_w32_z256_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 3 > $3/putnbi_n2_w32_z256_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 3 > $3/putnbi_n2_w32_z256_512B.log
check putnbi_n2_w32_z256_512B
echo "putnbi_n2_w64_z1024_8B"
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 3 > $3/putnbi_n2_w64_z1024_8B.log
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 3 > $3/putnbi_n2_w64_z1024_8B.log
check putnbi_n2_w64_z1024_8B
############################# REDUCTION ##############################
echo "reduction_n2_w1_z1_32K"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 32768 -a 5 > $3/reduction_n2_w1_z1_32K.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 32768 -a 5 > $3/reduction_n2_w1_z1_32K.log
check reduction_n2_w1_z1_32K
echo "reduction_n2_w8_z1_32K"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 32768 -a 5 > $3/reduction_n2_w8_z1_32K.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 32768 -a 5 > $3/reduction_n2_w8_z1_32K.log
check reduction_n2_w8_z1_32K
echo "reduction_n2_w32_z1_32K"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -s 32768 -a 5 > $3/reduction_n2_w32_z1_32K.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -s 32768 -a 5 > $3/reduction_n2_w32_z1_32K.log
check reduction_n2_w32_z1_32K
############################## AMOFADD ###############################
echo "amofadd_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
check amofadd_n2_w1_z1
echo "amofadd_n2_w1_z1024"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 6 > $3/amofadd_n2_w1_z1024.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 6 > $3/amofadd_n2_w1_z1024.log
check amofadd_n2_w1_z1024
echo "amofadd_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
check amofadd_n2_w8_z1
echo "amofadd_n2_w32_z128"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 6 > $3/amofadd_n2_w32_z128.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 6 > $3/amofadd_n2_w32_z128.log
check amofadd_n2_w32_z128
############################## AMOFINC ###############################
echo "amofinc_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
check amofinc_n2_w1_z1
echo "amofinc_n2_w1_z1024"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 7 > $3/amofinc_n2_w1_z1024.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 7 > $3/amofinc_n2_w1_z1024.log
check amofinc_n2_w1_z1024
echo "amofinc_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
check amofinc_n2_w8_z1
echo "amofinc_n2_w32_z128"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 7 > $3/amofinc_n2_w32_z128.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 7 > $3/amofinc_n2_w32_z128.log
check amofinc_n2_w32_z128
############################ AMOFETCH ################################
echo "amofetch_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
check amofetch_n2_w1_z1
echo "amofetch_n2_w1_z1024"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 8 > $3/amofetch_n2_w1_z1024.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 8 > $3/amofetch_n2_w1_z1024.log
check amofetch_n2_w1_z1024
echo "amofetch_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
check amofetch_n2_w8_z1
echo "amofetch_n2_w32_z128"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 8 > $3/amofetch_n2_w32_z128.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 8 > $3/amofetch_n2_w32_z128.log
check amofetch_n2_w32_z128
########################### AMOFCSWAP ################################
echo "amofcswap_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
check amofcswap_n2_w1_z1
echo "amofcswap_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
check amofcswap_n2_w8_z1
echo "amofcswap_n2_w32_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 9 > $3/amofcswap_n2_w32_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 9 > $3/amofcswap_n2_w32_z1.log
check amofcswap_n2_w32_z1
############################# AMOADD ################################
echo "amoadd_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
check amoadd_n2_w1_z1
echo "amoadd_n2_w1_z1024"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 10 > $3/amoadd_n2_w1_z1024.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 10 > $3/amoadd_n2_w1_z1024.log
check amoadd_n2_w1_z1024
echo "amoadd_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
check amoadd_n2_w8_z1
echo "amoadd_n2_w32_z128"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 10 > $3/amoadd_n2_w32_z128.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 10 > $3/amoadd_n2_w32_z128.log
check amoadd_n2_w32_z128
############################# AMOINC ################################
echo "amoinc_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
check amoinc_n2_w1_z1
echo "amoinc_n2_w1_z1024"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 11 > $3/amoinc_n2_w1_z1024.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 11 > $3/amoinc_n2_w1_z1024.log
check amoinc_n2_w1_z1024
echo "amoinc_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
check amoinc_n2_w8_z1
echo "amoinc_n2_w32_z128"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 11 > $3/amoinc_n2_w32_z128.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 11 > $3/amoinc_n2_w32_z128.log
check amoinc_n2_w32_z128
############################## INIT #################################
echo "init_n2"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -a 13 > $3/init_n2.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -a 13 > $3/init_n2.log
check init_n2
########################### PINGPONG ################################
echo "pingpong_n2_w1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
check pingpong_n2_w1
echo "pingpong_n2_w8"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 14 > $3/pingpong_n2_w8.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 14 > $3/pingpong_n2_w8.log
check pingpong_n2_w8
echo "pingpong_n2_w32"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 14 > $3/pingpong_n2_w32.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 14 > $3/pingpong_n2_w32.log
check pingpong_n2_w32
############################ BARRIER ################################
echo "barrier_n2_w1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 17 > $3/barrier_n2_w1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 17 > $3/barrier_n2_w1.log
check barrier_n2_w1
echo "barrier_n2_w8"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 17 > $3/barrier_n2_w8.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 17 > $3/barrier_n2_w8.log
check barrier_n2_w8
echo "barrier_n2_w32"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 17 > $3/barrier_n2_w32.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 17 > $3/barrier_n2_w32.log
check barrier_n2_w32
############################ SYNCALL ################################
echo "syncall_n2_w1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 18 > $3/syncall_n2_w1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 18 > $3/syncall_n2_w1.log
check syncall_n2_w1
echo "syncall_n2_w8"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 18 > $3/syncall_n2_w8.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 18 > $3/syncall_n2_w8.log
check syncall_n2_w8
echo "syncall_n2_w32"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 18 > $3/syncall_n2_w32.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 18 > $3/syncall_n2_w32.log
check syncall_n2_w32
############################# SYNC ##################################
echo "sync_n2_w1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 19 > $3/sync_n2_w1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 19 > $3/sync_n2_w1.log
check sync_n2_w1
echo "sync_n2_w8"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 19 > $3/sync_n2_w8.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 19 > $3/sync_n2_w8.log
check sync_n2_w8
echo "sync_n2_w32"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 19 > $3/sync_n2_w32.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 19 > $3/sync_n2_w32.log
check sync_n2_w32
########################### FCOLLECT ################################
echo "fcollect_n2_w1_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 22 > $3/fcollect_n2_w1_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 22 > $3/fcollect_n2_w1_512B.log
check fcollect_n2_w1_512B
echo "fcollect_n2_w8_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 22 > $3/fcollect_n2_w8_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 22 > $3/fcollect_n2_w8_512B.log
check fcollect_n2_w8_512B
echo "fcollect_n2_w32_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 22 > $3/fcollect_n2_w32_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 22 > $3/fcollect_n2_w32_512B.log
check fcollect_n2_w32_512B
########################### ALLTOALL ################################
echo "alltoall_n2_w1_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 23 > $3/alltoall_n2_w1_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 23 > $3/alltoall_n2_w1_512B.log
check alltoall_n2_w1_512B
echo "alltoall_n2_w8_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 23 > $3/alltoall_n2_w8_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 23 > $3/alltoall_n2_w8_512B.log
check alltoall_n2_w8_512B
echo "alltoall_n2_w32_512B"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 23 > $3/alltoall_n2_w32_512B.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 23 > $3/alltoall_n2_w32_512B.log
check alltoall_n2_w32_512B
########################## TEAMGETNBI ###############################
echo "teamgetnbi_n2_w1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 39 > $3/teamgetnbi_n2_w1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 39 > $3/teamgetnbi_n2_w1_1MB.log
check teamgetnbi_n2_w1_1MB
########################## TEAMPUTNBI ###############################
echo "teamputnbi_n2_w1_1MB"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 41 > $3/teamputnbi_n2_w1_1MB.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 41 > $3/teamputnbi_n2_w1_1MB.log
check teamputnbi_n2_w1_1MB
############################ AMOSET #################################
echo "amoset_n2_w1_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
check amoset_n2_w1_z1
echo "amoset_n2_w8_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
check amoset_n2_w8_z1
echo "amoset_n2_w32_z1"
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 44 > $3/amoset_n2_w32_z1.log
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 44 > $3/amoset_n2_w32_z1.log
check amoset_n2_w32_z1
;;
@@ -82,10 +82,10 @@ do
# test exeuction based on lib
if [ "$libnm" == "ro_net" ]
then
ROC_SHMEM_RO=1
ROCSHMEM_RO=1
ROC_NET_CPU_QUEUE=1
UCX_TLS=rc
#echo $ROC_SHMEM_RO"--"$ROC_NET_CPU_QUEUE "--"$UCX_TLS
#echo $ROCSHMEM_RO"--"$ROC_NET_CPU_QUEUE "--"$UCX_TLS
../scripts/functional_tests/driver.sh tests/functional_tests/rocshmem_example_driver $threadType .
else
../scripts/functional_tests/driver.sh tests/functional_tests/rocshmem_example_driver $threadType .
+2 -2
View File
@@ -31,8 +31,8 @@ target_sources(
context_host.cpp
context_device.cpp
mpi_init_singleton.cpp
roc_shmem_gpu.cpp
roc_shmem.cpp
rocshmem_gpu.cpp
rocshmem.cpp
team.cpp
team_tracker.cpp
util.cpp
+2 -2
View File
@@ -202,7 +202,7 @@ void Backend::reset_stats() {
reset_backend_stats();
}
__device__ bool Backend::create_ctx(int64_t option, roc_shmem_ctx_t* ctx) {
__device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
#ifdef USE_GPU_IB
return static_cast<GPUIBBackend*>(this)->create_ctx(option, ctx);
#elif defined(USE_RO)
@@ -212,7 +212,7 @@ __device__ bool Backend::create_ctx(int64_t option, roc_shmem_ctx_t* ctx) {
#endif
}
__device__ void Backend::destroy_ctx(roc_shmem_ctx_t* ctx) {
__device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) {
#ifdef USE_GPU_IB
static_cast<GPUIBBackend*>(this)->destroy_ctx(ctx);
#elif defined(USE_RO)
+6 -6
View File
@@ -36,7 +36,7 @@
#include <vector>
#include "config.h" // NOLINT(build/include_subdir)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "backend_type.hpp"
#include "ipc_policy.hpp"
#include "memory/symmetric_heap.hpp"
@@ -56,7 +56,7 @@ class TeamInfo;
* It uses this state to populate Context objects which the GPU may use to
* perform networking operations.
*
* The roc_shmem.cpp implementation file wraps many the Backend public
* The rocshmem.cpp implementation file wraps many the Backend public
* members to implement the library's public API.
*/
class Backend {
@@ -74,8 +74,8 @@ class Backend {
*/
virtual ~Backend();
__device__ bool create_ctx(int64_t option, roc_shmem_ctx_t* ctx);
__device__ void destroy_ctx(roc_shmem_ctx_t* ctx);
__device__ bool create_ctx(int64_t option, rocshmem_ctx_t* ctx);
__device__ void destroy_ctx(rocshmem_ctx_t* ctx);
/**
* @brief Create a new team object and initialize it.
@@ -93,14 +93,14 @@ class Backend {
TeamInfo* team_info_wrt_parent,
TeamInfo* team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t* new_team) = 0;
rocshmem_team_t* new_team) = 0;
/**
* @brief Destruct a team
*
* @param[in] team Handle to the team to destroy.
*/
virtual void team_destroy(roc_shmem_team_t team) = 0;
virtual void team_destroy(rocshmem_team_t team) = 0;
/**
* @brief Reports processing element number id.
+1 -1
View File
@@ -26,7 +26,7 @@
/**
* @file constants.hpp
*
* @brief Contains global constants for ROCSHMEM library
* @brief Contains global constants for rocSHMEM library
*/
namespace rocshmem {
+4 -4
View File
@@ -30,11 +30,11 @@
#include <cstdio>
#include <iostream>
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#define BARRIER() rocshmem::roc_shmem_wg_barrier_all()
#define RANK rocshmem::roc_shmem_my_pe()
#define NPES rocshmem::roc_shmem_n_pes()
#define BARRIER() rocshmem::rocshmem_wg_barrier_all()
#define RANK rocshmem::rocshmem_my_pe()
#define NPES rocshmem::rocshmem_n_pes()
#define PE_BITS ((uint64_t)ceil(log(NPES) / log(2)))
#define PE_OF(X) ((X) >> (64 - PE_BITS))
+12 -12
View File
@@ -67,7 +67,7 @@ class Context {
* just removing the dispatch implementations.
*
* No comments for these guys since its basically the same as in the
* roc_shmem.hpp public header.
* rocshmem.hpp public header.
*/
/**************************************************************************
@@ -139,7 +139,7 @@ class Context {
__device__ void sync_all();
__device__ void sync(roc_shmem_team_t team);
__device__ void sync(rocshmem_team_t team);
template <typename T>
__device__ T amo_fetch(void* dst, T value, T cond, int pe, uint8_t atomic_op);
@@ -186,13 +186,13 @@ class Context {
template <typename T>
__device__ T g(T* source, int pe);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void to_all(T* dest, const T* source, int nreduce, int PE_start,
int logPE_stride, int PE_size, T* pWrk,
long* pSync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
__device__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce);
template <typename T, ROCSHMEM_OP Op>
__device__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce);
template <typename T>
__device__ void put(T* dest, const T* source, size_t nelems, int pe);
@@ -207,15 +207,15 @@ class Context {
__device__ void get_nbi(T* dest, const T* source, size_t nelems, int pe);
template <typename T>
__device__ void alltoall(roc_shmem_team_t team, T* dest, const T* source,
__device__ void alltoall(rocshmem_team_t team, T* dest, const T* source,
int nelems);
template <typename T>
__device__ void fcollect(roc_shmem_team_t team, T* dest, const T* source,
__device__ void fcollect(rocshmem_team_t team, T* dest, const T* source,
int nelems);
template <typename T>
__device__ void broadcast(roc_shmem_team_t team, T* dest, const T* source,
__device__ void broadcast(rocshmem_team_t team, T* dest, const T* source,
int nelems, int pe_root);
template <typename T>
@@ -351,16 +351,16 @@ class Context {
long* p_sync); // NOLINT(runtime/int)
template <typename T>
__host__ void broadcast(roc_shmem_team_t team, T* dest, const T* source,
__host__ void broadcast(rocshmem_team_t team, T* dest, const T* source,
int nelems, int pe_root);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(T* dest, const T* source, int nreduce, int PE_start,
int logPE_stride, int PE_size, T* pWrk,
long* pSync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
__host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce);
template <typename T, ROCSHMEM_OP Op>
__host__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce);
template <typename T>
__host__ void wait_until(T *ivars, int cmp, T val);
+1 -1
View File
@@ -154,7 +154,7 @@ __device__ void Context::sync_all() {
DISPATCH(sync_all());
}
__device__ void Context::sync(roc_shmem_team_t team) {
__device__ void Context::sync(rocshmem_team_t team) {
ctxStats.incStat(NUM_SYNC_ALL);
DISPATCH(sync(team));
+13 -13
View File
@@ -62,7 +62,7 @@ __device__ T Context::g(T *source, int pe) {
}
// The only way to get multi-arg templates to feed into a macro
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void Context::to_all(T *dest, const T *source, int nreduce,
int PE_start, int logPE_stride, int PE_size,
T *pWrk,
@@ -79,11 +79,11 @@ __device__ void Context::to_all(T *dest, const T *source, int nreduce,
PE_size, pWrk, pSync));
}
template <typename T, ROC_SHMEM_OP Op>
__device__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source,
template <typename T, ROCSHMEM_OP Op>
__device__ int Context::reduce(rocshmem_team_t team, T *dest, const T *source,
int nreduce) {
if (nreduce == 0) {
return ROC_SHMEM_SUCCESS;
return ROCSHMEM_SUCCESS;
}
if (is_thread_zero_in_block()) {
@@ -140,7 +140,7 @@ __device__ void Context::get_nbi(T *dest, const T *source, size_t nelems,
}
template <typename T>
__device__ void Context::alltoall(roc_shmem_team_t team, T *dest,
__device__ void Context::alltoall(rocshmem_team_t team, T *dest,
const T *source, int nelems) {
if (nelems == 0) {
return;
@@ -154,7 +154,7 @@ __device__ void Context::alltoall(roc_shmem_team_t team, T *dest,
}
template <typename T>
__device__ void Context::fcollect(roc_shmem_team_t team, T *dest,
__device__ void Context::fcollect(rocshmem_team_t team, T *dest,
const T *source, int nelems) {
if (nelems == 0) {
return;
@@ -168,7 +168,7 @@ __device__ void Context::fcollect(roc_shmem_team_t team, T *dest,
}
template <typename T>
__device__ void Context::broadcast(roc_shmem_team_t team, T *dest,
__device__ void Context::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems, int pe_root) {
if (nelems == 0) {
return;
@@ -340,32 +340,32 @@ __device__ __forceinline__ int Context::test(T *ivars, int cmp,
int ret = 0;
volatile T *vol_ivars = reinterpret_cast<T *>(ivars);
switch (cmp) {
case ROC_SHMEM_CMP_EQ:
case ROCSHMEM_CMP_EQ:
if (uncached_load(vol_ivars) == val) {
ret = 1;
}
break;
case ROC_SHMEM_CMP_NE:
case ROCSHMEM_CMP_NE:
if (uncached_load(vol_ivars) != val) {
ret = 1;
}
break;
case ROC_SHMEM_CMP_GT:
case ROCSHMEM_CMP_GT:
if (uncached_load(vol_ivars) > val) {
ret = 1;
}
break;
case ROC_SHMEM_CMP_GE:
case ROCSHMEM_CMP_GE:
if (uncached_load(vol_ivars) >= val) {
ret = 1;
}
break;
case ROC_SHMEM_CMP_LT:
case ROCSHMEM_CMP_LT:
if (uncached_load(vol_ivars) < val) {
ret = 1;
}
break;
case ROC_SHMEM_CMP_LE:
case ROCSHMEM_CMP_LE:
if (uncached_load(vol_ivars) <= val) {
ret = 1;
}
+5 -5
View File
@@ -194,7 +194,7 @@ __host__ void Context::broadcast(T *dest, const T *source, int nelems,
}
template <typename T>
__host__ void Context::broadcast(roc_shmem_team_t team, T *dest,
__host__ void Context::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems,
int pe_root) { // NOLINT(runtime/int)
if (nelems == 0) {
@@ -206,7 +206,7 @@ __host__ void Context::broadcast(roc_shmem_team_t team, T *dest,
HOST_DISPATCH(broadcast<T>(team, dest, source, nelems, pe_root));
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void Context::to_all(T *dest, const T *source, int nreduce,
int PE_start, int logPE_stride, int PE_size,
T *pWrk,
@@ -221,11 +221,11 @@ __host__ void Context::to_all(T *dest, const T *source, int nreduce,
logPE_stride, PE_size, pWrk, pSync));
}
template <typename T, ROC_SHMEM_OP Op>
__host__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source,
template <typename T, ROCSHMEM_OP Op>
__host__ int Context::reduce(rocshmem_team_t team, T *dest, const T *source,
int nreduce) { // NOLINT(runtime/int)
if (nreduce == 0) {
return ROC_SHMEM_SUCCESS;
return ROCSHMEM_SUCCESS;
}
ctxHostStats.incStat(NUM_HOST_TO_ALL);
+2 -2
View File
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_FENCE_POLICY_HPP_
#define LIBRARY_SRC_FENCE_POLICY_HPP_
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
namespace rocshmem {
@@ -43,7 +43,7 @@ class Fence {
* @param[in] options interpreted as a bitfield using bitwise operations
*/
__host__ __device__ Fence(long option) {
if (option & ROC_SHMEM_CTX_NOSTORE) {
if (option & ROCSHMEM_CTX_NOSTORE) {
flush_ = false;
}
}
+51 -51
View File
@@ -30,7 +30,7 @@
#include <cstdlib>
#include <mutex> // NOLINT(build/c++11)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "gpu_ib_team.hpp"
@@ -47,10 +47,10 @@ namespace rocshmem {
} \
}
extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT;
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
roc_shmem_team_t get_external_team(GPUIBTeam *team) {
return reinterpret_cast<roc_shmem_team_t>(team);
rocshmem_team_t get_external_team(GPUIBTeam *team) {
return reinterpret_cast<rocshmem_team_t>(team);
}
int get_ls_non_zero_bit(char *bitmask, int mask_length) {
@@ -68,7 +68,7 @@ int get_ls_non_zero_bit(char *bitmask, int mask_length) {
}
GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) {
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
std::stringstream sstream(maximum_num_contexts_str);
sstream >> maximum_num_contexts_;
}
@@ -96,7 +96,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
setup_team_world();
roc_shmem_collective_init();
rocshmem_collective_init();
teams_init();
@@ -109,7 +109,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
#ifdef USE_HOST_SIDE_HDP_FLUSH
hdp_gpu_cpu_flush_flag_ =
static_cast<unsigned int *>(roc_shmem_malloc(sizeof(unsigned int)));
static_cast<unsigned int *>(rocshmem_malloc(sizeof(unsigned int)));
hdp_policy->set_flush_polling_ptr(hdp_gpu_cpu_flush_flag_);
hdp_flush_worker_thread = std::thread(&GPUIBBackend::hdp_flush_poll, this);
@@ -125,7 +125,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
}
__device__ bool GPUIBBackend::create_ctx(int64_t options,
roc_shmem_ctx_t *ctx) {
rocshmem_ctx_t *ctx) {
GPUIBContext *ctx_;
auto pop_result = ctx_free_list.get()->pop_front();
@@ -154,7 +154,7 @@ void GPUIBBackend::ctx_destroy(Context *ctx) {
delete gpu_ib_host_ctx;
}
__device__ void GPUIBBackend::destroy_ctx(roc_shmem_ctx_t *ctx) {
__device__ void GPUIBBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
ctx_free_list.get()->push_back(static_cast<GPUIBContext *>(ctx->ctx_opaque));
}
@@ -167,7 +167,7 @@ GPUIBBackend::~GPUIBBackend() {
#ifdef USE_HOST_SIDE_HDP_FLUSH
hdp_flush_worker_thread.join();
hdp_policy->set_flush_polling_ptr(nullptr);
roc_shmem_free(hdp_gpu_cpu_flush_flag_);
rocshmem_free(hdp_gpu_cpu_flush_flag_);
#endif
/**
@@ -203,7 +203,7 @@ void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team,
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) {
rocshmem_team_t *new_team) {
/**
* Read the bit mask and find out a common index into
* the pool of available work arrays.
@@ -237,7 +237,7 @@ void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team,
*new_team = get_external_team(new_team_obj);
}
void GPUIBBackend::team_destroy(roc_shmem_team_t team) {
void GPUIBBackend::team_destroy(rocshmem_team_t team) {
GPUIBTeam *team_obj = get_internal_gpu_ib_team(team);
/* Mark the pool as available */
@@ -263,7 +263,7 @@ void GPUIBBackend::initialize_network() { networkImpl.networkHostSetup(this); }
void GPUIBBackend::setup_default_host_ctx() {
default_host_ctx_ = new GPUIBHostContext(this, 0);
ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_;
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_;
}
void GPUIBBackend::setup_ctxs() {
@@ -288,19 +288,19 @@ void GPUIBBackend::setup_default_ctx() {
new (default_ctx_) GPUIBContext(this, true, 0);
/*
* Set the ROC_SHMEM_CTX_DEFAULT in constant memory.
* Set the ROCSHMEM_CTX_DEFAULT in constant memory.
*/
int *symbol_address;
CHECK_HIP(hipGetSymbolAddress(reinterpret_cast<void **>(&symbol_address),
HIP_SYMBOL(ROC_SHMEM_CTX_DEFAULT)));
HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT)));
TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;
roc_shmem_ctx_t ctx_default_host{default_ctx_, tinfo};
rocshmem_ctx_t ctx_default_host{default_ctx_, tinfo};
hipStream_t stream;
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
CHECK_HIP(hipMemcpyAsync(symbol_address, &ctx_default_host,
sizeof(roc_shmem_ctx_t), hipMemcpyDefault, stream));
sizeof(rocshmem_ctx_t), hipMemcpyDefault, stream));
CHECK_HIP(hipStreamSynchronize(stream));
CHECK_HIP(hipStreamDestroy(stream));
}
@@ -328,9 +328,9 @@ void GPUIBBackend::setup_team_world() {
team_tracker.set_team_world(team_world);
/**
* Copy the address to ROC_SHMEM_TEAM_WORLD.
* Copy the address to ROCSHMEM_TEAM_WORLD.
*/
ROC_SHMEM_TEAM_WORLD = reinterpret_cast<roc_shmem_team_t>(team_world);
ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
}
void GPUIBBackend::init_mpi_once(MPI_Comm comm) {
@@ -381,19 +381,19 @@ void GPUIBBackend::teams_init() {
* Allocate pools for the teams sync and work arrary from the SHEAP.
*/
auto max_num_teams{team_tracker.get_max_num_teams()};
barrier_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
sizeof(long) * ROC_SHMEM_BARRIER_SYNC_SIZE * max_num_teams));
reduce_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
sizeof(long) * ROC_SHMEM_REDUCE_SYNC_SIZE * max_num_teams));
bcast_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE * max_num_teams));
alltoall_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
sizeof(long) * ROC_SHMEM_ALLTOALL_SYNC_SIZE * max_num_teams));
barrier_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams));
reduce_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams));
bcast_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams));
alltoall_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE * max_num_teams));
/* Accommodating for largest possible data type for pWrk */
pWrk_pool = roc_shmem_malloc(
sizeof(double) * ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams);
pAta_pool = roc_shmem_malloc(sizeof(double) * ROC_SHMEM_ATA_MAX_WRKDATA_SIZE *
pWrk_pool = rocshmem_malloc(
sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams);
pAta_pool = rocshmem_malloc(sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE *
max_num_teams);
/**
@@ -402,25 +402,25 @@ void GPUIBBackend::teams_init() {
long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
for (int team_i = 0; team_i < max_num_teams; team_i++) {
barrier_pSync = reinterpret_cast<long *>(
&barrier_pSync_pool[team_i * ROC_SHMEM_BARRIER_SYNC_SIZE]);
&barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
reduce_pSync = reinterpret_cast<long *>(
&reduce_pSync_pool[team_i * ROC_SHMEM_REDUCE_SYNC_SIZE]);
&reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = reinterpret_cast<long *>(
&bcast_pSync_pool[team_i * ROC_SHMEM_BCAST_SYNC_SIZE]);
&bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
alltoall_pSync = reinterpret_cast<long *>(
&alltoall_pSync_pool[team_i * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
for (int i = 0; i < ROC_SHMEM_BARRIER_SYNC_SIZE; i++) {
barrier_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) {
reduce_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i++) {
bcast_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROC_SHMEM_ALLTOALL_SYNC_SIZE; i++) {
alltoall_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
}
@@ -457,30 +457,30 @@ void GPUIBBackend::teams_init() {
}
void GPUIBBackend::teams_destroy() {
roc_shmem_free(barrier_pSync_pool);
roc_shmem_free(reduce_pSync_pool);
roc_shmem_free(bcast_pSync_pool);
roc_shmem_free(alltoall_pSync_pool);
roc_shmem_free(pWrk_pool);
roc_shmem_free(pAta_pool);
rocshmem_free(barrier_pSync_pool);
rocshmem_free(reduce_pSync_pool);
rocshmem_free(bcast_pSync_pool);
rocshmem_free(alltoall_pSync_pool);
rocshmem_free(pWrk_pool);
rocshmem_free(pAta_pool);
free(pool_bitmask_);
free(reduced_bitmask_);
}
void GPUIBBackend::roc_shmem_collective_init() {
void GPUIBBackend::rocshmem_collective_init() {
/*
* Allocate heap space for barrier_sync
*/
size_t one_sync_size_bytes{sizeof(*barrier_sync)};
size_t sync_size_bytes{one_sync_size_bytes * ROC_SHMEM_BARRIER_SYNC_SIZE};
size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
heap.malloc(reinterpret_cast<void **>(&barrier_sync), sync_size_bytes);
/*
* Initialize the barrier synchronization array with default values.
*/
for (int i = 0; i < num_pes; i++) {
barrier_sync[i] = ROC_SHMEM_SYNC_VALUE;
barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
}
/*
+11 -11
View File
@@ -72,19 +72,19 @@ class GPUIBBackend : public Backend {
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) override;
rocshmem_team_t *new_team) override;
/**
* @copydoc Backend::team_destroy(roc_shmem_team_t)
* @copydoc Backend::team_destroy(rocshmem_team_t)
*/
void team_destroy(roc_shmem_team_t team) override;
void team_destroy(rocshmem_team_t team) override;
/**
* @copydoc Backend::ctx_create
*/
void ctx_create(int64_t options, void **ctx) override;
__device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx);
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
/**
* @copydoc Backend::ctx_destroy
@@ -94,7 +94,7 @@ class GPUIBBackend : public Backend {
/**
* @copydoc Backend::ctx_destroy
*/
__device__ void destroy_ctx(roc_shmem_ctx_t *ctx);
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
protected:
/**
@@ -151,10 +151,10 @@ class GPUIBBackend : public Backend {
void initialize_ipc();
/**
* @brief Allocate and initialize the ROC_SHMEM_CTX_DEFAULT variable.
* @brief Allocate and initialize the ROCSHMEM_CTX_DEFAULT variable.
*
* @todo The default_ctx member looks unused after it is copied into
* the ROC_SHMEM_CTX_DEFAULT variable.
* the ROCSHMEM_CTX_DEFAULT variable.
*/
void setup_default_ctx();
void setup_ctxs();
@@ -187,7 +187,7 @@ class GPUIBBackend : public Backend {
* When this method completes, the barrier_sync member will be available
* for use.
*/
void roc_shmem_collective_init();
void rocshmem_collective_init();
#ifdef USE_HOST_SIDE_HDP_FLUSH
/**
@@ -245,8 +245,8 @@ class GPUIBBackend : public Backend {
void *pAta_pool{nullptr};
/**
* @brief ROC_SHMEM's copy of MPI_COMM_WORLD (for interoperability
* with orthogonal MPI usage in an MPI+ROC_SHMEM program).
* @brief rocSHMEM's copy of MPI_COMM_WORLD (for interoperability
* with orthogonal MPI usage in an MPI+rocSHMEM program).
*/
MPI_Comm gpu_ib_comm_world{};
MPI_Comm backend_comm{};
@@ -334,7 +334,7 @@ class GPUIBBackend : public Backend {
* specification).
*
* @todo Remove this member from the backend class. There is another
* copy stored in ROC_SHMEM_CTX_DEFAULT.
* copy stored in ROCSHMEM_CTX_DEFAULT.
*/
GPUIBContext *default_ctx_{nullptr};
+4 -4
View File
@@ -39,19 +39,19 @@ int Connection::coherent_cq = 0;
Connection::Connection(GPUIBBackend* b, int k) : backend(b), key_offset(k) {
char* value = nullptr;
if ((value = getenv("ROC_SHMEM_USE_IB_HCA"))) {
if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) {
requested_dev = value;
}
if ((value = getenv("ROC_SHMEM_SQ_SIZE"))) {
if ((value = getenv("ROCSHMEM_SQ_SIZE"))) {
sq_size = atoi(value);
}
if ((value = getenv("ROC_SHMEM_USE_CQ_GPU_MEM")) != nullptr) {
if ((value = getenv("ROCSHMEM_USE_CQ_GPU_MEM")) != nullptr) {
cq_use_gpu_mem = atoi(value);
}
if ((value = getenv("ROC_SHMEM_USE_SQ_GPU_MEM")) != nullptr) {
if ((value = getenv("ROCSHMEM_USE_SQ_GPU_MEM")) != nullptr) {
sq_use_gpu_mem = atoi(value);
}
}
+1 -1
View File
@@ -31,7 +31,7 @@ extern "C" {
#include <vector>
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "connection_policy.hpp"
namespace rocshmem {
+1 -1
View File
@@ -25,7 +25,7 @@
#include <hip/hip_runtime.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "backend_ib.hpp"
+17 -17
View File
@@ -69,7 +69,7 @@ class GPUIBContext : public Context {
__device__ void sync_all();
__device__ void sync(roc_shmem_team_t team);
__device__ void sync(rocshmem_team_t team);
template <typename T>
__device__ void amo_add(void *dst, T value, int pe);
@@ -113,13 +113,13 @@ class GPUIBContext : public Context {
template <typename T>
__device__ T g(const T *source, int pe);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void to_all(T *dest, const T *source, int nreduce, int PE_start,
int logPE_stride, int PE_size, T *pWrk,
long *pSync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
__device__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
template <typename T, ROCSHMEM_OP Op>
__device__ void to_all(rocshmem_team_t team, T *dest, const T *source,
int nreduce);
template <typename T>
@@ -135,7 +135,7 @@ class GPUIBContext : public Context {
__device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T>
@@ -144,43 +144,43 @@ class GPUIBContext : public Context {
long *p_sync); // NOLINT(runtime/int)
template <typename T>
__device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source,
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void alltoall_broadcast(roc_shmem_team_t team, T *dest,
__device__ void alltoall_broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void alltoall_brucks(roc_shmem_team_t team, T *dest,
__device__ void alltoall_brucks(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void alltoall_gcen(roc_shmem_team_t team, T *dest, const T *source,
__device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void alltoall_gcen2(roc_shmem_team_t team, T *dest,
__device__ void alltoall_gcen2(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source,
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect_broadcast(roc_shmem_team_t team, T *dest,
__device__ void fcollect_broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect_brucks(roc_shmem_team_t team, T *dest,
__device__ void fcollect_brucks(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect_gcen(roc_shmem_team_t team, T *dest, const T *source,
__device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect_gcen2(roc_shmem_team_t team, T *dest,
__device__ void fcollect_gcen2(rocshmem_team_t team, T *dest,
const T *source, int nelems);
__device__ void putmem_wg(void *dest, const void *source, size_t nelems,
@@ -232,13 +232,13 @@ class GPUIBContext : public Context {
__device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
private:
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void internal_direct_allreduce(
T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
int PE_size, T *pWrk,
long *pSync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void internal_ring_allreduce(T *dst, const T *src, int nelems,
int PE_start, int logPE_stride,
int PE_size, T *pWrk,
+10 -10
View File
@@ -20,7 +20,7 @@
* IN THE SOFTWARE.
*****************************************************************************/
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../context_incl.hpp"
#include "context_ib_tmpl_device.hpp"
#include "../util.hpp"
@@ -35,8 +35,8 @@ __device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start,
// Go through all PE offsets (except current offset = 0)
// and wait until they all reach
for (size_t i = 1; i < n_pes; i++) {
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val);
pSync[i] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
pSync[i] = ROCSHMEM_SYNC_VALUE;
}
threadfence_system();
// Announce to other PEs that all have reached
@@ -48,8 +48,8 @@ __device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start,
// Mark current PE offset as reached
size_t pe_offset = (pe - PE_start) / stride;
put_nbi(&pSync[pe_offset], &flag_val, 1, PE_start);
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
@@ -59,16 +59,16 @@ __device__ void GPUIBContext::internal_atomic_barrier(int pe, int PE_start,
int64_t *pSync) {
int64_t flag_val = 1;
if (pe == PE_start) {
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(n_pes - 1));
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
put_nbi(&pSync[0], &flag_val, 1, j);
}
} else {
amo_add<int64_t>(&pSync[0], flag_val, PE_start);
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
@@ -88,7 +88,7 @@ __device__ void GPUIBContext::internal_sync(int pe, int PE_start, int stride,
__syncthreads();
}
__device__ void GPUIBContext::sync(roc_shmem_team_t team) {
__device__ void GPUIBContext::sync(rocshmem_team_t team) {
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
+4 -4
View File
@@ -86,16 +86,16 @@ class GPUIBHostContext : public Context {
long *p_sync); // NOLINT(runtime/int)
template <typename T>
__host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
int log_pe_stride, int pe_size, T *p_wrk,
long *p_sync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
__host__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(rocshmem_team_t team, T *dest, const T *source,
int nreduce);
template <typename T>
+53 -53
View File
@@ -24,16 +24,16 @@
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "context_ib_device.hpp"
#include "gpu_ib_team.hpp"
#include "queue_pair.hpp"
#include "../util.hpp"
#include "../roc_shmem_calc.hpp"
#include "../rocshmem_calc.hpp"
namespace rocshmem {
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
int wg_size) {
for (size_t i = wg_id; i < size; i += wg_size) {
@@ -47,7 +47,7 @@ __device__ void GPUIBContext::p(T *dest, T value, int pe) {
putmem_nbi(dest, &value, sizeof(T), pe);
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void GPUIBContext::internal_ring_allreduce(
T *dst, const T *src, int nelems, [[maybe_unused]] int PE_start,
[[maybe_unused]] int logPE_stride, [[maybe_unused]] int PE_size, T *pWrk,
@@ -81,7 +81,7 @@ __device__ void GPUIBContext::internal_ring_allreduce(
wait_val = seg + 100;
p(&pSync[round], wait_val, send_pe);
wait_until(&pSync[round], ROC_SHMEM_CMP_EQ, wait_val);
wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val);
__threadfence();
}
__syncthreads();
@@ -99,19 +99,19 @@ __device__ void GPUIBContext::internal_ring_allreduce(
fence();
wait_val = seg + 100;
p(&pSync[round], wait_val, send_pe);
wait_until(&pSync[round], ROC_SHMEM_CMP_EQ, wait_val);
wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val);
}
__syncthreads();
}
}
__syncthreads();
for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
pSync[i] = ROC_SHMEM_SYNC_VALUE;
pSync[i] = ROCSHMEM_SYNC_VALUE;
}
__syncthreads();
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void GPUIBContext::internal_direct_allreduce(
T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
int PE_size, T *pWrk,
@@ -147,7 +147,7 @@ __device__ void GPUIBContext::internal_direct_allreduce(
if (i != pe) {
// Wait for leader thread to see that the buffer is ready.
if (is_thread_zero_in_block()) {
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, 1L);
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, 1L);
}
__syncthreads();
@@ -159,14 +159,14 @@ __device__ void GPUIBContext::internal_direct_allreduce(
__syncthreads();
for (int i = wg_id; i < num_pes; i += wg_size) {
pSync[i] = ROC_SHMEM_SYNC_VALUE;
pSync[i] = ROCSHMEM_SYNC_VALUE;
}
__syncthreads();
}
template <typename T, ROC_SHMEM_OP Op>
__device__ void GPUIBContext::to_all(roc_shmem_team_t team, T *dest,
template <typename T, ROCSHMEM_OP Op>
__device__ void GPUIBContext::to_all(rocshmem_team_t team, T *dest,
const T *source, int nreduce) {
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -189,7 +189,7 @@ __device__ void GPUIBContext::to_all(roc_shmem_team_t team, T *dest,
p_sync);
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce,
int PE_start, int logPE_stride,
int PE_size, T *pWrk,
@@ -200,8 +200,8 @@ __device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce,
size_t ring_pSync = 2 * num_pes;
size_t provided_pWrk =
max(nreduce / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
size_t provided_pSync = ROC_SHMEM_REDUCE_SYNC_SIZE;
max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE;
// TODO(bpotter):
// We basically do a direct reduce if pWrk is big enough, else we
@@ -212,12 +212,12 @@ __device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce,
internal_direct_allreduce<T, Op>(dest, source, nreduce, PE_start,
logPE_stride, PE_size, pWrk, pSync);
} else {
if (ring_pSync <= ROC_SHMEM_REDUCE_SYNC_SIZE) {
if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) {
int chunk_size = 1024;
size_t ring_pWrk = chunk_size * num_pes;
if (provided_pWrk < ring_pWrk) {
ring_pWrk = max(nreduce / 2, // NOLINT
ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
chunk_size = ring_pWrk / num_pes;
}
int seg_size = ring_pWrk;
@@ -434,7 +434,7 @@ __device__ void GPUIBContext::internal_get_broadcast(
}
template <typename T>
__device__ void GPUIBContext::broadcast(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::broadcast(rocshmem_team_t team, T *dst,
const T *src, int nelems, int pe_root) {
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -475,14 +475,14 @@ __device__ void GPUIBContext::broadcast(T *dst, const T *src, int nelems,
}
template <typename T>
__device__ void GPUIBContext::alltoall(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::alltoall(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// Currently broadcast implementation performs the best
alltoall_broadcast(team, dst, src, nelems);
}
template <typename T>
__device__ void GPUIBContext::alltoall_broadcast(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::alltoall_broadcast(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// Broadcast implementation of alltoall collective
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -514,7 +514,7 @@ __device__ void GPUIBContext::alltoall_broadcast(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::alltoall_brucks(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// Brucks implementation of alltoall collective
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -537,7 +537,7 @@ __device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst,
int blk_size = get_flat_block_size();
// Check if we have enough buffer space. If not, fail.
if (pe_size * nelems * 2 > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
if (pe_size * nelems * 2 > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
assert(false);
}
@@ -612,7 +612,7 @@ __device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::alltoall_gcen(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// GPU-centric implementation of alltoall collective
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -629,12 +629,12 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
int stride = 1 << log_pe_stride;
long *pSync = team_obj->alltoall_pSync;
int64_t *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE];
int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
int my_pe_in_team = team_obj->my_pe;
// Check if we have enough buffer space. If not, fail.
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
assert(false);
}
@@ -677,29 +677,29 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
if (dest_pe2 != my_pe) amo_add<int64_t>(&pSync[0], flag_val, dest_pe2);
if (my_pe == dest_pe) {
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1));
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
pSync2[0] = ROCSHMEM_SYNC_VALUE;
__threadfence_system();
for (int i = 1; i < clust_size; ++i)
put_nbi(&pSync2[0], &flag_val, 1,
team_obj->get_pe_in_world(my_pe_in_team + i));
} else {
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val);
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
pSync2[0] = ROCSHMEM_SYNC_VALUE;
__threadfence_system();
}
if (my_pe == dest_pe2) {
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(num_clust - 1));
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(num_clust - 1));
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
for (size_t i = 1, j = dest_pe2 + clust_size * stride; i < num_clust;
++i, j += clust_size * stride) {
put_nbi(&pSync[0], &flag_val, 1, j);
}
} else {
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
@@ -707,7 +707,7 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::alltoall_gcen2(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// GPU-centric implementation of alltoall collective
// Uses in-place blocking sync
@@ -725,12 +725,12 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
int stride = 1 << log_pe_stride;
long *pSync = team_obj->alltoall_pSync;
int64_t *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE];
int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
int my_pe_in_team = team_obj->my_pe;
// Check if we have enough buffer space. If not, fail.
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
assert(false);
}
@@ -771,15 +771,15 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
if (is_thread_zero_in_block()) {
quiet();
if ((my_pe_in_team % clust_size) == 0) {
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1));
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
pSync2[0] = ROCSHMEM_SYNC_VALUE;
__threadfence_system();
for (int i = 1; i < clust_size; ++i)
put_nbi(&pSync2[0], &flag_val, 1,
team_obj->get_pe_in_world(my_pe_in_team + i));
} else {
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val);
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
pSync2[0] = ROCSHMEM_SYNC_VALUE;
__threadfence_system();
}
}
@@ -790,7 +790,7 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::fcollect(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::fcollect(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// Main function for fcollect
// Broadcast version performs moderately well
@@ -799,7 +799,7 @@ __device__ void GPUIBContext::fcollect(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::fcollect_broadcast(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::fcollect_broadcast(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// Broadcast implementation of fcollect collective
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -832,7 +832,7 @@ __device__ void GPUIBContext::fcollect_broadcast(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::fcollect_brucks(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// Brucks implementation of fcollect collective
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -855,7 +855,7 @@ __device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst,
int blk_size = get_flat_block_size();
// Check if we have enough buffer space. If not, fail.
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
assert(false);
}
@@ -895,7 +895,7 @@ __device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::fcollect_gcen(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// GPU-centric implementation of fcollect collective
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
@@ -912,12 +912,12 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
int stride = 1 << log_pe_stride;
long *pSync = team_obj->alltoall_pSync;
long *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE];
long *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
int my_pe_in_team = team_obj->my_pe;
// Check if we have enough buffer space. If not, fail.
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
assert(false);
}
@@ -957,15 +957,15 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
if (is_thread_zero_in_block()) {
quiet();
if ((my_pe_in_team % clust_size) == 0) {
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1));
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
pSync2[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
for (int i = 1; i < clust_size; ++i)
put_nbi(&pSync2[0], &flag_val, 1,
team_obj->get_pe_in_world(my_pe_in_team + i));
} else {
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val);
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
pSync2[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
@@ -976,7 +976,7 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void GPUIBContext::fcollect_gcen2(roc_shmem_team_t team, T *dst,
__device__ void GPUIBContext::fcollect_gcen2(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
// GPU-centric implementation of fcollect collective
// Uses in-place blocking sync
@@ -998,7 +998,7 @@ __device__ void GPUIBContext::fcollect_gcen2(roc_shmem_team_t team, T *dst,
// Check if we have enough buffer space. If not, fail.
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
assert(false);
}
+4 -4
View File
@@ -93,13 +93,13 @@ __host__ void GPUIBHostContext::broadcast(
}
template <typename T>
__host__ void GPUIBHostContext::broadcast(roc_shmem_team_t team, T *dest,
__host__ void GPUIBHostContext::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems,
int pe_root) {
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce,
int pe_start, int log_pe_stride,
int pe_size, T *p_wrk,
@@ -108,8 +108,8 @@ __host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce,
pe_size, p_wrk, p_sync);
}
template <typename T, ROC_SHMEM_OP Op>
__host__ void GPUIBHostContext::to_all(roc_shmem_team_t team, T *dest,
template <typename T, ROCSHMEM_OP Op>
__host__ void GPUIBHostContext::to_all(rocshmem_team_t team, T *dest,
const T *source, int nreduce) {
host_interface->to_all<T, Op>(team, dest, source, nreduce);
}
+1 -1
View File
@@ -20,7 +20,7 @@
* IN THE SOFTWARE.
*****************************************************************************/
#include "roc_shmem/debug.hpp"
#include "rocshmem/debug.hpp"
#include "qe_dumper.hpp"
+2 -2
View File
@@ -31,11 +31,11 @@ namespace rocshmem {
DynamicConnection::DynamicConnection(GPUIBBackend* b) : Connection(b, 4) {
char* value = nullptr;
if ((value = getenv("ROC_SHMEM_NUM_DCIs"))) {
if ((value = getenv("ROCSHMEM_NUM_DCIs"))) {
num_dcis = atoi(value);
}
if ((value = getenv("ROC_SHMEM_NUM_DCT"))) {
if ((value = getenv("ROCSHMEM_NUM_DCT"))) {
num_dct = atoi(value);
}
}
+6 -6
View File
@@ -38,17 +38,17 @@ GPUIBTeam::GPUIBTeam(Backend *backend, TeamInfo *team_info_parent,
pool_index_ = pool_index;
barrier_pSync =
&(b->barrier_pSync_pool[pool_index * ROC_SHMEM_BARRIER_SYNC_SIZE]);
&(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
reduce_pSync =
&(b->reduce_pSync_pool[pool_index * ROC_SHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROC_SHMEM_BCAST_SYNC_SIZE]);
&(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
alltoall_pSync =
&(b->alltoall_pSync_pool[pool_index * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
&(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
pAta = reinterpret_cast<char *>(b->pAta_pool) +
ROC_SHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
}
GPUIBTeam::~GPUIBTeam() {}
+2 -2
View File
@@ -297,7 +297,7 @@ void NetworkOnImpl::setup_gpu_qps(GPUIBBackend *B) {
}
}
void NetworkOnImpl::roc_shmem_g_init(SymmetricHeap *heap_handle,
void NetworkOnImpl::rocshmem_g_init(SymmetricHeap *heap_handle,
MPI_Comm thread_comm) {
init_g_ret(heap_handle, thread_comm, num_blocks, &g_ret);
}
@@ -327,7 +327,7 @@ __host__ void NetworkOnImpl::networkHostSetup(GPUIBBackend *B) {
connection->initialize_gpu_policy(&connection_policy, heap_rkey);
roc_shmem_g_init(&B->heap, B->thread_comm);
rocshmem_g_init(&B->heap, B->thread_comm);
connection->post_wqes();
+2 -2
View File
@@ -27,7 +27,7 @@
#include <mpi.h>
#include "config.h" // NOLINT(build/include_subdir)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "connection_policy.hpp"
#include "queue_pair.hpp"
#include "../hdp_policy.hpp"
@@ -154,7 +154,7 @@ class NetworkOnImpl {
* @brief Allocate and initialize device-side memory that will be used for
* the return of g shmem ops (eg: shmem_int_g)
*/
void roc_shmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm);
void rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm);
/**
* @brief The backend delegates some InfiniBand connection setup to
+1 -1
View File
@@ -28,7 +28,7 @@
*
* @section DESCRIPTION
* An IB QueuePair (SQ and CQ) that the device can use to perform network
* operations. Most important ROC_SHMEM operations are performed by this
* operations. Most important rocSHMEM operations are performed by this
* class.
*/
+2 -2
View File
@@ -32,7 +32,7 @@ class QueuePair;
/*
* GPU single-thread policy class. Only a single work-item per work-group
* is allowed to call into a ROC_SHMEM function (unless it is specifically
* is allowed to call into a rocSHMEM function (unless it is specifically
* called out as a collective API. This thread policy is the fastest but
* is not as flexible.
*/
@@ -59,7 +59,7 @@ class SingleThreadImpl {
/*
* GPU multi-thread policy class. Multiple work-items per work-group are
* allowed to call into a ROC_SHMEM function. A bit slower than its
* allowed to call into a rocSHMEM function. A bit slower than its
* single-thread counterpart but it enables a much more flexible user-facing
* API.
*/
+3 -3
View File
@@ -83,13 +83,13 @@ int HostInterface::find_win_info_in_pool(WindowInfo* window_info) {
}
__host__ HostInterface::HostInterface(HdpPolicy* hdp_policy,
MPI_Comm roc_shmem_comm,
MPI_Comm rocshmem_comm,
SymmetricHeap* heap) {
/*
* Duplicate a communicator from roc_shem's comm
* world for the host interface
*/
MPI_Comm_dup(roc_shmem_comm, &host_comm_world_);
MPI_Comm_dup(rocshmem_comm, &host_comm_world_);
MPI_Comm_rank(host_comm_world_, &my_pe_);
MPI_Comm_rank(host_comm_world_, &num_pes_);
@@ -103,7 +103,7 @@ __host__ HostInterface::HostInterface(HdpPolicy* hdp_policy,
* Allocate and initialize pool of windows for contexts
*/
char* value{nullptr};
if ((value = getenv("ROC_SHMEM_MAX_NUM_HOST_CONTEXTS"))) {
if ((value = getenv("ROCSHMEM_MAX_NUM_HOST_CONTEXTS"))) {
max_num_ctxs_ = atoi(value);
}
+8 -8
View File
@@ -36,7 +36,7 @@
#include <map>
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../hdp_policy.hpp"
#include "../memory/symmetric_heap.hpp"
#include "../memory/window_info.hpp"
@@ -104,7 +104,7 @@ class HostInterface {
/**
* @brief Primary constructor
*/
__host__ HostInterface(HdpPolicy* hdp_policy, MPI_Comm roc_shmem_comm,
__host__ HostInterface(HdpPolicy* hdp_policy, MPI_Comm rocshmem_comm,
SymmetricHeap* heap);
/**
@@ -198,16 +198,16 @@ class HostInterface {
long* p_sync); // NOLINT(runtime/int)
template <typename T>
__host__ void broadcast(roc_shmem_team_t team, T* dest, const T* source,
__host__ void broadcast(rocshmem_team_t team, T* dest, const T* source,
int nelems, int pe_root);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(T* dest, const T* source, int nreduce, int pe_start,
int log_pe_stride, int pe_size, T* p_wrk,
long* p_sync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
__host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce);
template <typename T, ROCSHMEM_OP Op>
__host__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce);
template <typename T>
__host__ void wait_until(T *ivars, int cmp, T val,
@@ -288,7 +288,7 @@ class HostInterface {
__host__ MPI_Comm get_mpi_comm(int pe_start, int log_pe_stride, int pe_size);
__host__ MPI_Op get_mpi_op(ROC_SHMEM_OP Op);
__host__ MPI_Op get_mpi_op(ROCSHMEM_OP Op);
template <typename T>
__host__ MPI_Datatype get_mpi_type();
@@ -300,7 +300,7 @@ class HostInterface {
__host__ int test_and_compare(MPI_Aint offset, MPI_Datatype mpi_type,
int cmp, T val, MPI_Win win);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all_internal(MPI_Comm mpi_comm, T* dest, const T* source,
int nreduce);
+22 -22
View File
@@ -200,7 +200,7 @@ __host__ void HostInterface::broadcast(T* dest, const T* source, int nelems,
}
template <typename T>
__host__ void HostInterface::broadcast(roc_shmem_team_t team, T* dest,
__host__ void HostInterface::broadcast(rocshmem_team_t team, T* dest,
const T* source, int nelems,
int pe_root) {
DPRINTF("Function: Team-based host_broadcast\n");
@@ -216,24 +216,24 @@ __host__ void HostInterface::broadcast(roc_shmem_team_t team, T* dest,
return;
}
__host__ inline MPI_Op HostInterface::get_mpi_op(ROC_SHMEM_OP Op) {
__host__ inline MPI_Op HostInterface::get_mpi_op(ROCSHMEM_OP Op) {
switch (Op) {
case ROC_SHMEM_SUM:
case ROCSHMEM_SUM:
return MPI_SUM;
case ROC_SHMEM_MAX:
case ROCSHMEM_MAX:
return MPI_MAX;
case ROC_SHMEM_MIN:
case ROCSHMEM_MIN:
return MPI_MIN;
case ROC_SHMEM_PROD:
case ROCSHMEM_PROD:
return MPI_PROD;
case ROC_SHMEM_AND:
case ROCSHMEM_AND:
return MPI_BAND;
case ROC_SHMEM_OR:
case ROCSHMEM_OR:
return MPI_BOR;
case ROC_SHMEM_XOR:
case ROCSHMEM_XOR:
return MPI_BXOR;
default:
fprintf(stderr, "Unknown ROC_SHMEM op MPI conversion %d\n", Op);
fprintf(stderr, "Unknown rocSHMEM op MPI conversion %d\n", Op);
abort();
return 0;
}
@@ -330,7 +330,7 @@ __host__ T HostInterface::amo_fetch_cas(void* dst, T value, T cond, int pe,
return ret;
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void HostInterface::to_all_internal(MPI_Comm mpi_comm, T* dest,
const T* source, int nreduce) {
DPRINTF("Function: host_to_all_internal\n");
@@ -356,7 +356,7 @@ __host__ void HostInterface::to_all_internal(MPI_Comm mpi_comm, T* dest,
return;
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void HostInterface::to_all(T* dest, const T* source, int nreduce,
int pe_start, int log_pe_stride,
int pe_size, [[maybe_unused]] T* p_wrk,
@@ -375,8 +375,8 @@ __host__ void HostInterface::to_all(T* dest, const T* source, int nreduce,
return;
}
template <typename T, ROC_SHMEM_OP Op>
__host__ int HostInterface::reduce(roc_shmem_team_t team, T* dest,
template <typename T, ROCSHMEM_OP Op>
__host__ int HostInterface::reduce(rocshmem_team_t team, T* dest,
const T* source, int nreduce) {
DPRINTF("Function: Team-based host_reduce\n");
@@ -388,7 +388,7 @@ __host__ int HostInterface::reduce(roc_shmem_team_t team, T* dest,
to_all_internal<T, Op>(mpi_comm, dest, source, nreduce);
return ROC_SHMEM_SUCCESS;
return ROCSHMEM_SUCCESS;
}
template <typename T>
@@ -397,26 +397,26 @@ __host__ inline int HostInterface::compare(int cmp, T input_val,
int cond_satisfied{0};
switch (cmp) {
case ROC_SHMEM_CMP_EQ:
case ROCSHMEM_CMP_EQ:
cond_satisfied = (input_val == target_val) ? 1 : 0;
break;
case ROC_SHMEM_CMP_NE:
case ROCSHMEM_CMP_NE:
cond_satisfied = (input_val != target_val) ? 1 : 0;
break;
case ROC_SHMEM_CMP_GT:
case ROCSHMEM_CMP_GT:
cond_satisfied = (input_val > target_val) ? 1 : 0;
break;
case ROC_SHMEM_CMP_GE:
case ROCSHMEM_CMP_GE:
cond_satisfied = (input_val >= target_val) ? 1 : 0;
break;
case ROC_SHMEM_CMP_LT:
case ROCSHMEM_CMP_LT:
cond_satisfied = (input_val < target_val) ? 1 : 0;
break;
case ROC_SHMEM_CMP_LE:
case ROCSHMEM_CMP_LE:
cond_satisfied = (input_val <= target_val) ? 1 : 0;
break;
default:
assert(cmp >= ROC_SHMEM_CMP_EQ && cmp <= ROC_SHMEM_CMP_LE);
assert(cmp >= ROCSHMEM_CMP_EQ && cmp <= ROCSHMEM_CMP_LE);
break;
}
+40 -40
View File
@@ -33,10 +33,10 @@ namespace rocshmem {
} \
}
extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT;
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
roc_shmem_team_t get_external_team(GPUIBTeam *team) {
return reinterpret_cast<roc_shmem_team_t>(team);
rocshmem_team_t get_external_team(GPUIBTeam *team) {
return reinterpret_cast<rocshmem_team_t>(team);
}
int get_ls_non_zero_bit(char *bitmask, int mask_length) {
@@ -57,7 +57,7 @@ IPCBackend::IPCBackend(MPI_Comm comm)
: Backend() {
type = BackendType::IPC_BACKEND;
if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) {
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
std::stringstream sstream(maximum_num_contexts_str);
sstream >> maximum_num_contexts_;
}
@@ -82,7 +82,7 @@ IPCBackend::IPCBackend(MPI_Comm comm)
default_host_ctx = std::make_unique<IPCHostContext>(this, 0);
ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
init_g_ret(&heap, thread_comm, MAX_NUM_BLOCKS, &bp->g_ret);
@@ -92,7 +92,7 @@ IPCBackend::IPCBackend(MPI_Comm comm)
init_wrk_sync_buffer();
roc_shmem_collective_init();
rocshmem_collective_init();
setup_fence_buffer();
@@ -143,7 +143,7 @@ void IPCBackend::setup_ctxs() {
}
}
__device__ bool IPCBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
__device__ bool IPCBackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) {
IPCContext *ctx_{nullptr};
auto pop_result = ctx_free_list.get()->pop_front();
@@ -158,7 +158,7 @@ __device__ bool IPCBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
return true;
}
__device__ void IPCBackend::destroy_ctx(roc_shmem_ctx_t *ctx) {
__device__ void IPCBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
ctx_free_list.get()->push_back(static_cast<IPCContext *>(ctx->ctx_opaque));
}
@@ -182,9 +182,9 @@ void IPCBackend::setup_team_world() {
team_tracker.set_team_world(team_world);
/**
* Copy the address to ROC_SHMEM_TEAM_WORLD.
* Copy the address to ROCSHMEM_TEAM_WORLD.
*/
ROC_SHMEM_TEAM_WORLD = reinterpret_cast<roc_shmem_team_t>(team_world);
ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
}
void IPCBackend::init_mpi_once(MPI_Comm comm) {
@@ -205,7 +205,7 @@ void IPCBackend::init_mpi_once(MPI_Comm comm) {
NET_CHECK(MPI_Comm_rank(thread_comm, &my_pe));
}
void IPCBackend::team_destroy(roc_shmem_team_t team) {
void IPCBackend::team_destroy(rocshmem_team_t team) {
IPCTeam *team_obj = get_internal_ipc_team(team);
/* Mark the pool as available */
@@ -221,7 +221,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team,
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) {
rocshmem_team_t *new_team) {
/**
* Read the bit mask and find out a common index into
* the pool of available work arrays.
@@ -303,24 +303,24 @@ void IPCBackend::init_wrk_sync_buffer() {
/**
* size of barrier sync
*/
Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROC_SHMEM_BARRIER_SYNC_SIZE;
Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE;
/**
* Size of sync arrays for the teams
*/
Wrk_Sync_buffer_size_ += sizeof(long) * max_num_teams *
(ROC_SHMEM_BARRIER_SYNC_SIZE +
ROC_SHMEM_REDUCE_SYNC_SIZE +
ROC_SHMEM_BCAST_SYNC_SIZE +
ROC_SHMEM_ALLTOALL_SYNC_SIZE);
(ROCSHMEM_BARRIER_SYNC_SIZE +
ROCSHMEM_REDUCE_SYNC_SIZE +
ROCSHMEM_BCAST_SYNC_SIZE +
ROCSHMEM_ALLTOALL_SYNC_SIZE);
/**
* Size of work arrays for the teams
* Accommodate largest possible data type for pWrk
*/
Wrk_Sync_buffer_size_ += sizeof(double) * max_num_teams *
(ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE +
ROC_SHMEM_ATA_MAX_WRKDATA_SIZE);
(ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE +
ROCSHMEM_ATA_MAX_WRKDATA_SIZE);
/**
* Size of fence array
@@ -397,12 +397,12 @@ void IPCBackend::setup_fence_buffer() {
temp_Wrk_Sync_buff_ptr_ += sizeof(int) * num_pes;
}
void IPCBackend::roc_shmem_collective_init() {
void IPCBackend::rocshmem_collective_init() {
/*
* Allocate heap space for barrier_sync
*/
size_t one_sync_size_bytes{sizeof(*barrier_sync)};
size_t sync_size_bytes{one_sync_size_bytes * ROC_SHMEM_BARRIER_SYNC_SIZE};
size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
barrier_sync = reinterpret_cast<int64_t*>(temp_Wrk_Sync_buff_ptr_);
temp_Wrk_Sync_buff_ptr_ += sync_size_bytes;
@@ -410,7 +410,7 @@ void IPCBackend::roc_shmem_collective_init() {
* Initialize the barrier synchronization array with default values.
*/
for (int i = 0; i < num_pes; i++) {
barrier_sync[i] = ROC_SHMEM_SYNC_VALUE;
barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
}
/*
@@ -427,29 +427,29 @@ void IPCBackend::teams_init() {
auto max_num_teams{team_tracker.get_max_num_teams()};
barrier_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BARRIER_SYNC_SIZE
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE
* max_num_teams;
reduce_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_REDUCE_SYNC_SIZE
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE
* max_num_teams;
bcast_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
* max_num_teams;
alltoall_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
* max_num_teams;
/* Accommodating for largest possible data type for pWrk */
pWrk_pool = reinterpret_cast<void *>(temp_Wrk_Sync_buff_ptr_);
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
* max_num_teams;
pAta_pool = reinterpret_cast<void *>(temp_Wrk_Sync_buff_ptr_);
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROC_SHMEM_ATA_MAX_WRKDATA_SIZE
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE
* max_num_teams;
/**
@@ -458,25 +458,25 @@ void IPCBackend::teams_init() {
long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
for (int team_i = 0; team_i < max_num_teams; team_i++) {
barrier_pSync = reinterpret_cast<long *>(
&barrier_pSync_pool[team_i * ROC_SHMEM_BARRIER_SYNC_SIZE]);
&barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
reduce_pSync = reinterpret_cast<long *>(
&reduce_pSync_pool[team_i * ROC_SHMEM_REDUCE_SYNC_SIZE]);
&reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = reinterpret_cast<long *>(
&bcast_pSync_pool[team_i * ROC_SHMEM_BCAST_SYNC_SIZE]);
&bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
alltoall_pSync = reinterpret_cast<long *>(
&alltoall_pSync_pool[team_i * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
for (int i = 0; i < ROC_SHMEM_BARRIER_SYNC_SIZE; i++) {
barrier_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) {
reduce_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i++) {
bcast_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROC_SHMEM_ALLTOALL_SYNC_SIZE; i++) {
alltoall_pSync[i] = ROC_SHMEM_SYNC_VALUE;
for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
}
+7 -7
View File
@@ -48,13 +48,13 @@ class IPCBackend : public Backend {
*/
virtual ~IPCBackend();
__device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx);
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
/**
* @brief Destroy a `roc_shmem_ctx_t` context and returns it back to the
* @brief Destroy a `rocshmem_ctx_t` context and returns it back to the
* context free list.
*/
__device__ void destroy_ctx(roc_shmem_ctx_t *ctx);
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
/**
* @copydoc Backend::ctx_create
@@ -103,12 +103,12 @@ class IPCBackend : public Backend {
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) override;
rocshmem_team_t *new_team) override;
/**
* @copydoc Backend::team_destroy(roc_shmem_team_t)
* @copydoc Backend::team_destroy(rocshmem_team_t)
*/
void team_destroy(roc_shmem_team_t team) override;
void team_destroy(rocshmem_team_t team) override;
/**
* @brief Accessor for work/sync bases
@@ -213,7 +213,7 @@ class IPCBackend : public Backend {
* When this method completes, the barrier_sync member will be available
* for use.
*/
void roc_shmem_collective_init();
void rocshmem_collective_init();
/**
* @brief Allocate buffer for fence/quiet operation
+1 -1
View File
@@ -31,7 +31,7 @@
#include <cstdlib>
#include "config.h" // NOLINT(build/include_subdir)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "backend_ipc.hpp"
namespace rocshmem {
+10 -10
View File
@@ -63,7 +63,7 @@ class IPCContext : public Context {
__device__ void sync_all();
__device__ void sync(roc_shmem_team_t team);
__device__ void sync(rocshmem_team_t team);
template <typename T>
__device__ void p(T *dest, T value, int pe);
@@ -121,18 +121,18 @@ class IPCContext : public Context {
__device__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
// Collectives
template <typename T, ROC_SHMEM_OP Op>
__device__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce);
template <typename T, ROCSHMEM_OP Op>
__device__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce);
template <typename T>
__device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T>
__device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source,
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source,
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
int nelems);
@@ -211,11 +211,11 @@ class IPCContext : public Context {
int pe_root); // NOLINT(runtime/int)
template <typename T>
__device__ void fcollect_linear(roc_shmem_team_t team, T *dest,
__device__ void fcollect_linear(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void alltoall_linear(roc_shmem_team_t team, T *dest,
__device__ void alltoall_linear(rocshmem_team_t team, T *dest,
const T *source, int nelems);
__device__ void internal_sync(int pe, int PE_start, int stride, int PE_size,
@@ -227,10 +227,10 @@ class IPCContext : public Context {
__device__ void internal_atomic_barrier(int pe, int PE_start, int stride,
int n_pes, int64_t *pSync);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void internal_direct_allreduce(T *dst, const T *src,
int nelems, IPCTeam *team_obj);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void internal_ring_allreduce(T *dst, const T *src,
int nelems, IPCTeam *team_obj,
int n_seg, int seg_size, int chunk_size);
+10 -10
View File
@@ -20,7 +20,7 @@
* IN THE SOFTWARE.
*****************************************************************************/
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../context_incl.hpp"
#include "context_ipc_tmpl_device.hpp"
#include "../util.hpp"
@@ -39,8 +39,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
__threadfence_system();
#endif /* __gfx90a__ */
for (size_t i = 1; i < n_pes; i++) {
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val);
pSync[i] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
pSync[i] = ROCSHMEM_SYNC_VALUE;
}
threadfence_system();
@@ -58,8 +58,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
#if defined(__gfx90a__)
__threadfence_system();
#endif /* __gfx90a__ */
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
@@ -69,8 +69,8 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start,
int64_t *pSync) {
int64_t flag_val = 1;
if (pe == PE_start) {
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(n_pes - 1));
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
@@ -78,8 +78,8 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start,
}
} else {
amo_add<int64_t>(&pSync[0], flag_val, PE_start);
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
pSync[0] = ROC_SHMEM_SYNC_VALUE;
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
@@ -98,7 +98,7 @@ __device__ void IPCContext::internal_sync(int pe, int PE_start, int stride,
__syncthreads();
}
__device__ void IPCContext::sync(roc_shmem_team_t team) {
__device__ void IPCContext::sync(rocshmem_team_t team) {
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
int pe = team_obj->my_pe_in_world;
+4 -4
View File
@@ -86,16 +86,16 @@ class IPCHostContext : public Context {
long *p_sync);
template <typename T>
__host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
int log_pe_stride, int pe_size, T *p_wrk,
long *p_sync);
template <typename T, ROC_SHMEM_OP Op>
__host__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce);
template <typename T, ROCSHMEM_OP Op>
__host__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce);
template <typename T>
__host__ void wait_until(T *ivars, int cmp, T val);
+23 -23
View File
@@ -24,11 +24,11 @@
#define LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_
#include "config.h" // NOLINT(build/include_subdir)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "context_ipc_device.hpp"
#include "../util.hpp"
#include "ipc_team.hpp"
#include "../roc_shmem_calc.hpp"
#include "../rocshmem_calc.hpp"
namespace rocshmem {
@@ -153,7 +153,7 @@ __device__ T IPCContext::amo_fetch_cas(void *dest, T value, T cond, int pe) {
}
// Collectives
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
int wg_size) {
for (size_t i = wg_id; i < size; i += wg_size) {
@@ -162,7 +162,7 @@ __device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
__syncthreads();
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void IPCContext::internal_direct_allreduce(
T *dst, const T *src, int nelems, IPCTeam *team_obj) { // NOLINT(runtime/int)
@@ -203,7 +203,7 @@ __device__ void IPCContext::internal_direct_allreduce(
if (i != pe) {
// Wait for leader thread to see that the buffer is ready.
if (is_thread_zero_in_block()) {
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val);
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
}
__syncthreads();
@@ -216,7 +216,7 @@ __device__ void IPCContext::internal_direct_allreduce(
__syncthreads();
for (int i = wg_id; i < num_pes; i += wg_size) {
pSync[i] = ROC_SHMEM_SYNC_VALUE;
pSync[i] = ROCSHMEM_SYNC_VALUE;
}
threadfence_system();
__syncthreads();
@@ -278,7 +278,7 @@ __device__ void IPCContext::internal_direct_allreduce(
* [02+12+22+32] [02+12+22+32] [02+12+22+32] [02+12+22+32]
* [03+13+23+33] [03+13+23+33] [03+13+23+33] [03+13+23+33]
*/
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void IPCContext::internal_ring_allreduce(
T *dst, const T *src, int nelems, IPCTeam *team_obj, // NOLINT(runtime/int)
int n_seg, int seg_size, int chunk_size) {
@@ -323,7 +323,7 @@ __device__ void IPCContext::internal_ring_allreduce(
#if defined(__gfx90a__)
__threadfence_system();
#endif /* __gfx90a__ */
wait_until(&pSync[iter], ROC_SHMEM_CMP_EQ, wait_val);
wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
}
__syncthreads();
compute_reduce<T, Op>(&pWrk[off_recv], &dst[off_seg + off_recv],
@@ -344,7 +344,7 @@ __device__ void IPCContext::internal_ring_allreduce(
#if defined(__gfx90a__)
__threadfence_system();
#endif /* __gfx90a__ */
wait_until(&pSync[iter], ROC_SHMEM_CMP_EQ, wait_val);
wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
}
__syncthreads();
}
@@ -352,13 +352,13 @@ __device__ void IPCContext::internal_ring_allreduce(
__syncthreads();
for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
pSync[i] = ROC_SHMEM_SYNC_VALUE;
pSync[i] = ROCSHMEM_SYNC_VALUE;
}
__syncthreads();
}
template <typename T, ROC_SHMEM_OP Op>
__device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest,
template <typename T, ROCSHMEM_OP Op>
__device__ int IPCContext::reduce(rocshmem_team_t team, T *dest,
const T *source, int nreduce) {
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
@@ -367,14 +367,14 @@ __device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest,
size_t direct_pWrk = PE_size * nreduce;
size_t direct_pSync = PE_size;
size_t ring_pSync = 2 * PE_size;
size_t provided_pWrk = max(nreduce / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
size_t provided_pSync = ROC_SHMEM_REDUCE_SYNC_SIZE;
size_t provided_pWrk = max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE;
if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) {
internal_direct_allreduce<T, Op>(dest, source, nreduce, team_obj);
} else {
if (ring_pSync <= ROC_SHMEM_REDUCE_SYNC_SIZE) {
size_t ring_pWrk = ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE;
if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) {
size_t ring_pWrk = ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE;
// integer division truncating value
int chunk_size = ring_pWrk / PE_size;
int seg_size = chunk_size * PE_size;
@@ -410,10 +410,10 @@ __device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest,
}
} else {
GPU_DPRINTF("Unsupported reduction size for IPC conduit.\n");
return ROC_SHMEM_ERROR;
return ROCSHMEM_ERROR;
}
}
return ROC_SHMEM_SUCCESS;
return ROCSHMEM_SUCCESS;
}
template <typename T>
@@ -439,7 +439,7 @@ __device__ void IPCContext::internal_get_broadcast(
}
template <typename T>
__device__ void IPCContext::broadcast(roc_shmem_team_t team, T *dst,
__device__ void IPCContext::broadcast(rocshmem_team_t team, T *dst,
const T *src, int nelems, int pe_root) {
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
@@ -471,13 +471,13 @@ __device__ void IPCContext::internal_broadcast(T *dst, const T *src, int nelems,
}
template <typename T>
__device__ void IPCContext::alltoall(roc_shmem_team_t team, T *dst,
__device__ void IPCContext::alltoall(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
alltoall_linear(team, dst, src, nelems);
}
template <typename T>
__device__ void IPCContext::alltoall_linear(roc_shmem_team_t team, T *dst,
__device__ void IPCContext::alltoall_linear(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
@@ -500,13 +500,13 @@ __device__ void IPCContext::alltoall_linear(roc_shmem_team_t team, T *dst,
}
template <typename T>
__device__ void IPCContext::fcollect(roc_shmem_team_t team, T *dst,
__device__ void IPCContext::fcollect(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
fcollect_linear(team, dst, src, nelems);
}
template <typename T>
__device__ void IPCContext::fcollect_linear(roc_shmem_team_t team, T *dst,
__device__ void IPCContext::fcollect_linear(rocshmem_team_t team, T *dst,
const T *src, int nelems) {
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
+4 -4
View File
@@ -93,13 +93,13 @@ __host__ void IPCHostContext::broadcast(
}
template <typename T>
__host__ void IPCHostContext::broadcast(roc_shmem_team_t team, T *dest,
__host__ void IPCHostContext::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems,
int pe_root) {
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce,
int pe_start, int log_pe_stride,
int pe_size, T *p_wrk,
@@ -108,8 +108,8 @@ __host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce,
pe_size, p_wrk, p_sync);
}
template <typename T, ROC_SHMEM_OP Op>
__host__ int IPCHostContext::reduce(roc_shmem_team_t team, T *dest,
template <typename T, ROCSHMEM_OP Op>
__host__ int IPCHostContext::reduce(rocshmem_team_t team, T *dest,
const T *source, int nreduce) {
return host_interface->reduce<T, Op>(team, dest, source, nreduce);
}
+2 -2
View File
@@ -46,7 +46,7 @@ class IPCDefaultContextProxy {
auto ctx{proxy_.get()};
new (ctx) IPCContext(reinterpret_cast<Backend*>(backend));
ctx->tinfo = tinfo;
roc_shmem_ctx_t local{ctx, tinfo};
rocshmem_ctx_t local{ctx, tinfo};
set_internal_ctx(&local);
}
@@ -89,4 +89,4 @@ using IPCDefaultContextProxyT = IPCDefaultContextProxy<HIPAllocator>;
} // namespace rocshmem
#endif // LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_
#endif // LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_
+6 -6
View File
@@ -38,17 +38,17 @@ IPCTeam::IPCTeam(Backend *backend, TeamInfo *team_info_parent,
pool_index_ = pool_index;
barrier_pSync =
&(b->barrier_pSync_pool[pool_index * ROC_SHMEM_BARRIER_SYNC_SIZE]);
&(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
reduce_pSync =
&(b->reduce_pSync_pool[pool_index * ROC_SHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROC_SHMEM_BCAST_SYNC_SIZE]);
&(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
alltoall_pSync =
&(b->alltoall_pSync_pool[pool_index * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
&(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
pAta = reinterpret_cast<char *>(b->pAta_pool) +
ROC_SHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
}
IPCTeam::~IPCTeam() {}
+1 -1
View File
@@ -123,7 +123,7 @@ class IpcOffImpl {
__device__ bool isIpcAvailable(int my_pe, int target_pe) { return false; }
__device__ void ipcGpuInit(Backend *roc_shmem_handle, Context *ctx,
__device__ void ipcGpuInit(Backend *rocshmem_handle, Context *ctx,
int thread_id) {}
__device__ void ipcCopy(void *dst, void *src, size_t size) {}
+1 -1
View File
@@ -27,7 +27,7 @@
namespace rocshmem {
SingleHeap::SingleHeap() {
if (auto heap_size_cstr = getenv("ROC_SHMEM_HEAP_SIZE")) {
if (auto heap_size_cstr = getenv("ROCSHMEM_HEAP_SIZE")) {
std::stringstream sstream(heap_size_cstr);
size_t heap_size;
sstream >> heap_size;
+1 -1
View File
@@ -29,7 +29,7 @@
namespace rocshmem {
SlabHeap::SlabHeap() {
if (auto slab_size_cstr = getenv("ROC_SHMEM_SLAB_SIZE")) {
if (auto slab_size_cstr = getenv("ROCSHMEM_SLAB_SIZE")) {
std::stringstream sstream(slab_size_cstr);
size_t slab_size;
sstream >> slab_size;
+1 -1
View File
@@ -81,7 +81,7 @@ class MPIInitSingleton {
int nprocs_{-1};
/**
* @brief Was MPI initialized before ROCSHMEM_init call
* @brief Was MPI initialized before rocshmem_init call
*/
int pre_init_done{0};
+10 -10
View File
@@ -32,7 +32,7 @@
#include <memory>
#include <thread> // NOLINT
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../atomic_return.hpp"
#include "../backend_type.hpp"
#include "../context_incl.hpp"
@@ -42,13 +42,13 @@
namespace rocshmem {
extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT;
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
ROBackend::ROBackend(MPI_Comm comm)
: profiler_proxy_(MAX_NUM_BLOCKS), Backend() {
type = BackendType::RO_BACKEND;
if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) {
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
std::stringstream sstream(maximum_num_contexts_str);
sstream >> maximum_num_contexts_;
}
@@ -83,14 +83,14 @@ ROBackend::ROBackend(MPI_Comm comm)
default_host_ctx = std::make_unique<ROHostContext>(this, 0);
ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
team_world_proxy_ = new ROTeamProxy<HIPAllocator>(
this, transport_->get_world_comm(), my_pe, num_pes);
team_tracker.set_team_world(team_world_proxy_->get());
ROC_SHMEM_TEAM_WORLD =
reinterpret_cast<roc_shmem_team_t>(team_world_proxy_->get());
ROCSHMEM_TEAM_WORLD =
reinterpret_cast<rocshmem_team_t>(team_world_proxy_->get());
default_block_handle_proxy_ = DefaultBlockHandleProxyT(
bp->g_ret, bp->atomic_ret, &queue_, &ipcImpl, hdp_proxy_.get());
@@ -120,7 +120,7 @@ ROBackend::~ROBackend() {
CHECK_HIP(hipFree(ctx_array));
}
__device__ bool ROBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
__device__ bool ROBackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) {
ROContext *ctx_;
auto pop_result = ctx_free_list.get()->pop_front();
@@ -133,11 +133,11 @@ __device__ bool ROBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
return true;
}
__device__ void ROBackend::destroy_ctx(roc_shmem_ctx_t *ctx) {
__device__ void ROBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
ctx_free_list.get()->push_back(static_cast<ROContext *>(ctx->ctx_opaque));
}
void ROBackend::team_destroy(roc_shmem_team_t team) {
void ROBackend::team_destroy(rocshmem_team_t team) {
ROTeam *team_obj{get_internal_ro_team(team)};
team_obj->~ROTeam();
@@ -148,7 +148,7 @@ void ROBackend::create_new_team(Team *parent_team,
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) {
rocshmem_team_t *new_team) {
transport_->createNewTeam(this, parent_team, team_info_wrt_parent,
team_info_wrt_world, num_pes, my_pe_in_new_team,
team_comm, new_team);
+6 -6
View File
@@ -85,20 +85,20 @@ class ROBackend : public Backend {
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) override;
rocshmem_team_t *new_team) override;
/**
* @copydoc Backend::team_destroy(roc_shmem_team_t)
* @copydoc Backend::team_destroy(rocshmem_team_t)
*/
void team_destroy(roc_shmem_team_t team) override;
void team_destroy(rocshmem_team_t team) override;
__device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx);
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
/**
* @brief Destroy a `roc_shmem_ctx_t` context and returns it back to the
* @brief Destroy a `rocshmem_ctx_t` context and returns it back to the
* context free list.
*/
__device__ void destroy_ctx(roc_shmem_ctx_t *ctx);
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
/**
* @copydoc Backend::ctx_create
+2 -2
View File
@@ -23,7 +23,7 @@
#ifndef LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_
#define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../device_proxy.hpp"
#include "../memory/hip_allocator.hpp"
#include "context_ro_device.hpp"
@@ -46,7 +46,7 @@ class DefaultContextProxy {
: constructed_{true} {
auto ctx{proxy_.get()};
new (ctx) ROContext(reinterpret_cast<Backend*>(backend), -1);
roc_shmem_ctx_t local{ctx, tinfo};
rocshmem_ctx_t local{ctx, tinfo};
set_internal_ctx(&local);
}
+3 -3
View File
@@ -30,7 +30,7 @@
#include <cstdlib>
#include "config.h" // NOLINT(build/include_subdir)
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "../backend_type.hpp"
#include "../hdp_policy.hpp"
#include "backend_proxy.hpp"
@@ -176,7 +176,7 @@ __device__ void ROContext::sync_all() {
__syncthreads();
}
__device__ void ROContext::sync(roc_shmem_team_t team) {
__device__ void ROContext::sync(rocshmem_team_t team) {
ROTeam *team_obj = reinterpret_cast<ROTeam *>(team);
if (is_thread_zero_in_block()) {
build_queue_element(RO_NET_SYNC, nullptr, nullptr, 0, 0, 0, 0, 0, nullptr,
@@ -472,7 +472,7 @@ __device__ void build_queue_element(
ro_net_cmds type, void *dst, void *src, size_t size, int pe,
int logPE_stride, int PE_size, int PE_root, void *pWrk, long *pSync,
MPI_Comm team_comm, int ro_net_win_id, BlockHandle *handle,
bool blocking, ROC_SHMEM_OP op, ro_net_types datatype) {
bool blocking, ROCSHMEM_OP op, ro_net_types datatype) {
auto write_slot{next_write_slot(handle)};
auto queue_element = &handle->queue[write_slot];
+16 -16
View File
@@ -34,7 +34,7 @@ __device__ void build_queue_element(
ro_net_cmds type, void *dst, void *src, size_t size, int pe,
int logPE_stride, int PE_size, int PE_root, void *pWrk, long *pSync,
MPI_Comm team_comm, int ro_net_win_id, BlockHandle *handle,
bool blocking, ROC_SHMEM_OP op = ROC_SHMEM_SUM,
bool blocking, ROCSHMEM_OP op = ROCSHMEM_SUM,
ro_net_types datatype = RO_NET_INT);
class ROContext : public Context {
@@ -67,7 +67,7 @@ class ROContext : public Context {
__device__ void sync_all();
__device__ void sync(roc_shmem_team_t team);
__device__ void sync(rocshmem_team_t team);
template <typename T>
__device__ void p(T *dest, T value, int pe);
@@ -75,13 +75,13 @@ class ROContext : public Context {
template <typename T>
__device__ T g(const T *source, int pe);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void to_all(T *dest, const T *source, int nreduce, int PE_start,
int logPE_stride, int PE_size, T *pWrk,
long *pSync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
__device__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
template <typename T, ROCSHMEM_OP Op>
__device__ void to_all(rocshmem_team_t team, T *dest, const T *source,
int nreduce);
template <typename T>
@@ -133,7 +133,7 @@ class ROContext : public Context {
__device__ void amo_xor(void *dst, T value, int pe);
template <typename T>
__device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T>
@@ -142,43 +142,43 @@ class ROContext : public Context {
long *p_sync); // NOLINT(runtime/int)
template <typename T>
__device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source,
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void alltoall_broadcast(roc_shmem_team_t team, T *dest,
__device__ void alltoall_broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void alltoall_mpi(roc_shmem_team_t team, T *dest, const T *source,
__device__ void alltoall_mpi(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void alltoall_gcen(roc_shmem_team_t team, T *dest, const T *source,
__device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void alltoall_gcen2(roc_shmem_team_t team, T *dest,
__device__ void alltoall_gcen2(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source,
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect_broadcast(roc_shmem_team_t team, T *dest,
__device__ void fcollect_broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect_mpi(roc_shmem_team_t team, T *dest, const T *source,
__device__ void fcollect_mpi(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect_gcen(roc_shmem_team_t team, T *dest, const T *source,
__device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect_gcen2(roc_shmem_team_t team, T *dest,
__device__ void fcollect_gcen2(rocshmem_team_t team, T *dest,
const T *source, int nelems);
__device__ void putmem_wg(void *dest, const void *source, size_t nelems,
+4 -4
View File
@@ -135,16 +135,16 @@ class ROHostContext : public Context {
long *p_sync); // NOLINT(runtime/int)
template <typename T>
__host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
int log_pe_stride, int pe_size, T *p_wrk,
long *p_sync); // NOLINT(runtime/int)
template <typename T, ROC_SHMEM_OP Op>
__host__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(rocshmem_team_t team, T *dest, const T *source,
int nreduce);
template <typename T>
+16 -16
View File
@@ -108,8 +108,8 @@ struct GetROType<long double> {
********************************* DEVICE API *********************************
*****************************************************************************/
template <typename T, ROC_SHMEM_OP Op>
__device__ void ROContext::to_all(roc_shmem_team_t team, T *dest,
template <typename T, ROCSHMEM_OP Op>
__device__ void ROContext::to_all(rocshmem_team_t team, T *dest,
const T *source, int nreduce) {
if (!is_thread_zero_in_block()) {
__syncthreads();
@@ -125,7 +125,7 @@ __device__ void ROContext::to_all(roc_shmem_team_t team, T *dest,
__syncthreads();
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__device__ void ROContext::to_all(T *dest, const T *source, int nreduce,
int PE_start, int logPE_stride, int PE_size,
T *pWrk, long *pSync) {
@@ -210,7 +210,7 @@ __device__ T ROContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
value, pe, 0, 0, 0,
reinterpret_cast<void *>(static_cast<long long>(cond)),
nullptr, (MPI_Comm)NULL, ro_net_win_id, block_handle, true,
ROC_SHMEM_SUM, GetROType<T>::Type);
ROCSHMEM_SUM, GetROType<T>::Type);
__threadfence();
return *source;
}
@@ -225,7 +225,7 @@ __device__ T ROContext::amo_fetch_add(void *dst, T value, int pe) {
auto source{get_unused_atomic()};
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<T *>(source), value,
pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
ro_net_win_id, block_handle, true, ROC_SHMEM_SUM,
ro_net_win_id, block_handle, true, ROCSHMEM_SUM,
GetROType<T>::Type);
__threadfence();
return *source;
@@ -241,7 +241,7 @@ __device__ T ROContext::amo_swap(void *dst, T value, int pe) {
auto source{get_unused_atomic()};
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
ro_net_win_id, block_handle, true, ROC_SHMEM_REPLACE,
ro_net_win_id, block_handle, true, ROCSHMEM_REPLACE,
GetROType<T>::Type);
__threadfence();
return *source;
@@ -257,7 +257,7 @@ __device__ T ROContext::amo_fetch_and(void *dst, T value, int pe) {
auto source{get_unused_atomic()};
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
ro_net_win_id, block_handle, true, ROC_SHMEM_AND,
ro_net_win_id, block_handle, true, ROCSHMEM_AND,
GetROType<T>::Type);
__threadfence();
return *source;
@@ -273,7 +273,7 @@ __device__ T ROContext::amo_fetch_or(void *dst, T value, int pe) {
auto source{get_unused_atomic()};
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
ro_net_win_id, block_handle, true, ROC_SHMEM_OR,
ro_net_win_id, block_handle, true, ROCSHMEM_OR,
GetROType<T>::Type);
__threadfence();
return *source;
@@ -289,7 +289,7 @@ __device__ T ROContext::amo_fetch_xor(void *dst, T value, int pe) {
auto source{get_unused_atomic()};
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
ro_net_win_id, block_handle, true, ROC_SHMEM_XOR,
ro_net_win_id, block_handle, true, ROCSHMEM_XOR,
GetROType<T>::Type);
__threadfence();
return *source;
@@ -301,7 +301,7 @@ __device__ void ROContext::amo_xor(void *dst, T value, int pe) {
}
template <typename T>
__device__ void ROContext::broadcast(roc_shmem_team_t team, T *dest,
__device__ void ROContext::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems, int pe_root) {
if (!is_thread_zero_in_block()) {
__syncthreads();
@@ -313,7 +313,7 @@ __device__ void ROContext::broadcast(roc_shmem_team_t team, T *dest,
build_queue_element(RO_NET_TEAM_BROADCAST, dest, const_cast<T *>(source),
nelems, 0, 0, 0, pe_root, nullptr, nullptr,
team_obj->mpi_comm, ro_net_win_id, block_handle, true,
ROC_SHMEM_SUM, GetROType<T>::Type);
ROCSHMEM_SUM, GetROType<T>::Type);
__syncthreads();
}
@@ -331,13 +331,13 @@ __device__ void ROContext::broadcast(T *dest, const T *source, int nelems,
build_queue_element(RO_NET_BROADCAST, dest, const_cast<T *>(source), nelems,
pe_start, log_pe_stride, pe_size, pe_root, nullptr,
p_sync, (MPI_Comm)NULL, ro_net_win_id, block_handle, true,
ROC_SHMEM_SUM, GetROType<T>::Type);
ROCSHMEM_SUM, GetROType<T>::Type);
__syncthreads();
}
template <typename T>
__device__ void ROContext::alltoall(roc_shmem_team_t team, T *dest,
__device__ void ROContext::alltoall(rocshmem_team_t team, T *dest,
const T *source, int nelems) {
if (!is_thread_zero_in_block()) {
__syncthreads();
@@ -349,13 +349,13 @@ __device__ void ROContext::alltoall(roc_shmem_team_t team, T *dest,
build_queue_element(RO_NET_ALLTOALL, dest, const_cast<T *>(source), nelems, 0,
0, 0, 0, team_obj->ata_buffer, nullptr,
team_obj->mpi_comm, ro_net_win_id, block_handle, true,
ROC_SHMEM_SUM, GetROType<T>::Type);
ROCSHMEM_SUM, GetROType<T>::Type);
__syncthreads();
}
template <typename T>
__device__ void ROContext::fcollect(roc_shmem_team_t team, T *dest,
__device__ void ROContext::fcollect(rocshmem_team_t team, T *dest,
const T *source, int nelems) {
if (!is_thread_zero_in_block()) {
__syncthreads();
@@ -367,7 +367,7 @@ __device__ void ROContext::fcollect(roc_shmem_team_t team, T *dest,
build_queue_element(RO_NET_FCOLLECT, dest, const_cast<T *>(source), nelems, 0,
0, 0, 0, team_obj->ata_buffer, nullptr,
team_obj->mpi_comm, ro_net_win_id, block_handle, true,
ROC_SHMEM_SUM, GetROType<T>::Type);
ROCSHMEM_SUM, GetROType<T>::Type);
__syncthreads();
}
+4 -4
View File
@@ -114,7 +114,7 @@ __host__ void ROHostContext::broadcast(T *dest, const T *source, int nelems,
}
template <typename T>
__host__ void ROHostContext::broadcast(roc_shmem_team_t team, T *dest,
__host__ void ROHostContext::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems,
int pe_root) {
DPRINTF("Function: Team-based ro_net_host_broadcast\n");
@@ -122,7 +122,7 @@ __host__ void ROHostContext::broadcast(roc_shmem_team_t team, T *dest,
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
}
template <typename T, ROC_SHMEM_OP Op>
template <typename T, ROCSHMEM_OP Op>
__host__ void ROHostContext::to_all(T *dest, const T *source, int nreduce,
int pe_start, int log_pe_stride,
int pe_size, T *p_wrk, long *p_sync) {
@@ -132,8 +132,8 @@ __host__ void ROHostContext::to_all(T *dest, const T *source, int nreduce,
pe_size, p_wrk, p_sync);
}
template <typename T, ROC_SHMEM_OP Op>
__host__ void ROHostContext::to_all(roc_shmem_team_t team, T *dest,
template <typename T, ROCSHMEM_OP Op>
__host__ void ROHostContext::to_all(rocshmem_team_t team, T *dest,
const T *source, int nreduce) {
DPRINTF("Function: Team-based ro_net_host_to_all\n");
+20 -20
View File
@@ -142,7 +142,7 @@ void MPITransport::submitRequestsToMPI() {
const_cast<unsigned long long *>(&next_element.ol1.atomic_value),
next_element.PE, next_element.ro_net_win_id, queue_idx,
next_element.threadId, true,
static_cast<ROC_SHMEM_OP>(next_element.op),
static_cast<ROCSHMEM_OP>(next_element.op),
static_cast<ro_net_types>(next_element.datatype));
DPRINTF("Received AMO dst %p src %p Val %llu pe %d\n", next_element.dst,
next_element.src, next_element.ol1.atomic_value, next_element.PE);
@@ -163,7 +163,7 @@ void MPITransport::submitRequestsToMPI() {
team_reduction(next_element.dst, next_element.src, next_element.ol1.size,
next_element.ro_net_win_id, queue_idx,
next_element.team_comm,
static_cast<ROC_SHMEM_OP>(next_element.op),
static_cast<ROCSHMEM_OP>(next_element.op),
static_cast<ro_net_types>(next_element.datatype),
next_element.threadId, true);
DPRINTF("Received FLOAT_SUM_TEAM_TO_ALL dst %p src %p size %lu team %d\n",
@@ -175,7 +175,7 @@ void MPITransport::submitRequestsToMPI() {
next_element.PE, next_element.ro_net_win_id, queue_idx,
next_element.PE, next_element.logPE_stride,
next_element.PE_size, next_element.ol2.pWrk, next_element.pSync,
static_cast<ROC_SHMEM_OP>(next_element.op),
static_cast<ROCSHMEM_OP>(next_element.op),
static_cast<ro_net_types>(next_element.datatype),
next_element.threadId, true);
DPRINTF(
@@ -275,15 +275,15 @@ void MPITransport::finalizeTransport() {
delete host_interface;
}
roc_shmem_team_t get_external_team(ROTeam *team) {
return reinterpret_cast<roc_shmem_team_t>(team);
rocshmem_team_t get_external_team(ROTeam *team) {
return reinterpret_cast<rocshmem_team_t>(team);
}
void MPITransport::createNewTeam(ROBackend *backend, Team *parent_team,
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) {
rocshmem_team_t *new_team) {
ROTeam *new_team_obj{nullptr};
CHECK_HIP(hipMalloc(&new_team_obj, sizeof(ROTeam)));
@@ -342,26 +342,26 @@ void MPITransport::barrier(int blockId, int threadId, bool blocking,
outstanding[blockId]++;
}
MPI_Op MPITransport::get_mpi_op(ROC_SHMEM_OP op) {
MPI_Op MPITransport::get_mpi_op(ROCSHMEM_OP op) {
switch (op) {
case ROC_SHMEM_SUM:
case ROCSHMEM_SUM:
return MPI_SUM;
case ROC_SHMEM_MAX:
case ROCSHMEM_MAX:
return MPI_MAX;
case ROC_SHMEM_MIN:
case ROCSHMEM_MIN:
return MPI_MIN;
case ROC_SHMEM_PROD:
case ROCSHMEM_PROD:
return MPI_PROD;
case ROC_SHMEM_AND:
case ROCSHMEM_AND:
return MPI_BAND;
case ROC_SHMEM_OR:
case ROCSHMEM_OR:
return MPI_BOR;
case ROC_SHMEM_XOR:
case ROCSHMEM_XOR:
return MPI_BXOR;
case ROC_SHMEM_REPLACE:
case ROCSHMEM_REPLACE:
return MPI_REPLACE;
default:
fprintf(stderr, "Unknown ROC_SHMEM op MPI conversion %d\n", op);
fprintf(stderr, "Unknown rocSHMEM op MPI conversion %d\n", op);
abort();
}
}
@@ -383,7 +383,7 @@ static MPI_Datatype convertType(ro_net_types type) {
case RO_NET_LONG_DOUBLE:
return MPI_LONG_DOUBLE;
default:
fprintf(stderr, "Unknown ROC_SHMEM type MPI conversion %d\n", type);
fprintf(stderr, "Unknown rocSHMEM type MPI conversion %d\n", type);
abort();
}
}
@@ -391,7 +391,7 @@ static MPI_Datatype convertType(ro_net_types type) {
void MPITransport::reduction(void *dst, void *src, int size, int pe,
int win_id, int blockId, int start, int logPstride,
int sizePE, void *pWrk, long *pSync,
ROC_SHMEM_OP op, ro_net_types type, int threadId,
ROCSHMEM_OP op, ro_net_types type, int threadId,
bool blocking) {
MPI_Request request{};
MPI_Op mpi_op{get_mpi_op(op)};
@@ -435,7 +435,7 @@ void MPITransport::broadcast(void *dst, void *src, int size, int pe,
}
void MPITransport::team_reduction(void *dst, void *src, int size, int win_id,
int blockId, MPI_Comm team, ROC_SHMEM_OP op,
int blockId, MPI_Comm team, ROCSHMEM_OP op,
ro_net_types type, int threadId,
bool blocking) {
MPI_Request request{};
@@ -1046,7 +1046,7 @@ void MPITransport::putMem(void *dst, void *src, int size, int pe, int win_id,
void MPITransport::amoFOP(void *dst, void *src, void *val, int pe, int win_id,
int blockId, int threadId, bool blocking,
ROC_SHMEM_OP op, ro_net_types type) {
ROCSHMEM_OP op, ro_net_types type) {
queue->flush_hdp();
auto *bp{backend_proxy->get()};
+5 -5
View File
@@ -49,18 +49,18 @@ class MPITransport : public Transport {
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) override;
rocshmem_team_t *new_team) override;
void barrier(int blockId, int threadId, bool blocking,
MPI_Comm team) override;
void reduction(void *dst, void *src, int size, int pe, int win_id,
int blockId, int start, int logPstride, int sizePE, void *pWrk,
long *pSync, ROC_SHMEM_OP op, ro_net_types type,
long *pSync, ROCSHMEM_OP op, ro_net_types type,
int threadId, bool blocking) override;
void team_reduction(void *dst, void *src, int size, int win_id, int blockId,
MPI_Comm team, ROC_SHMEM_OP op, ro_net_types type,
MPI_Comm team, ROCSHMEM_OP op, ro_net_types type,
int threadId, bool blocking) override;
void broadcast(void *dst, void *src, int size, int pe, int win_id,
@@ -116,7 +116,7 @@ class MPITransport : public Transport {
int threadId, bool blocking, bool inline_data = false) override;
void amoFOP(void *dst, void *src, void *val, int pe, int win_id, int blockId,
int threadId, bool blocking, ROC_SHMEM_OP op,
int threadId, bool blocking, ROCSHMEM_OP op,
ro_net_types type) override;
void amoFCAS(void *dst, void *src, void *val, int pe, int win_id, int blockId,
@@ -193,7 +193,7 @@ class MPITransport : public Transport {
void submitRequestsToMPI();
MPI_Op get_mpi_op(ROC_SHMEM_OP op);
MPI_Op get_mpi_op(ROCSHMEM_OP op);
Queue *queue{nullptr};
+5 -5
View File
@@ -27,7 +27,7 @@
#include <cassert>
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "backend_proxy.hpp"
#include "ro_net_team.hpp"
@@ -48,18 +48,18 @@ class Transport {
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
roc_shmem_team_t *new_team) = 0;
rocshmem_team_t *new_team) = 0;
virtual void barrier(int wg_id, int threadId, bool blocking,
MPI_Comm team) = 0;
virtual void reduction(void *dst, void *src, int size, int pe, int win_id,
int wg_id, int start, int logPstride, int sizePE,
void *pWrk, long *pSync, ROC_SHMEM_OP op,
void *pWrk, long *pSync, ROCSHMEM_OP op,
ro_net_types type, int threadId, bool blocking) = 0;
virtual void team_reduction(void *dst, void *src, int size, int win_id,
int wg_id, MPI_Comm team, ROC_SHMEM_OP op,
int wg_id, MPI_Comm team, ROCSHMEM_OP op,
ro_net_types type, int threadId,
bool blocking) = 0;
@@ -89,7 +89,7 @@ class Transport {
int wg_id, int threadId, bool blocking) = 0;
virtual void amoFOP(void *dst, void *src, void *val, int pe, int win_id,
int wg_id, int threadId, bool blocking, ROC_SHMEM_OP op,
int wg_id, int threadId, bool blocking, ROCSHMEM_OP op,
ro_net_types type) = 0;
virtual void amoFCAS(void *dst, void *src, void *val, int pe, int win_id,
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -20,14 +20,14 @@
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_ROC_SHMEM_CALC_HPP_
#define LIBRARY_SRC_ROC_SHMEM_CALC_HPP_
#ifndef LIBRARY_SRC_ROCSHMEM_CALC_HPP_
#define LIBRARY_SRC_ROCSHMEM_CALC_HPP_
namespace rocshmem {
// clang-format off
NOWARN(-Wunused-parameter,
template <ROC_SHMEM_OP Op>
template <ROCSHMEM_OP Op>
struct OpWrap {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
@@ -41,7 +41,7 @@ struct OpWrap {
************************** TEMPLATE SPECIALIZATIONS **************************
*****************************************************************************/
template <>
struct OpWrap<ROC_SHMEM_SUM> {
struct OpWrap<ROCSHMEM_SUM> {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
dst[i] += src[i];
@@ -49,7 +49,7 @@ struct OpWrap<ROC_SHMEM_SUM> {
};
template <>
struct OpWrap<ROC_SHMEM_MAX> {
struct OpWrap<ROCSHMEM_MAX> {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
dst[i] = max(dst[i], src[i]);
@@ -57,7 +57,7 @@ struct OpWrap<ROC_SHMEM_MAX> {
};
template <>
struct OpWrap<ROC_SHMEM_MIN> {
struct OpWrap<ROCSHMEM_MIN> {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
dst[i] = min(dst[i], src[i]);
@@ -65,7 +65,7 @@ struct OpWrap<ROC_SHMEM_MIN> {
};
template <>
struct OpWrap<ROC_SHMEM_PROD> {
struct OpWrap<ROCSHMEM_PROD> {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
dst[i] *= src[i];
@@ -73,7 +73,7 @@ struct OpWrap<ROC_SHMEM_PROD> {
};
template <>
struct OpWrap<ROC_SHMEM_AND> {
struct OpWrap<ROCSHMEM_AND> {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
dst[i] &= src[i];
@@ -81,7 +81,7 @@ struct OpWrap<ROC_SHMEM_AND> {
};
template <>
struct OpWrap<ROC_SHMEM_OR> {
struct OpWrap<ROCSHMEM_OR> {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
dst[i] |= src[i];
@@ -89,7 +89,7 @@ struct OpWrap<ROC_SHMEM_OR> {
};
template <>
struct OpWrap<ROC_SHMEM_XOR> {
struct OpWrap<ROCSHMEM_XOR> {
template <typename T>
__device__ static void Calc(T *src, T *dst, int i) {
dst[i] ^= src[i];
@@ -97,4 +97,4 @@ struct OpWrap<ROC_SHMEM_XOR> {
};
}
#endif // LIBRARY_SRC_ROC_SHMEM_CALC_HPP_
#endif // LIBRARY_SRC_ROCSHMEM_CALC_HPP_
+1540
View File
File diff suppressed because it is too large Load Diff
+5 -5
View File
@@ -27,12 +27,12 @@
#include <atomic>
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "util.hpp"
namespace rocshmem {
enum roc_shmem_stats {
enum rocshmem_stats {
NUM_PUT = 0,
NUM_PUT_NBI,
NUM_P,
@@ -85,7 +85,7 @@ enum roc_shmem_stats {
NUM_STATS
};
enum roc_shmem_host_stats {
enum rocshmem_host_stats {
NUM_HOST_PUT = 0,
NUM_HOST_PUT_NBI,
NUM_HOST_P,
@@ -135,10 +135,10 @@ class Stats {
StatType stats[I] = {0};
public:
__device__ uint64_t startTimer() const { return roc_shmem_timer(); }
__device__ uint64_t startTimer() const { return rocshmem_timer(); }
__device__ void endTimer(uint64_t start, int index) {
incStat(index, roc_shmem_timer() - start);
incStat(index, rocshmem_timer() - start);
}
__device__ void incStat(int index, int value = 1) {
+10 -10
View File
@@ -24,34 +24,34 @@
#include <cmath>
#include "roc_shmem/roc_shmem.hpp"
#include "rocshmem/rocshmem.hpp"
#include "backend_bc.hpp"
#include "util.hpp"
namespace rocshmem {
roc_shmem_team_t ROC_SHMEM_TEAM_WORLD;
rocshmem_team_t ROCSHMEM_TEAM_WORLD;
__host__ __device__ Team* get_internal_team(roc_shmem_team_t team) {
__host__ __device__ Team* get_internal_team(rocshmem_team_t team) {
return reinterpret_cast<Team*>(team);
}
GPUIBTeam* get_internal_gpu_ib_team(roc_shmem_team_t team) {
GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team) {
return reinterpret_cast<GPUIBTeam*>(team);
}
ROTeam* get_internal_ro_team(roc_shmem_team_t team) {
ROTeam* get_internal_ro_team(rocshmem_team_t team) {
return reinterpret_cast<ROTeam*>(team);
}
IPCTeam* get_internal_ipc_team(roc_shmem_team_t team) {
IPCTeam* get_internal_ipc_team(rocshmem_team_t team) {
return reinterpret_cast<IPCTeam*>(team);
}
__host__ __device__ int team_translate_pe(roc_shmem_team_t src_team, int src_pe,
roc_shmem_team_t dst_team) {
if (src_team == ROC_SHMEM_TEAM_INVALID ||
dst_team == ROC_SHMEM_TEAM_INVALID) {
__host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe,
rocshmem_team_t dst_team) {
if (src_team == ROCSHMEM_TEAM_INVALID ||
dst_team == ROCSHMEM_TEAM_INVALID) {
return -1;
}

Some files were not shown because too many files have changed in this diff Show More