Use new naming scheme
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# This is the list of ROCSHMEM's significant contributors.
|
||||
# This is the list of rocSHMEM's significant contributors.
|
||||
#
|
||||
# This does not necessarily list everyone who has contributed code,
|
||||
# especially since many employees of one corporation may be contributing.
|
||||
@@ -10,4 +10,4 @@ Michael LeBeane
|
||||
Rohit Zambre
|
||||
Kishore Punniyamurthy
|
||||
Ruchi Shah
|
||||
Muhammad A. Awad
|
||||
Muhammad A. Awad
|
||||
|
||||
+3
-3
@@ -137,8 +137,8 @@ set(
|
||||
rocshmem
|
||||
VERSION ${ROCSHMEM_VERSION}
|
||||
LANGUAGES CXX
|
||||
DESCRIPTION "ROCSHMEM"
|
||||
HOMEPAGE_URL "https://github.com/ROCm-Developer-Tools/ROC_SHMEM")
|
||||
DESCRIPTION "rocSHMEM"
|
||||
HOMEPAGE_URL "https://github.com/ROCm-Developer-Tools/rocSHMEM")
|
||||
|
||||
###############################################################################
|
||||
# DEFAULT BUILD TYPE
|
||||
@@ -190,7 +190,7 @@ set_target_properties(
|
||||
${PROJECT_NAME}
|
||||
PROPERTIES
|
||||
PUBLIC_HEADER
|
||||
"${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/roc_shmem/roc_shmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/roc_shmem/debug.hpp"
|
||||
"${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp"
|
||||
)
|
||||
|
||||
###############################################################################
|
||||
|
||||
+8
-8
@@ -1,11 +1,11 @@
|
||||
## How to fork from us
|
||||
|
||||
To keep our development fast and conflict free, we recommend you to [fork](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/fork) our repository and start your work from our `dev` branch in your private repository.
|
||||
To keep our development fast and conflict free, we recommend you to [fork](https://github.com/ROCm-Developer-Tools/rocSHMEM/fork) our repository and start your work from our `dev` branch in your private repository.
|
||||
|
||||
Afterwards, git clone your repository to your local machine. But that is not it! To keep track of the original develop repository, add it as another remote.
|
||||
|
||||
```
|
||||
git remote add mainline https://github.com/ROCm-Developer-Tools/ROC_SHMEM.git
|
||||
git remote add mainline https://github.com/ROCm-Developer-Tools/rocSHMEM.git
|
||||
git checkout dev
|
||||
```
|
||||
|
||||
@@ -17,23 +17,23 @@ git checkout -b topic-<yourFeatureName>
|
||||
|
||||
and apply your changes there.
|
||||
|
||||
## How to contribute to ROCSHMEM
|
||||
## How to contribute to rocSHMEM
|
||||
|
||||
### Did you find a bug?
|
||||
|
||||
- Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/issues).
|
||||
- Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/ROCm-Developer-Tools/rocSHMEM/issues).
|
||||
|
||||
- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/issues/new).
|
||||
- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ROCm-Developer-Tools/rocSHMEM/issues/new).
|
||||
|
||||
### Did you write a patch that fixes a bug?
|
||||
|
||||
- Open a new GitHub [pull request](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/compare) with the patch.
|
||||
- Open a new GitHub [pull request](https://github.com/ROCm-Developer-Tools/rocSHMEM/compare) with the patch.
|
||||
|
||||
- Ensure the PR description clearly describes the problem and solution. If there is an existing GitHub issue open describing this bug, please include it in the description so we can close it.
|
||||
|
||||
- Ensure the PR is based on the `dev` branch of the ROCSHMEM GitHub repository.
|
||||
- Ensure the PR is based on the `dev` branch of the rocSHMEM GitHub repository.
|
||||
|
||||
- ROCSHMEM requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the projects's [Developer's Certificate of Origin](https://developercertificate.org/) and compatability with the project [LICENSE](https://github.com/ROCm-Developer-Tools/ROC_SHMEM/blob/main/LICENSE):
|
||||
- rocSHMEM requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the projects's [Developer's Certificate of Origin](https://developercertificate.org/) and compatability with the project [LICENSE](https://github.com/ROCm-Developer-Tools/rocSHMEM/blob/main/LICENSE):
|
||||
|
||||
|
||||
> (a) The contribution was created in whole or in part by me and I
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
# ROCm OpenSHMEM (ROC_SHMEM)
|
||||
# ROCm OpenSHMEM (rocSHMEM)
|
||||
|
||||
The ROCm OpenSHMEM (ROC_SHMEM) runtime is part of an AMD Research
|
||||
The ROCm OpenSHMEM (rocSHMEM) runtime is part of an AMD Research
|
||||
initiative to provide a unified runtime for heterogeneous systems.
|
||||
ROC_SHMEM supports both host-centric (a traditional host-driven
|
||||
rocSHMEM supports both host-centric (a traditional host-driven
|
||||
OpenSHMEM runtime) and GPU-centric networking (provided a GPU kernel
|
||||
the ability to perform network operations) through an
|
||||
OpenSHMEM-like interface. This intra-kernel networking simplifies application
|
||||
code complexity and enables more fine-grained communication/computation
|
||||
overlap than traditional host-driven networking.
|
||||
|
||||
ROC_SHMEM's primary target is heterogeneous computing; hence, for both
|
||||
CPU-centric and GPU-centric communications, ROC_SHMEM uses a single
|
||||
rocSHMEM's primary target is heterogeneous computing; hence, for both
|
||||
CPU-centric and GPU-centric communications, rocSHMEM uses a single
|
||||
symmetric heap (SHEAP) that is allocated on GPU memories.
|
||||
|
||||
ROC_SHMEM's GPU-centric communication has two different backend designs.
|
||||
rocSHMEM's GPU-centric communication has two different backend designs.
|
||||
The backends primarily differ in their implementations of
|
||||
intra-kernel networking.
|
||||
|
||||
@@ -24,23 +24,23 @@ the doorbell on the NIC to send network commands. GPU-IB is the default and
|
||||
preferred backend design that offers the best performance.
|
||||
|
||||
The second design will be referred to as the Reverse Offload (RO) backend. With
|
||||
the RO backend, the GPU runtime forwards ROC_SHMEM networking operations to the
|
||||
the RO backend, the GPU runtime forwards rocSHMEM networking operations to the
|
||||
host-side runtime, which calls into a traditional MPI or OpenSHMEM
|
||||
implementation. This forwarding of requests is transparent to the
|
||||
programmer, who only sees the GPU-side interface.
|
||||
|
||||
Both designs of the GPU-centric interface coexist seamlessly with the
|
||||
CPU-centric interface of the unified runtime. ROC_SHMEM ensures that CPU-centric
|
||||
CPU-centric interface of the unified runtime. rocSHMEM ensures that CPU-centric
|
||||
updates to the SHEAP are consistent and visible to a GPU kernel that is executing
|
||||
in parallel to host-initiated communication.
|
||||
|
||||
## Limitations
|
||||
|
||||
ROC_SHMEM is an experimental prototype from AMD Research and not an official
|
||||
rocSHMEM is an experimental prototype from AMD Research and not an official
|
||||
ROCm product. The software is provided as-is with no guarantees of support
|
||||
from AMD or AMD Research.
|
||||
|
||||
ROC_SHMEM base requirements:
|
||||
rocSHMEM base requirements:
|
||||
* ROCm version 4.3.1 onwards
|
||||
* May work with other versions, but not tested
|
||||
* AMD GFX9 GPUs (e.g.: MI25, Vega 56, Vega 64, MI50, MI60, MI100, Radeon VII)
|
||||
@@ -51,16 +51,16 @@ ROC_SHMEM base requirements:
|
||||
* InfiniBand adaptor compatable with ROCm RDMA technology
|
||||
* UCX 1.6 or greater with ROCm support
|
||||
|
||||
ROC_SHMEM optional requirements
|
||||
rocSHMEM optional requirements
|
||||
* For Documentation:
|
||||
* Doxygen
|
||||
|
||||
ROC_SHMEM only supports HIP applications. There are no plans to port to
|
||||
rocSHMEM only supports HIP applications. There are no plans to port to
|
||||
OpenCL.
|
||||
|
||||
## Building and Installation
|
||||
|
||||
ROC_SHMEM uses the CMake build system. The CMakeLists file contains
|
||||
rocSHMEM uses the CMake build system. The CMakeLists file contains
|
||||
additional details about library options.
|
||||
|
||||
To create an out-of-source build:
|
||||
@@ -84,34 +84,34 @@ custom install path by supplying it as an argument. For example:
|
||||
|
||||
../scripts/build_configs/rc_single /path/to/install
|
||||
|
||||
## Compiling/linking and Running with ROC_SHMEM
|
||||
## Compiling/linking and Running with rocSHMEM
|
||||
|
||||
ROC_SHMEM is built as a host and device side library that can be statically
|
||||
rocSHMEM is built as a host and device side library that can be statically
|
||||
linked to your application during compilation using hipcc.
|
||||
|
||||
During the compilation of your application, include the ROC_SHMEM header files
|
||||
and the ROC_SHMEM library when using hipcc:
|
||||
During the compilation of your application, include the rocSHMEM header files
|
||||
and the rocSHMEM library when using hipcc:
|
||||
|
||||
-I/path/to/rocshmem/install/include
|
||||
-L/path/to/rocshmem/install/lib -lrocshmem
|
||||
|
||||
NOTE: ROC_SHMEM depends on MPI for its host code. So, you will need to link
|
||||
NOTE: rocSHMEM depends on MPI for its host code. So, you will need to link
|
||||
to an MPI library. Since you must use the hipcc compiler, the arguments for
|
||||
MPI linkage must be added manually as opposed to using mpicc. Similary,
|
||||
ROC_SHMEM depends on Verbs for its device code. So, you will need to link
|
||||
rocSHMEM depends on Verbs for its device code. So, you will need to link
|
||||
to a Verbs library.
|
||||
|
||||
When using hipcc directly (as opposed to through a build system), we
|
||||
recommend performing the compilation and linking steps separately.
|
||||
Here are the steps to build a standalone program, say
|
||||
roc_shmem_hello.cpp.
|
||||
rocshmem_hello.cpp.
|
||||
|
||||
```
|
||||
# Compile
|
||||
/opt/rocm/bin/hipcc ./roc_shmem_hello.cpp -I/path/to/rocshmem/install/include -fgpu-rdc -o ./roc_shmem_hello.o -c
|
||||
/opt/rocm/bin/hipcc ./rocshmem_hello.cpp -I/path/to/rocshmem/install/include -fgpu-rdc -o ./rocshmem_hello.o -c
|
||||
|
||||
# Link
|
||||
/opt/rocm/bin/hipcc ./roc_shmem_hello.o /path/to/rocshmem/install/lib/librocshmem.a -lmpi -lmlx5 -libverbs -lhsa-runtime64 -fgpu-rdc -o roc_shmem_hello
|
||||
/opt/rocm/bin/hipcc ./rocshmem_hello.o /path/to/rocshmem/install/lib/librocshmem.a -lmpi -lmlx5 -libverbs -lhsa-runtime64 -fgpu-rdc -o rocshmem_hello
|
||||
|
||||
```
|
||||
|
||||
@@ -122,20 +122,20 @@ page useful.
|
||||
|
||||
## Runtime Parameters
|
||||
|
||||
ROC_SHMEM_HEAP_SIZE (default : 1 GB)
|
||||
ROCSHMEM_HEAP_SIZE (default : 1 GB)
|
||||
Defines the size of the OpenSHMEM symmetric heap
|
||||
Note the heap is on the GPU memory.
|
||||
|
||||
ROC_SHMEM_SQ_SIZE (default 1024)
|
||||
ROCSHMEM_SQ_SIZE (default 1024)
|
||||
Defines the size of the SQ as number of network
|
||||
packet (WQE). Each WQE is 64B. This only for
|
||||
GPU-IB conduit
|
||||
|
||||
ROC_SHMEM_USE_CQ_GPU_MEM (default : 1)
|
||||
ROCSHMEM_USE_CQ_GPU_MEM (default : 1)
|
||||
Set the placement of CQ on GPU memory (1)
|
||||
or CPU memory (0)
|
||||
|
||||
ROC_SHMEM_USE_SQ_GPU_MEM (default : 1)
|
||||
ROCSHMEM_USE_SQ_GPU_MEM (default : 1)
|
||||
Set the placement of SQ on GPU memory (1)
|
||||
or CPU memory (0)
|
||||
|
||||
@@ -143,13 +143,13 @@ page useful.
|
||||
Force producer/consumer queues between CPU and GPU to
|
||||
be in CPU memory. RO backend only.
|
||||
|
||||
ROC_SHMEM also requires the following environment variable be set for ROCm:
|
||||
rocSHMEM also requires the following environment variable be set for ROCm:
|
||||
|
||||
export HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
|
||||
## Documentation
|
||||
|
||||
To generate doxygen documentation for ROC_SHMEM's API, run the following
|
||||
To generate doxygen documentation for rocSHMEM's API, run the following
|
||||
from the library's build directory:
|
||||
|
||||
make docs
|
||||
@@ -158,13 +158,13 @@ The doxygen output will be in the `docs` folder of the build directory.
|
||||
|
||||
## Examples
|
||||
|
||||
ROC_SHMEM is similar to OpenSHMEM and should be familiar to programmers who
|
||||
rocSHMEM is similar to OpenSHMEM and should be familiar to programmers who
|
||||
have experience with OpenSHMEM or other PGAS network programming APIs in the
|
||||
context of CPUs. The best way to learn how to use ROC_SHMEM is to read the
|
||||
context of CPUs. The best way to learn how to use rocSHMEM is to read the
|
||||
autogenerated doxygen documentation for functions described in
|
||||
`roc_shmem/roc_shmem.hpp`, or to look at the provided sample applications in the
|
||||
`tests/` folder. ROC_SHMEM is shipped with a basic test suite for the
|
||||
supported ROC_SHMEM API. The examples test Puts, Gets, nonblocking Puts,
|
||||
`rocshmem/rocshmem.hpp`, or to look at the provided sample applications in the
|
||||
`tests/` folder. rocSHMEM is shipped with a basic test suite for the
|
||||
supported rocSHMEM API. The examples test Puts, Gets, nonblocking Puts,
|
||||
nonblocking Gets, Quiets, Atomics, Tests, Wai-untils, Broadcasts, and
|
||||
Reductions.
|
||||
|
||||
@@ -178,7 +178,7 @@ Here are some example uses of the driver script:
|
||||
|
||||
## Building the Dependencies
|
||||
|
||||
ROC_SHMEM requires an MPI runtime on the host that supports ROCm-Aware MPI.
|
||||
rocSHMEM requires an MPI runtime on the host that supports ROCm-Aware MPI.
|
||||
Currently all ROCm-Aware MPI runtimes require the usage of ROCm-Aware UCX.
|
||||
|
||||
To build and configure ROCm-Aware UCX, you need to:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
local help_message = [[
|
||||
|
||||
ROC_SHMEM is an open-source GPU initiated networking library
|
||||
rocSHMEM is an open-source GPU initiated networking library
|
||||
for High Performance Computing and Machine Learning workloads.
|
||||
|
||||
Version @ROCSHMEM_FULL_VERSION@
|
||||
@@ -12,7 +12,7 @@ whatis("Name: rocshmem")
|
||||
whatis("Version: @ROCSHMEM_FULL_VERSION@")
|
||||
whatis("Keywords: GPU, PGAS, RMA, HPC")
|
||||
whatis("Description: tool for GPU initiated networking")
|
||||
whatis("URL: https://github.com/ROCm-Developer-Tools/ROC_SHMEM")
|
||||
whatis("URL: https://github.com/ROCm-Developer-Tools/rocSHMEM")
|
||||
|
||||
-- Export environmental variables
|
||||
local topDir="@CMAKE_INSTALL_PREFIX@"
|
||||
|
||||
@@ -33,7 +33,7 @@ NIC (this does occur during MPI_Win_create), we are good.
|
||||
UCX claims to not support GPU-aware communication because they have
|
||||
not added in support for the different types of scenarios that could
|
||||
exist in a system (eg, when a system does not have GPU-direct). The
|
||||
scope of ROC_SHMEM is currenlty limited to configurations that UCX
|
||||
scope of rocSHMEM is currently limited to configurations that UCX
|
||||
already supports.
|
||||
|
||||
## But the main branch of MPICH does support HIP now?
|
||||
|
||||
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_allreduce_test.o -o rocshmem_allreduce_test
|
||||
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
|
||||
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
|
||||
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
#define CHECK_HIP(condition) { \
|
||||
hipError_t error = condition; \
|
||||
@@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test
|
||||
using namespace rocshmem;
|
||||
|
||||
__global__ void allreduce_test(int *source, int *dest, size_t nelem,
|
||||
roc_shmem_team_t team) {
|
||||
__shared__ roc_shmem_ctx_t ctx;
|
||||
rocshmem_team_t team) {
|
||||
__shared__ rocshmem_ctx_t ctx;
|
||||
int64_t ctx_type = 0;
|
||||
|
||||
roc_shmem_wg_init();
|
||||
roc_shmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = roc_shmem_ctx_n_pes(ctx);
|
||||
rocshmem_wg_init();
|
||||
rocshmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = rocshmem_ctx_n_pes(ctx);
|
||||
|
||||
roc_shmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
|
||||
rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
|
||||
|
||||
roc_shmem_ctx_quiet(ctx);
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
__syncthreads();
|
||||
|
||||
roc_shmem_wg_ctx_destroy(&ctx);
|
||||
roc_shmem_wg_finalize();
|
||||
rocshmem_wg_ctx_destroy(&ctx);
|
||||
rocshmem_wg_finalize();
|
||||
}
|
||||
|
||||
static void init_sendbuf (int *source, int nelem, int my_pe)
|
||||
@@ -82,23 +82,23 @@ int main (int argc, char **argv)
|
||||
nelem = atoi(argv[1]);
|
||||
}
|
||||
|
||||
int my_pe = roc_shmem_my_pe();
|
||||
int npes = roc_shmem_n_pes();
|
||||
int my_pe = rocshmem_my_pe();
|
||||
int npes = rocshmem_n_pes();
|
||||
|
||||
int ndevices, my_device = 0;
|
||||
CHECK_HIP(hipGetDeviceCount(&ndevices));
|
||||
my_device = my_pe % ndevices;
|
||||
CHECK_HIP(hipSetDevice(my_device));
|
||||
|
||||
roc_shmem_init();
|
||||
rocshmem_init();
|
||||
|
||||
int *source = (int *)roc_shmem_malloc(nelem * sizeof(int));
|
||||
int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int));
|
||||
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
|
||||
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
|
||||
if (NULL == source || NULL == dest) {
|
||||
std::cout << "Error allocating memory from symmetric heap" << std::endl;
|
||||
std::cout << "source: " << source << ", dest: " << dest << ", size: "
|
||||
<< sizeof(int) * nelem << std::endl;
|
||||
roc_shmem_global_exit(1);
|
||||
rocshmem_global_exit(1);
|
||||
}
|
||||
|
||||
init_sendbuf(source, nelem, my_pe);
|
||||
@@ -106,9 +106,9 @@ int main (int argc, char **argv)
|
||||
dest[i] = -1;
|
||||
}
|
||||
|
||||
roc_shmem_team_t team_reduce_world_dup;
|
||||
team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID;
|
||||
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
|
||||
rocshmem_team_t team_reduce_world_dup;
|
||||
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
|
||||
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
|
||||
&team_reduce_world_dup);
|
||||
|
||||
CHECK_HIP(hipDeviceSynchronize());
|
||||
@@ -121,9 +121,9 @@ int main (int argc, char **argv)
|
||||
bool pass = check_recvbuf(dest, nelem, my_pe, npes);
|
||||
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
|
||||
|
||||
roc_shmem_free(source);
|
||||
roc_shmem_free(dest);
|
||||
rocshmem_free(source);
|
||||
rocshmem_free(dest);
|
||||
|
||||
roc_shmem_finalize();
|
||||
rocshmem_finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_alltoall_test.o -o rocshmem_alltoall_test \
|
||||
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
|
||||
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
|
||||
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
#define CHECK_HIP(condition) { \
|
||||
hipError_t error = condition; \
|
||||
@@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_alltoall_test
|
||||
using namespace rocshmem;
|
||||
|
||||
__global__ void alltoall_test(int *source, int *dest, size_t nelem,
|
||||
roc_shmem_team_t team) {
|
||||
__shared__ roc_shmem_ctx_t ctx;
|
||||
rocshmem_team_t team) {
|
||||
__shared__ rocshmem_ctx_t ctx;
|
||||
int64_t ctx_type = 0;
|
||||
|
||||
roc_shmem_wg_init();
|
||||
roc_shmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = roc_shmem_ctx_n_pes(ctx);
|
||||
rocshmem_wg_init();
|
||||
rocshmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = rocshmem_ctx_n_pes(ctx);
|
||||
|
||||
roc_shmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
|
||||
rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
|
||||
|
||||
roc_shmem_ctx_quiet(ctx);
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
__syncthreads();
|
||||
|
||||
roc_shmem_wg_ctx_destroy(&ctx);
|
||||
roc_shmem_wg_finalize();
|
||||
rocshmem_wg_ctx_destroy(&ctx);
|
||||
rocshmem_wg_finalize();
|
||||
}
|
||||
|
||||
static void init_sendbuf (int *source, int nelem, int my_pe, int npes)
|
||||
@@ -87,23 +87,23 @@ int main (int argc, char **argv)
|
||||
nelem = atoi(argv[1]);
|
||||
}
|
||||
|
||||
int my_pe = roc_shmem_my_pe();
|
||||
int npes = roc_shmem_n_pes();
|
||||
int my_pe = rocshmem_my_pe();
|
||||
int npes = rocshmem_n_pes();
|
||||
|
||||
int ndevices, my_device = 0;
|
||||
CHECK_HIP(hipGetDeviceCount(&ndevices));
|
||||
my_device = my_pe % ndevices;
|
||||
CHECK_HIP(hipSetDevice(my_device));
|
||||
|
||||
roc_shmem_init();
|
||||
rocshmem_init();
|
||||
|
||||
int *source = (int *)roc_shmem_malloc(nelem * npes * sizeof(int));
|
||||
int *dest = (int *)roc_shmem_malloc(nelem * npes * sizeof(int));
|
||||
int *source = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
|
||||
int *dest = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
|
||||
if (NULL == source || NULL == dest) {
|
||||
std::cout << "Error allocating memory from symmetric heap" << std::endl;
|
||||
std::cout << "source: " << source << ", dest: " << dest << ", size: "
|
||||
<< sizeof(int) * nelem * npes << std::endl;
|
||||
roc_shmem_global_exit(1);
|
||||
rocshmem_global_exit(1);
|
||||
}
|
||||
|
||||
init_sendbuf(source, nelem, my_pe, npes);
|
||||
@@ -111,9 +111,9 @@ int main (int argc, char **argv)
|
||||
dest[i] = -1;
|
||||
}
|
||||
|
||||
roc_shmem_team_t team_reduce_world_dup;
|
||||
team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID;
|
||||
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
|
||||
rocshmem_team_t team_reduce_world_dup;
|
||||
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
|
||||
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
|
||||
&team_reduce_world_dup);
|
||||
|
||||
CHECK_HIP(hipDeviceSynchronize());
|
||||
@@ -127,9 +127,9 @@ int main (int argc, char **argv)
|
||||
|
||||
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
|
||||
|
||||
roc_shmem_free(source);
|
||||
roc_shmem_free(dest);
|
||||
rocshmem_free(source);
|
||||
rocshmem_free(dest);
|
||||
|
||||
roc_shmem_finalize();
|
||||
rocshmem_finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_broadcast_test.o -o rocshmem_broadcast_test
|
||||
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
|
||||
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
|
||||
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
#define CHECK_HIP(condition) { \
|
||||
hipError_t error = condition; \
|
||||
@@ -30,21 +30,21 @@ ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test
|
||||
using namespace rocshmem;
|
||||
|
||||
__global__ void broadcast_test(int *source, int *dest, size_t nelem,
|
||||
int root, roc_shmem_team_t team) {
|
||||
__shared__ roc_shmem_ctx_t ctx;
|
||||
int root, rocshmem_team_t team) {
|
||||
__shared__ rocshmem_ctx_t ctx;
|
||||
int64_t ctx_type = 0;
|
||||
|
||||
roc_shmem_wg_init();
|
||||
roc_shmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = roc_shmem_ctx_n_pes(ctx);
|
||||
rocshmem_wg_init();
|
||||
rocshmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = rocshmem_ctx_n_pes(ctx);
|
||||
|
||||
roc_shmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
|
||||
rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
|
||||
|
||||
roc_shmem_ctx_quiet(ctx);
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
__syncthreads();
|
||||
|
||||
roc_shmem_wg_ctx_destroy(&ctx);
|
||||
roc_shmem_wg_finalize();
|
||||
rocshmem_wg_ctx_destroy(&ctx);
|
||||
rocshmem_wg_finalize();
|
||||
}
|
||||
|
||||
static void init_sendbuf(int *source, int nelem, int my_pe)
|
||||
@@ -80,23 +80,23 @@ int main(int argc, char **argv)
|
||||
nelem = atoi(argv[1]);
|
||||
}
|
||||
|
||||
int my_pe = roc_shmem_my_pe();
|
||||
int npes = roc_shmem_n_pes();
|
||||
int my_pe = rocshmem_my_pe();
|
||||
int npes = rocshmem_n_pes();
|
||||
|
||||
int ndevices, my_device = 0;
|
||||
CHECK_HIP(hipGetDeviceCount(&ndevices));
|
||||
my_device = my_pe % ndevices;
|
||||
CHECK_HIP(hipSetDevice(my_device));
|
||||
|
||||
roc_shmem_init();
|
||||
rocshmem_init();
|
||||
|
||||
int *source = (int *)roc_shmem_malloc(nelem * sizeof(int));
|
||||
int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int));
|
||||
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
|
||||
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
|
||||
if (NULL == source || NULL == dest) {
|
||||
std::cout << "Error allocating memory from symmetric heap" << std::endl;
|
||||
std::cout << "source: " << source << ", dest: " << dest << ", size: "
|
||||
<< sizeof(int) * nelem << std::endl;
|
||||
roc_shmem_global_exit(1);
|
||||
rocshmem_global_exit(1);
|
||||
}
|
||||
|
||||
init_sendbuf(source, nelem, my_pe);
|
||||
@@ -105,9 +105,9 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
int root = 0;
|
||||
roc_shmem_team_t team_reduce_world_dup;
|
||||
team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID;
|
||||
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
|
||||
rocshmem_team_t team_reduce_world_dup;
|
||||
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
|
||||
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
|
||||
&team_reduce_world_dup);
|
||||
|
||||
CHECK_HIP(hipDeviceSynchronize());
|
||||
@@ -122,9 +122,9 @@ int main(int argc, char **argv)
|
||||
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
|
||||
}
|
||||
|
||||
roc_shmem_free(source);
|
||||
roc_shmem_free(dest);
|
||||
rocshmem_free(source);
|
||||
rocshmem_free(dest);
|
||||
|
||||
roc_shmem_finalize();
|
||||
rocshmem_finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -10,14 +10,14 @@ hipcc -fgpu-rdc --hip-link rocshmem_getmem_test.o -o rocshmem_getmem_test \
|
||||
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
|
||||
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
|
||||
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 ./rocshmem_getmem_test
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 ./rocshmem_getmem_test
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
#define CHECK_HIP(condition) { \
|
||||
hipError_t error = condition; \
|
||||
@@ -31,25 +31,25 @@ using namespace rocshmem;
|
||||
|
||||
__global__ void simple_getmem_test(int *src, int *dst, size_t nelem)
|
||||
{
|
||||
roc_shmem_wg_init();
|
||||
rocshmem_wg_init();
|
||||
|
||||
int threadId = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (threadId == 0) {
|
||||
int rank = roc_shmem_my_pe();
|
||||
int rank = rocshmem_my_pe();
|
||||
int peer = rank ? 0 : 1;
|
||||
roc_shmem_getmem(dst, src, nelem * sizeof(int), peer);
|
||||
roc_shmem_quiet();
|
||||
rocshmem_getmem(dst, src, nelem * sizeof(int), peer);
|
||||
rocshmem_quiet();
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
roc_shmem_wg_finalize();
|
||||
rocshmem_wg_finalize();
|
||||
}
|
||||
|
||||
#define MAX_ELEM 256
|
||||
|
||||
int main (int argc, char **argv)
|
||||
{
|
||||
int rank = roc_shmem_my_pe();
|
||||
int rank = rocshmem_my_pe();
|
||||
int ndevices, my_device = 0;
|
||||
CHECK_HIP(hipGetDeviceCount(&ndevices));
|
||||
my_device = rank % ndevices;
|
||||
@@ -60,15 +60,15 @@ int main (int argc, char **argv)
|
||||
nelem = atoi(argv[1]);
|
||||
}
|
||||
|
||||
roc_shmem_init();
|
||||
int npes = roc_shmem_n_pes();
|
||||
int *src = (int *)roc_shmem_malloc(nelem * sizeof(int));
|
||||
int *dst = (int *)roc_shmem_malloc(nelem * sizeof(int));
|
||||
rocshmem_init();
|
||||
int npes = rocshmem_n_pes();
|
||||
int *src = (int *)rocshmem_malloc(nelem * sizeof(int));
|
||||
int *dst = (int *)rocshmem_malloc(nelem * sizeof(int));
|
||||
if (NULL == src || NULL == dst) {
|
||||
std::cout << "Error allocating memory from symmetric heap" << std::endl;
|
||||
std::cout << "source: " << src << ", dest: " << dst << ", size: "
|
||||
<< sizeof(int) * nelem << std::endl;
|
||||
roc_shmem_global_exit(1);
|
||||
rocshmem_global_exit(1);
|
||||
}
|
||||
|
||||
for (int i=0; i<nelem; i++) {
|
||||
@@ -79,7 +79,7 @@ int main (int argc, char **argv)
|
||||
|
||||
int threadsPerBlock=256;
|
||||
simple_getmem_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(src, dst, nelem);
|
||||
roc_shmem_barrier_all();
|
||||
rocshmem_barrier_all();
|
||||
CHECK_HIP(hipDeviceSynchronize());
|
||||
|
||||
bool pass = true;
|
||||
@@ -93,8 +93,8 @@ int main (int argc, char **argv)
|
||||
}
|
||||
printf("Test %s \t %s\n", argv[0], pass ? "[PASS]" : "[FAIL]");
|
||||
|
||||
roc_shmem_free(src);
|
||||
roc_shmem_free(dst);
|
||||
roc_shmem_finalize();
|
||||
rocshmem_free(src);
|
||||
rocshmem_free(dst);
|
||||
rocshmem_finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -22,7 +22,7 @@
|
||||
|
||||
#include "primitive_tester.hpp"
|
||||
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
#include <debug.hpp>
|
||||
|
||||
#include <unistd.h>
|
||||
@@ -41,26 +41,26 @@ PrimitiveTest(int loop,
|
||||
int my_pe,
|
||||
ShmemContextType ctx_type)
|
||||
{
|
||||
__shared__ roc_shmem_ctx_t ctx;
|
||||
roc_shmem_wg_init();
|
||||
roc_shmem_wg_ctx_create(ctx_type, &ctx);
|
||||
__shared__ rocshmem_ctx_t ctx;
|
||||
rocshmem_wg_init();
|
||||
rocshmem_wg_ctx_create(ctx_type, &ctx);
|
||||
|
||||
int block_id = hipBlockIdx_x;
|
||||
for(int i =0; i< loop; i++){
|
||||
roc_shmem_ctx_putmem_nbi_wg(ctx, &r_buf[my_pe*size], &s_buf[block_id * size], size, block_id);
|
||||
rocshmem_ctx_putmem_nbi_wg(ctx, &r_buf[my_pe*size], &s_buf[block_id * size], size, block_id);
|
||||
if(hipThreadIdx_x==0){
|
||||
//roc_shmem_ctx_quiet(ctx);
|
||||
//roc_shmem_ctx_threadfence_system(ctx);
|
||||
roc_shmem_ctx_int_p(ctx, &flag[my_pe], i+1, block_id);
|
||||
//roc_shmem_ctx_quiet(ctx);
|
||||
roc_shmem_int_wait_until(&flag[block_id], ROC_SHMEM_CMP_EQ, i+1);
|
||||
//rocshmem_ctx_quiet(ctx);
|
||||
//rocshmem_ctx_threadfence_system(ctx);
|
||||
rocshmem_ctx_int_p(ctx, &flag[my_pe], i+1, block_id);
|
||||
//rocshmem_ctx_quiet(ctx);
|
||||
rocshmem_int_wait_until(&flag[block_id], ROCSHMEM_CMP_EQ, i+1);
|
||||
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
roc_shmem_wg_ctx_destroy(ctx);
|
||||
roc_shmem_wg_finalize();
|
||||
rocshmem_wg_ctx_destroy(ctx);
|
||||
rocshmem_wg_finalize();
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
@@ -69,16 +69,16 @@ PrimitiveTest(int loop,
|
||||
PrimitiveTester::PrimitiveTester(TesterArguments args)
|
||||
: Tester(args)
|
||||
{
|
||||
flag = (int*) roc_shmem_malloc(args.numprocs);
|
||||
flag = (int*) rocshmem_malloc(args.numprocs);
|
||||
memset(flag, 0, args.numprocs*sizeof(int));
|
||||
// s_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size);
|
||||
// r_buf = (char *)roc_shmem_malloc(args.max_msg_size * args.wg_size);
|
||||
// s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size);
|
||||
// r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size);
|
||||
}
|
||||
|
||||
PrimitiveTester::~PrimitiveTester()
|
||||
{
|
||||
roc_shmem_free(s_buf);
|
||||
roc_shmem_free(r_buf);
|
||||
rocshmem_free(s_buf);
|
||||
rocshmem_free(r_buf);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -99,8 +99,8 @@ PrimitiveTester::launchKernel(dim3 gridSize,
|
||||
void* sendBuf = malloc(64);
|
||||
void* recvBuf = malloc(64 * nproc);
|
||||
|
||||
s_buf = (char *)roc_shmem_malloc(size * nproc);
|
||||
r_buf = (char *)roc_shmem_malloc(size * nproc);
|
||||
s_buf = (char *)rocshmem_malloc(size * nproc);
|
||||
r_buf = (char *)rocshmem_malloc(size * nproc);
|
||||
resetBuffers(size);
|
||||
|
||||
MPI_Allgather(sendBuf, 64, MPI_CHAR,
|
||||
@@ -108,7 +108,7 @@ PrimitiveTester::launchKernel(dim3 gridSize,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
size_t shared_bytes;
|
||||
roc_shmem_dynamic_shared(&shared_bytes);
|
||||
rocshmem_dynamic_shared(&shared_bytes);
|
||||
|
||||
hipLaunchKernelGGL(PrimitiveTest,
|
||||
gridSize,
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
#include "tester.hpp"
|
||||
#include "tester_arguments.hpp"
|
||||
@@ -39,7 +39,7 @@ int main(int argc, char * argv[])
|
||||
/***
|
||||
* Select a GPU
|
||||
*/
|
||||
int rank = roc_shmem_my_pe();
|
||||
int rank = rocshmem_my_pe();
|
||||
int ndevices, my_device=0;
|
||||
hipGetDeviceCount (&ndevices);
|
||||
my_device = rank % ndevices;
|
||||
@@ -48,7 +48,7 @@ int main(int argc, char * argv[])
|
||||
/**
|
||||
* Must initialize rocshmem to access arguments needed by the tester.
|
||||
*/
|
||||
roc_shmem_init(args.num_wgs);
|
||||
rocshmem_init(args.num_wgs);
|
||||
|
||||
/**
|
||||
* Now grab the arguments from rocshmem.
|
||||
@@ -78,7 +78,7 @@ int main(int argc, char * argv[])
|
||||
* The rocshmem library needs to be cleaned up with this call. It pairs
|
||||
* with the init function above.
|
||||
*/
|
||||
roc_shmem_finalize();
|
||||
rocshmem_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
#include <iostream>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <mpi.h>
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
//#include "broadcast_tester.hpp"
|
||||
#include "primitive_tester.hpp"
|
||||
@@ -125,8 +125,8 @@ Tester::execute()
|
||||
printf("error = %d \n", err);
|
||||
}
|
||||
|
||||
// roc_shmem_dump_stats();
|
||||
// roc_shmem_reset_stats();
|
||||
// rocshmem_dump_stats();
|
||||
// rocshmem_reset_stats();
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
#include "tester_arguments.hpp"
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
|
||||
using namespace rocshmem;
|
||||
|
||||
@@ -64,7 +64,7 @@ void
|
||||
TesterArguments::show_usage(std::string executable_name)
|
||||
{
|
||||
std::cout << "Usage: " << executable_name << std::endl;
|
||||
std::cout << "\t-t <number of roc_shmem service threads>\n";
|
||||
std::cout << "\t-t <number of rocshmem service threads>\n";
|
||||
std::cout << "\t-w <number of workgroups>\n";
|
||||
std::cout << "\t-s <maximum message size (in bytes)>\n";
|
||||
std::cout << "\t-a <algorithm number to test>\n";
|
||||
@@ -78,7 +78,7 @@ TesterArguments::show_usage(std::string executable_name)
|
||||
void
|
||||
TesterArguments::get_rocshmem_arguments()
|
||||
{
|
||||
numprocs = roc_shmem_n_pes();
|
||||
myid = roc_shmem_my_pe();
|
||||
numprocs = rocshmem_n_pes();
|
||||
myid = rocshmem_my_pe();
|
||||
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ class TesterArguments
|
||||
|
||||
/**
|
||||
* Initialize rocshmem members
|
||||
* Valid after roc_shmem_init function called.
|
||||
* Valid after rocshmem_init function called.
|
||||
*/
|
||||
void get_rocshmem_arguments();
|
||||
|
||||
@@ -54,7 +54,7 @@ class TesterArguments
|
||||
uint64_t min_msg_size = 1;
|
||||
uint64_t max_msg_size = 1 << 20;
|
||||
unsigned wg_size = 64;
|
||||
unsigned shmem_context = 8; // ROC_SHMEM_CTX_WG_PRIVATE
|
||||
unsigned shmem_context = 8; // ROCSHMEM_CTX_WG_PRIVATE
|
||||
|
||||
/**
|
||||
* Arguments obtained from rocshmem
|
||||
|
||||
@@ -55,7 +55,7 @@ project(spts VERSION 1.1.0 LANGUAGES CXX)
|
||||
# CONFIGURATION OPTIONS
|
||||
###############################################################################
|
||||
option(USE_HIP "Build HIP version of the solver" OFF)
|
||||
option(USE_ROCSHMEM "Build ROC_SHMEM enabled version of the solver" OFF)
|
||||
option(USE_ROCSHMEM "Build rocSHMEM enabled version of the solver" OFF)
|
||||
option(ALL_ANALYZE "Build analyze and solve algorithm" OFF)
|
||||
option(USE_DOUBLE "Use double precision floats for the data" OFF)
|
||||
option(ALL_LEVELSET "Build levelset algorithm" OFF)
|
||||
@@ -84,7 +84,7 @@ target_sources(
|
||||
)
|
||||
|
||||
###############################################################################
|
||||
# HIP / HIP + ROC_SHMEM
|
||||
# HIP / HIP + rocSHMEM
|
||||
###############################################################################
|
||||
if(USE_HIP)
|
||||
find_package(hip REQUIRED)
|
||||
@@ -95,7 +95,7 @@ if(USE_HIP)
|
||||
HIPHelper.cpp
|
||||
)
|
||||
|
||||
if(USE_ROC_SHMEM)
|
||||
if(USE_ROCSHMEM)
|
||||
find_package(rocshmem CONFIG REQUIRED)
|
||||
|
||||
target_include_directories(
|
||||
@@ -118,8 +118,8 @@ if(USE_HIP)
|
||||
###############################################################################
|
||||
else()
|
||||
|
||||
if(USE_ROC_SHMEM)
|
||||
message(FATAL_ERROR "Cannot use ROC_SHMEM without USE_HIP")
|
||||
if(USE_ROCSHMEM)
|
||||
message(FATAL_ERROR "Cannot use rocSHMEM without USE_HIP")
|
||||
endif()
|
||||
|
||||
target_sources(
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
|
||||
static int SPTS_BLOCK_SIZE = 0;
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
#define WF_PER_WG 1
|
||||
#else
|
||||
#define WF_PER_WG 16
|
||||
|
||||
@@ -179,7 +179,7 @@ int main(int argc, char *argv[])
|
||||
else
|
||||
printf("%lf )", ((double)ns_per_levelsync_iter/1000000.));
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
MPI_Allreduce(MPI_IN_PLACE, (void *) &ns_per_analysis_iter, 1,
|
||||
MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
|
||||
@@ -40,8 +40,8 @@
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#include "roc_shmem.hpp"
|
||||
#ifdef USE_ROCSHMEM
|
||||
#include "rocshmem.hpp"
|
||||
#include "mpi.h"
|
||||
#endif
|
||||
|
||||
@@ -100,8 +100,8 @@ class SparseTriangularSolve :
|
||||
int nCols;
|
||||
int numBlocks;
|
||||
/*
|
||||
#ifdef USE_ROC_SHMEM
|
||||
roc_shmem_t* handle;
|
||||
#ifdef USE_ROCSHMEM
|
||||
rocshmem_t* handle;
|
||||
#endif
|
||||
*/
|
||||
std::unordered_map<int, FloatType> *observed_errors;
|
||||
@@ -114,15 +114,15 @@ class SparseTriangularSolve :
|
||||
x = NULL; y = NULL; y_zero = NULL, yref = NULL, observed_errors = NULL, errors_seen = NULL;
|
||||
xDev = yDev = completedRowsDev = remoteInProgressArrayDev = rowBlocksDev = doneArrayDev = shadowDoneArrayDev = numRowsAtLevelDev = maxDepthDev = rowMapDev = totalSpinDev = oneBufDev = 0;
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
int roc_shmem_queues = (2560 / WF_PER_WG);
|
||||
#ifdef USE_ROCSHMEM
|
||||
int rocshmem_queues = (2560 / WF_PER_WG);
|
||||
if (2560 % WF_PER_WG)
|
||||
roc_shmem_queues++;
|
||||
printf("roc_shmem_queues %d WF_PER_WG %d \n",roc_shmem_queues, WF_PER_WG);
|
||||
roc_shmem_init(roc_shmem_queues);
|
||||
rocshmem_queues++;
|
||||
printf("rocshmem_queues %d WF_PER_WG %d \n",rocshmem_queues, WF_PER_WG);
|
||||
rocshmem_init(rocshmem_queues);
|
||||
|
||||
this->Set_total_pes(roc_shmem_n_pes());
|
||||
this->Set_this_pe(roc_shmem_my_pe());
|
||||
this->Set_total_pes(rocshmem_n_pes());
|
||||
this->Set_this_pe(rocshmem_my_pe());
|
||||
#else
|
||||
this->Set_total_pes(1);
|
||||
this->Set_this_pe(0);
|
||||
@@ -173,7 +173,7 @@ class SparseTriangularSolve :
|
||||
if (remoteInProgressArrayDev != 0)
|
||||
this->GPU->FreeMem(remoteInProgressArrayDev);
|
||||
|
||||
#ifndef USE_ROC_SHMEM
|
||||
#ifndef USE_ROCSHMEM
|
||||
if (yDev != 0)
|
||||
this->GPU->FreeMem(yDev);
|
||||
if (doneArrayDev != 0)
|
||||
@@ -184,14 +184,14 @@ class SparseTriangularSolve :
|
||||
this->GPU->FreeMem(shadowDoneArrayDev);
|
||||
#else
|
||||
if (yDev != 0)
|
||||
roc_shmem_free(yDev);
|
||||
rocshmem_free(yDev);
|
||||
if (doneArrayDev != 0)
|
||||
roc_shmem_free(doneArrayDev);
|
||||
rocshmem_free(doneArrayDev);
|
||||
if (reqUpdateArrayDev != 0)
|
||||
roc_shmem_free(reqUpdateArrayDev);
|
||||
rocshmem_free(reqUpdateArrayDev);
|
||||
if (shadowDoneArrayDev != 0)
|
||||
roc_shmem_free(shadowDoneArrayDev);
|
||||
roc_shmem_finalize();
|
||||
rocshmem_free(shadowDoneArrayDev);
|
||||
rocshmem_finalize();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
@@ -207,8 +207,8 @@ void SparseTriangularSolve<FloatType>::AddDerivedInputFlags()
|
||||
AddInputFlag("non_symmetric", 'n', "false", "Force the program to work on non-symmetric matrices. This will ignore the upper triangular entirely. (Default=false)", "bool");
|
||||
AddInputFlag("levelsync_size", 'l', "0", "Number of rows to launch in a level-sync kernel invocation (Default = auto-tune)", "int");
|
||||
AddInputFlag("verify", 'v', "false", "Verify results", "bool");
|
||||
AddInputFlag("rocshmem_algorithm", 'a', "0", "ROC_SHMEM algorithm type", "int");
|
||||
AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for ROC_SHMEM", "int");
|
||||
AddInputFlag("rocshmem_algorithm", 'a', "0", "rocSHMEM algorithm type", "int");
|
||||
AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for rocSHMEM", "int");
|
||||
AddInputFlag("put_block_size", 'p', "1024", "Block size for puts", "int");
|
||||
AddInputFlag("get_backoff_factor", 'g', "128", "Backoff factor for gets", "int");
|
||||
}
|
||||
@@ -241,10 +241,10 @@ void SparseTriangularSolve<FloatType>::AllocateVectors(
|
||||
}
|
||||
|
||||
xDev = this->GPU->AllocateMem("xDev", nCols*sizeof(FloatType), GPU_MEM_READ_ONLY, NULL);
|
||||
#ifndef USE_ROC_SHMEM
|
||||
#ifndef USE_ROCSHMEM
|
||||
yDev = this->GPU->AllocateMem("yDev", nRows*sizeof(FloatType), GPU_MEM_READ_WRITE, NULL);
|
||||
#else
|
||||
yDev = (memPointer) roc_shmem_malloc(nRows*sizeof(FloatType));
|
||||
yDev = (memPointer) rocshmem_malloc(nRows*sizeof(FloatType));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -742,10 +742,10 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
/****** SpTS Meta-Data Setup Code ******/
|
||||
/* Set up the OpenCL buffers for the SpTS meta-data */
|
||||
// TODO -- is this +1 in doneArray nRows+1 required? Why?
|
||||
#ifdef USE_ROC_SHMEM
|
||||
doneArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t));
|
||||
reqUpdateArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t));
|
||||
shadowDoneArrayDev = roc_shmem_malloc((nRows+1)*sizeof(uint32_t));
|
||||
#ifdef USE_ROCSHMEM
|
||||
doneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
|
||||
reqUpdateArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
|
||||
shadowDoneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
|
||||
#else
|
||||
doneArrayDev = this->GPU->AllocateMem("doneArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
|
||||
reqUpdateArrayDev = this->GPU->AllocateMem("reqUpdateArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
|
||||
@@ -835,7 +835,7 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
bool syncfree_better = false;
|
||||
|
||||
int total_workitems_per_workgroup = WF_SIZE * WF_PER_WG;
|
||||
//bool roc_shmem_initialized = false;
|
||||
//bool rocshmem_initialized = false;
|
||||
|
||||
/*********************** Actual work of the benchmark *********************/
|
||||
for(int i = 0; i < iter; i++)
|
||||
@@ -883,18 +883,18 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
#else
|
||||
int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
|
||||
/ total_workitems_per_workgroup;
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
global_work_size = this->nRows_p * WF_SIZE;
|
||||
num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
|
||||
/ total_workitems_per_workgroup;
|
||||
/*
|
||||
int roc_shmem_queues = (2560 / WF_PER_WG);
|
||||
int rocshmem_queues = (2560 / WF_PER_WG);
|
||||
if (2560 % WF_PER_WG)
|
||||
roc_shmem_queues++;
|
||||
if (!roc_shmem_initialized) {
|
||||
rocshmem_queues++;
|
||||
if (!rocshmem_initialized) {
|
||||
int num_threads = InputFlags::GetValueInt("num_roshmem_threads");
|
||||
roc_shmem_init(&handle, roc_shmem_queues);
|
||||
roc_shmem_initialized = true;
|
||||
rocshmem_init(&handle, rocshmem_queues);
|
||||
rocshmem_initialized = true;
|
||||
}
|
||||
*/
|
||||
int rocshmem_algorithm = InputFlags::GetValueInt("rocshmem_algorithm");
|
||||
@@ -915,11 +915,11 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
printf("Using put/get hybrid intra-kernel algorithm\n");
|
||||
break;
|
||||
default:
|
||||
printf("Unknown ROC_SHMEM algoirthm\n");
|
||||
printf("Unknown rocSHMEM algorithm\n");
|
||||
exit(-1);
|
||||
}
|
||||
size_t LDS_size;
|
||||
roc_shmem_dynamic_shared(&LDS_size);
|
||||
rocshmem_dynamic_shared(&LDS_size);
|
||||
printf("Work size %zu, wg size %d num workgroups %d LDS %zu thisPE %d Global %d \n", global_work_size, total_workitems_per_workgroup, num_of_workgroups, LDS_size, this->Get_this_pe(), this->Get_total_pes());
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
hipEventRecord(event_array[0], NULL);
|
||||
@@ -969,7 +969,7 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
hipEventRecord(event_array[1], NULL);
|
||||
hipEventSynchronize(event_array[1]);
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
// Wait for any outstanding network messages to finish up. We
|
||||
// can have straggler updates to the doneArray that we don't
|
||||
// have any dependencies for but we still eed it to finish so
|
||||
@@ -1000,7 +1000,7 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
this->GPU->CopyToHost(numRowsAtLevelDev, numRowsAtLevel, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL);
|
||||
this->GPU->Flush();
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
// Combine global statistics
|
||||
MPI_Allreduce(MPI_IN_PLACE, (void *) &maxDepth, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(MPI_IN_PLACE, (void *) &totalSpin, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
|
||||
@@ -1114,8 +1114,8 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
|
||||
uint32_t current_iteration = 0;
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n");
|
||||
#ifdef USE_ROCSHMEM
|
||||
fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
|
||||
exit(-1);
|
||||
#endif
|
||||
|
||||
@@ -1215,8 +1215,8 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
level_sync_cutoff = 81920;
|
||||
}
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n");
|
||||
#ifdef USE_ROCSHMEM
|
||||
fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
|
||||
exit(-1);
|
||||
#endif
|
||||
|
||||
@@ -1346,8 +1346,8 @@ float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64
|
||||
// Number of levels is maxDepth. */
|
||||
levelset_iter++;
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
fprintf(stderr, "ROC_SHMEM not supported for selected algorithm\n");
|
||||
#ifdef USE_ROCSHMEM
|
||||
fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
|
||||
exit(-1);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -77,8 +77,8 @@ class SparseMatrix
|
||||
d_vals = NULL;
|
||||
d_row_ptrs = NULL;
|
||||
|
||||
this_pe = -1;//roc_shmem_my_pe(handle); // this pe
|
||||
total_pes = -1;//roc_shmem_n_pes(handle); // total number of pes
|
||||
this_pe = -1;//rocshmem_my_pe(handle); // this pe
|
||||
total_pes = -1;//rocshmem_n_pes(handle); // total number of pes
|
||||
|
||||
}
|
||||
void AllocateSparseMatrix(MatrixMarketReader<FloatType> &mm_reader,
|
||||
|
||||
@@ -5,7 +5,7 @@ src_path=$(dirname "$(realpath $0)")/..
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DUSE_ROC_SHMEM=OFF \
|
||||
-DUSE_ROCSHMEM=OFF \
|
||||
-DUSE_HIP=ON \
|
||||
-DALL_ANALYZE=ON \
|
||||
-DUSE_DOUBLE=OFF \
|
||||
|
||||
@@ -5,7 +5,7 @@ src_path=$(dirname "$(realpath $0)")/..
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DUSE_ROC_SHMEM=OFF \
|
||||
-DUSE_ROCSHMEM=OFF \
|
||||
-DUSE_HIP=OFF \
|
||||
-DALL_ANALYZE=ON \
|
||||
-DUSE_DOUBLE=OFF \
|
||||
|
||||
@@ -12,7 +12,7 @@ src_path=$(dirname "$(realpath $0)")/..
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DUSE_ROC_SHMEM=ON \
|
||||
-DUSE_ROCSHMEM=ON \
|
||||
-DUSE_HIP=ON \
|
||||
-DALL_ANALYZE=ON \
|
||||
-DUSE_DOUBLE=OFF \
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#cmakedefine USE_ROC_SHMEM
|
||||
#cmakedefine USE_ROCSHMEM
|
||||
#cmakedefine USE_HIP
|
||||
#cmakedefine ALL_ANALYZE
|
||||
#cmakedefine USE_DOUBLE
|
||||
|
||||
@@ -26,8 +26,8 @@
|
||||
#include <hip/math_functions.h>
|
||||
#include <hip/device_functions.h>
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#include "roc_shmem.hpp"
|
||||
#ifdef USE_ROCSHMEM
|
||||
#include "rocshmem.hpp"
|
||||
using namespace rocshmem;
|
||||
#endif
|
||||
|
||||
@@ -991,7 +991,7 @@ inline FPTYPE cross_lane_reduction_three(FPTYPE temp_sum, unsigned int *row_max_
|
||||
__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
|
||||
amd_spts_analyze_and_solve(
|
||||
const size_t global_work_size,
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
const int this_pe,
|
||||
const int total_pes,
|
||||
unsigned int * __restrict__ shadowDoneArray,
|
||||
@@ -1002,9 +1002,9 @@ amd_spts_analyze_and_solve(
|
||||
// 1: Naive gets
|
||||
// 2: blocked puts
|
||||
// 3: put/get hybrid
|
||||
int roc_shmem_algorithm,
|
||||
int roc_shmem_put_block_size,
|
||||
int roc_shmem_get_backoff_factor,
|
||||
int rocshmem_algorithm,
|
||||
int rocshmem_put_block_size,
|
||||
int rocshmem_get_backoff_factor,
|
||||
int spts_block_size,
|
||||
#endif
|
||||
const FPTYPE * __restrict__ vals,
|
||||
@@ -1043,13 +1043,13 @@ amd_spts_analyze_and_solve(
|
||||
const unsigned int wg_lid = hipThreadIdx_x;
|
||||
const unsigned int lid = wg_lid % WF_SIZE;
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
__shared__ roc_shmem_ctx_t ctx;
|
||||
#ifdef USE_ROCSHMEM
|
||||
__shared__ rocshmem_ctx_t ctx;
|
||||
|
||||
|
||||
//if (wg_lid == OUTPUT_THREAD) {
|
||||
roc_shmem_wg_init();
|
||||
roc_shmem_wg_ctx_create(ROC_SHMEM_CTX_WG_PRIVATE, &ctx);
|
||||
rocshmem_wg_init();
|
||||
rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx);
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
@@ -1061,7 +1061,7 @@ amd_spts_analyze_and_solve(
|
||||
// Actual row this wavefront will work on.
|
||||
const unsigned int local_row = local_first_row + local_offset;
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
// Get the global row for this wavefront assuming a row-cyclic
|
||||
// decomposition. Basically we need to account for other PEs here.
|
||||
int local_block_id = local_row / spts_block_size;
|
||||
@@ -1144,7 +1144,7 @@ amd_spts_analyze_and_solve(
|
||||
// While there are threads in this workgroup that have been unable to
|
||||
// get their input, loop and wait for the flag to exist.
|
||||
__asm__ volatile ("s_setprio 0");
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
int target_pe = (local_col / spts_block_size) % total_pes;
|
||||
int backoff_counter = 0;
|
||||
bool need_remote_notify = true;
|
||||
@@ -1179,8 +1179,8 @@ amd_spts_analyze_and_solve(
|
||||
|
||||
spin_times++;
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
if ((total_pes > 1) && (target_pe != this_pe) && (roc_shmem_algorithm == 1)) {
|
||||
#ifdef USE_ROCSHMEM
|
||||
if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 1)) {
|
||||
if (first_time) {
|
||||
if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0)
|
||||
need_comm = false;
|
||||
@@ -1188,12 +1188,12 @@ amd_spts_analyze_and_solve(
|
||||
first_time = false;
|
||||
if (need_comm)
|
||||
{
|
||||
for (int i = 0; i < (backoff_counter * roc_shmem_get_backoff_factor); i++)
|
||||
for (int i = 0; i < (backoff_counter * rocshmem_get_backoff_factor); i++)
|
||||
__asm__ volatile("s_sleep 127");
|
||||
|
||||
|
||||
roc_shmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
|
||||
//roc_shmem_ctx_quiet(ctx);
|
||||
rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
|
||||
//rocshmem_ctx_quiet(ctx);
|
||||
|
||||
__asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n"
|
||||
"s_waitcnt vmcnt(0)"
|
||||
@@ -1203,7 +1203,7 @@ amd_spts_analyze_and_solve(
|
||||
|
||||
if (local_done)
|
||||
{
|
||||
roc_shmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
|
||||
rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
|
||||
|
||||
__asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP
|
||||
:
|
||||
@@ -1217,19 +1217,19 @@ amd_spts_analyze_and_solve(
|
||||
}
|
||||
}
|
||||
|
||||
if ((total_pes > 1) && (target_pe != this_pe) && (roc_shmem_algorithm == 3)) {
|
||||
if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 3)) {
|
||||
if (need_remote_notify) {
|
||||
need_remote_notify = false;
|
||||
//if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0)
|
||||
//if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) == 0)
|
||||
{
|
||||
roc_shmem_ctx_putmem_nbi(ctx, &reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe);
|
||||
rocshmem_ctx_putmem_nbi(ctx, &reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe);
|
||||
//printf("Put 111 blockIDx %d threadID %d target_pe %d local_col %d oneBuf[0]= %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, oneBuf[0]);
|
||||
|
||||
roc_shmem_ctx_fence(ctx);
|
||||
rocshmem_ctx_fence(ctx);
|
||||
//printf("fence 222 blockIDx %d threadID %d target_pe %d local_col %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col);
|
||||
roc_shmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
|
||||
roc_shmem_ctx_quiet(ctx);
|
||||
rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
//printf("Get 333 blockIDx %d threadID %d target_pe %d local_col %d shadowDone %d \n \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, shadowDoneArray[local_col]);
|
||||
|
||||
__asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n"
|
||||
@@ -1239,8 +1239,8 @@ amd_spts_analyze_and_solve(
|
||||
|
||||
if (local_done)
|
||||
{
|
||||
roc_shmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
|
||||
roc_shmem_ctx_quiet(ctx);
|
||||
rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
__asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP
|
||||
:
|
||||
: "v"(&doneArray[local_col]),
|
||||
@@ -1313,34 +1313,34 @@ amd_spts_analyze_and_solve(
|
||||
__asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(row_max_depth));
|
||||
asm volatile ("s_waitcnt vmcnt(0)\n\t");
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
if (roc_shmem_algorithm == 2 && total_pes > 1) {
|
||||
int CHUNK = roc_shmem_put_block_size;
|
||||
#ifdef USE_ROCSHMEM
|
||||
if (rocshmem_algorithm == 2 && total_pes > 1) {
|
||||
int CHUNK = rocshmem_put_block_size;
|
||||
bool sendTime = true;
|
||||
int row_base = (row / CHUNK) * CHUNK;
|
||||
int num_done = atomicAdd(&shadowDoneArray[row_base], 1);
|
||||
sendTime = (num_done == (CHUNK - 1));
|
||||
for(int p=0; p<total_pes; p++){
|
||||
if(p != this_pe && sendTime){
|
||||
roc_shmem_ctx_putmem_nbi(ctx, &out_y[row_base], &out_y[row_base], sizeof(FPTYPE) * CHUNK, p);
|
||||
roc_shmem_ctx_fence(ctx);
|
||||
roc_shmem_ctx_putmem_nbi(ctx, &doneArray[row_base], &doneArray[row_base], sizeof(int) * CHUNK, p);
|
||||
roc_shmem_ctx_quiet(ctx);
|
||||
rocshmem_ctx_putmem_nbi(ctx, &out_y[row_base], &out_y[row_base], sizeof(FPTYPE) * CHUNK, p);
|
||||
rocshmem_ctx_fence(ctx);
|
||||
rocshmem_ctx_putmem_nbi(ctx, &doneArray[row_base], &doneArray[row_base], sizeof(int) * CHUNK, p);
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (roc_shmem_algorithm == 0) {
|
||||
if (rocshmem_algorithm == 0) {
|
||||
for(int p=0; p<total_pes; p++){
|
||||
if(p != this_pe){
|
||||
roc_shmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
|
||||
roc_shmem_ctx_fence(ctx);
|
||||
roc_shmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
|
||||
rocshmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
|
||||
rocshmem_ctx_fence(ctx);
|
||||
rocshmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (roc_shmem_algorithm == 3) {
|
||||
if (rocshmem_algorithm == 3) {
|
||||
// Only broadcast update if another node explicitly registered for this row. TODO:
|
||||
// Make 2D array to scale
|
||||
unsigned int need_broadcast;
|
||||
@@ -1349,9 +1349,9 @@ amd_spts_analyze_and_solve(
|
||||
if (need_broadcast == 1) {
|
||||
for(int p=0; p<total_pes; p++) {
|
||||
if (p != this_pe) {
|
||||
roc_shmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
|
||||
roc_shmem_ctx_fence(ctx);
|
||||
roc_shmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
|
||||
rocshmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
|
||||
rocshmem_ctx_fence(ctx);
|
||||
rocshmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1373,11 +1373,11 @@ amd_spts_analyze_and_solve(
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef USE_ROC_SHMEM
|
||||
#ifdef USE_ROCSHMEM
|
||||
__syncthreads();
|
||||
//if (wg_lid == OUTPUT_THREAD)
|
||||
roc_shmem_wg_ctx_destroy(ctx);
|
||||
roc_shmem_wg_finalize();
|
||||
rocshmem_wg_ctx_destroy(ctx);
|
||||
rocshmem_wg_finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
+2
-2
@@ -184,11 +184,11 @@ pipeline {
|
||||
steps {
|
||||
dir("clients/functional_tests") {
|
||||
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC true'
|
||||
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC true'
|
||||
}
|
||||
//dir("internal/clients/spts") {
|
||||
// sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
|
||||
// sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
|
||||
// sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
+3
-3
@@ -396,14 +396,14 @@ pipeline {
|
||||
steps {
|
||||
dir("clients/functional_tests") {
|
||||
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
|
||||
}
|
||||
dir("clients/sos_tests") {
|
||||
sh 'ROC_SHMEM_RO=1 ./driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI'
|
||||
sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI'
|
||||
}
|
||||
dir("internal/clients/spts") {
|
||||
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+2
-2
@@ -321,11 +321,11 @@ pipeline {
|
||||
steps {
|
||||
dir("clients/functional_tests") {
|
||||
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
|
||||
}
|
||||
dir("internal/clients/spts") {
|
||||
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+6
-6
@@ -15,9 +15,9 @@ pipeline {
|
||||
UCX_WARN_UNUSED_ENV_VARS="n"
|
||||
HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
UCX_TLS="rc"
|
||||
ROC_SHMEM_USE_SQ_GPU_MEM=0
|
||||
ROC_SHMEM_USE_CQ_GPU_MEM=0
|
||||
ROC_SHMEM_NUM_BLOCKS=128
|
||||
ROCSHMEM_USE_SQ_GPU_MEM=0
|
||||
ROCSHMEM_USE_CQ_GPU_MEM=0
|
||||
ROCSHMEM_NUM_BLOCKS=128
|
||||
}
|
||||
stages {
|
||||
stage('Synchronize Source Code') {
|
||||
@@ -244,13 +244,13 @@ pipeline {
|
||||
stage('RO_NET') {
|
||||
steps {
|
||||
dir("clients/functional_tests") {
|
||||
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/rocshmem_example_driver ro ${build_dir}/RO_NET'
|
||||
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/rocshmem_example_driver ro ${build_dir}/RO_NET'
|
||||
}
|
||||
dir("clients/sos_tests") {
|
||||
sh 'ROC_SHMEM_RO=1 ./driver.sh ${build_dir}/RO_NET short ${build_dir}/RO_NET'
|
||||
sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RO_NET short ${build_dir}/RO_NET'
|
||||
}
|
||||
//dir("internal/clients/spts") {
|
||||
// sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/spts multi_thread ${build_dir}/RO_NET'
|
||||
// sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/spts multi_thread ${build_dir}/RO_NET'
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
+1
-1
@@ -141,7 +141,7 @@ pipeline {
|
||||
steps {
|
||||
dir("clients/functional_tests") {
|
||||
sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
|
||||
sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ ${BUILD}/sort_mpi: ${BUILD}/sort_mpi.o
|
||||
${BUILD}/sort_mpi.o: ${SRC}/sort_mpi.cu
|
||||
${HIPCC} $^ -I${MPI_HOME}/include -fgpu-rdc -o $@ -c
|
||||
|
||||
RO_FLAGS=ROC_SHMEM_RO=1 RO_NET_CPU_QUEUE=1
|
||||
RO_FLAGS=ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1
|
||||
ITERS?=0 1 2 3 4 5 6 7 8 9
|
||||
TIMEOUT=1m
|
||||
HOSTS=sv-pdp-0,sv-pdp-1,sv-pdp-2,sv-pdp-3
|
||||
|
||||
@@ -9,13 +9,13 @@ __device__ uint64_t timers[TIMERS] = {0};
|
||||
__device__ uint64_t time_start;
|
||||
#define TIMERS_START() \
|
||||
if(threadIdx.x == 0) {\
|
||||
time_start = roc_shmem_timer();\
|
||||
time_start = rocshmem_timer();\
|
||||
}
|
||||
|
||||
#define TIME(TIMER_NUM) \
|
||||
if(threadIdx.x == 0) {\
|
||||
timers[TIMER_NUM] = roc_shmem_timer() - time_start;\
|
||||
time_start = roc_shmem_timer();\
|
||||
timers[TIMER_NUM] = rocshmem_timer() - time_start;\
|
||||
time_start = rocshmem_timer();\
|
||||
}
|
||||
|
||||
#define OUTPUT_TIME() \
|
||||
@@ -268,14 +268,14 @@ void initGPU()
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
void *roc_shmem_malloc(size_t size)
|
||||
void *rocshmem_malloc(size_t size)
|
||||
{
|
||||
void *v;
|
||||
hipMalloc((void **)&v, size);
|
||||
return v;
|
||||
}
|
||||
|
||||
int roc_shmem_free(void *v)
|
||||
int rocshmem_free(void *v)
|
||||
{
|
||||
return hipFree(v);
|
||||
}
|
||||
@@ -316,14 +316,14 @@ int main(int argc, char *argv[])
|
||||
|
||||
// Init buffers
|
||||
int *keyBuffer1, *keyBuffer2;
|
||||
keyBuffer1 = (int*)roc_shmem_malloc(sizeof(int) * size);
|
||||
keyBuffer2 = (int*)roc_shmem_malloc(sizeof(int) * size * 4);
|
||||
keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
|
||||
keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
|
||||
|
||||
int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0;
|
||||
sendCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
|
||||
printf("Begin untimed run\n");
|
||||
// Untimed run
|
||||
@@ -369,12 +369,12 @@ int main(int argc, char *argv[])
|
||||
// Clean up
|
||||
hipFree(keys);
|
||||
hipFree(outputKeys);
|
||||
roc_shmem_free(keyBuffer1);
|
||||
roc_shmem_free(keyBuffer2);
|
||||
roc_shmem_free(sendCount);
|
||||
roc_shmem_free(recvCount);
|
||||
roc_shmem_free(sendOffset);
|
||||
roc_shmem_free(recvOffset);
|
||||
rocshmem_free(keyBuffer1);
|
||||
rocshmem_free(keyBuffer2);
|
||||
rocshmem_free(sendCount);
|
||||
rocshmem_free(recvCount);
|
||||
rocshmem_free(sendOffset);
|
||||
rocshmem_free(recvOffset);
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,13 +9,13 @@ __device__ uint64_t timers[TIMERS] = {0};
|
||||
__device__ uint64_t time_start;
|
||||
#define TIMERS_START() \
|
||||
if(threadIdx.x == 0) {\
|
||||
time_start = roc_shmem_timer();\
|
||||
time_start = rocshmem_timer();\
|
||||
}
|
||||
|
||||
#define TIME(TIMER_NUM) \
|
||||
if(threadIdx.x == 0) {\
|
||||
timers[TIMER_NUM] = roc_shmem_timer() - time_start;\
|
||||
time_start = roc_shmem_timer();\
|
||||
timers[TIMER_NUM] = rocshmem_timer() - time_start;\
|
||||
time_start = rocshmem_timer();\
|
||||
}
|
||||
|
||||
#define OUTPUT_TIME() \
|
||||
@@ -282,14 +282,14 @@ void initGPU(ncclComm_t &comms)
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
void *roc_shmem_malloc(size_t size)
|
||||
void *rocshmem_malloc(size_t size)
|
||||
{
|
||||
void *v;
|
||||
hipMalloc((void **)&v, size);
|
||||
return v;
|
||||
}
|
||||
|
||||
int roc_shmem_free(void *v)
|
||||
int rocshmem_free(void *v)
|
||||
{
|
||||
return hipFree(v);
|
||||
}
|
||||
@@ -331,14 +331,14 @@ int main(int argc, char *argv[])
|
||||
|
||||
// Init buffers
|
||||
int *keyBuffer1, *keyBuffer2;
|
||||
keyBuffer1 = (int*)roc_shmem_malloc(sizeof(int) * size);
|
||||
keyBuffer2 = (int*)roc_shmem_malloc(sizeof(int) * size * 4);
|
||||
keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
|
||||
keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
|
||||
|
||||
int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0;
|
||||
sendCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
|
||||
printf("Begin untimed run\n");
|
||||
// Untimed run
|
||||
@@ -382,13 +382,13 @@ int main(int argc, char *argv[])
|
||||
// Clean up
|
||||
hipFree(keys);
|
||||
hipFree(outputKeys);
|
||||
roc_shmem_free(keyBuffer1);
|
||||
roc_shmem_free(keyBuffer2);
|
||||
roc_shmem_free(sendCount);
|
||||
roc_shmem_free(recvCount);
|
||||
roc_shmem_free(sendOffset);
|
||||
roc_shmem_free(recvOffset);
|
||||
rocshmem_free(keyBuffer1);
|
||||
rocshmem_free(keyBuffer2);
|
||||
rocshmem_free(sendCount);
|
||||
rocshmem_free(recvCount);
|
||||
rocshmem_free(sendOffset);
|
||||
rocshmem_free(recvOffset);
|
||||
ncclCommDestroy(comms);
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <mpi.h>
|
||||
#include <roc_shmem/roc_shmem.hpp>
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
#include <unistd.h>
|
||||
using namespace std;
|
||||
using namespace rocshmem;
|
||||
@@ -16,13 +16,13 @@ __device__ uint64_t timers[TIMERS] = {0};
|
||||
__device__ uint64_t time_start;
|
||||
#define TIMERS_START() \
|
||||
if(threadIdx.x == 0) {\
|
||||
time_start = roc_shmem_timer();\
|
||||
time_start = rocshmem_timer();\
|
||||
}
|
||||
|
||||
#define TIME(TIMER_NUM) \
|
||||
if(threadIdx.x == 0) {\
|
||||
timers[TIMER_NUM] = roc_shmem_timer() - time_start;\
|
||||
time_start = roc_shmem_timer();\
|
||||
timers[TIMER_NUM] = rocshmem_timer() - time_start;\
|
||||
time_start = rocshmem_timer();\
|
||||
}
|
||||
|
||||
#define OUTPUT_TIME() \
|
||||
@@ -41,11 +41,11 @@ __device__ uint64_t time_start;
|
||||
#define OUTPUT_TIME()
|
||||
#endif
|
||||
|
||||
__device__ __inline__ void alltoall(roc_shmem_ctx_t &ctx,
|
||||
roc_shmem_team_t team,
|
||||
__device__ __inline__ void alltoall(rocshmem_ctx_t &ctx,
|
||||
rocshmem_team_t team,
|
||||
int *dst, int *src) {
|
||||
// Perform alltoall
|
||||
roc_shmem_ctx_int_wg_alltoall(ctx,
|
||||
rocshmem_ctx_int_wg_alltoall(ctx,
|
||||
team,
|
||||
dst, // T* dest
|
||||
src, // const T* source
|
||||
@@ -56,18 +56,18 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
|
||||
int *keyBuffer2, int *sendCount,
|
||||
int *recvCount, int *sendOffset,
|
||||
int *recvOffset, int *outputKeys,
|
||||
size_t size, roc_shmem_team_t team,
|
||||
size_t size, rocshmem_team_t team,
|
||||
int max_iters) {
|
||||
__shared__ roc_shmem_ctx_t ctx;
|
||||
__shared__ rocshmem_ctx_t ctx;
|
||||
__shared__ int bucketCounter[MAX_PES];
|
||||
__shared__ int bucketPtr[MAX_PES];
|
||||
__shared__ int total_size;
|
||||
|
||||
roc_shmem_wg_init();
|
||||
roc_shmem_wg_ctx_create(ROC_SHMEM_CTX_WG_PRIVATE, &ctx);
|
||||
rocshmem_wg_init();
|
||||
rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx);
|
||||
|
||||
int n_pes = roc_shmem_ctx_n_pes(ctx);
|
||||
int my_pe = roc_shmem_my_pe();
|
||||
int n_pes = rocshmem_ctx_n_pes(ctx);
|
||||
int my_pe = rocshmem_my_pe();
|
||||
int buckets = n_pes;
|
||||
|
||||
int tid = threadIdx.x; // + blockDim.x * blockIdx.x;
|
||||
@@ -116,9 +116,9 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
|
||||
int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1;
|
||||
keyBuffer1[loc] = keys[i];
|
||||
}
|
||||
roc_shmem_ctx_threadfence_system(ctx);
|
||||
rocshmem_ctx_threadfence_system(ctx);
|
||||
// Force sync to wait for all PEs to update bucket sizes
|
||||
roc_shmem_ctx_wg_team_sync(ctx, team);
|
||||
rocshmem_ctx_wg_team_sync(ctx, team);
|
||||
TIME(3)
|
||||
// Let all PEs know how many keys you wish to send
|
||||
alltoall(ctx, team, recvCount, sendCount);
|
||||
@@ -129,11 +129,11 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
|
||||
if(threadIdx.x == 0) {
|
||||
total_size = 0;
|
||||
for(int i = 0; i < buckets; ++i) {
|
||||
roc_shmem_int_get_nbi(&keyBuffer2[total_size],
|
||||
rocshmem_int_get_nbi(&keyBuffer2[total_size],
|
||||
&keyBuffer1[recvOffset[i]], recvCount[i], i);
|
||||
total_size += recvCount[i];
|
||||
}
|
||||
roc_shmem_quiet();
|
||||
rocshmem_quiet();
|
||||
}
|
||||
for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x)
|
||||
outputKeys[i] = 0;
|
||||
@@ -163,14 +163,14 @@ __global__ void sort(volatile int *keys, int *keyBuffer1,
|
||||
TIME(7)
|
||||
}
|
||||
OUTPUT_TIME()
|
||||
roc_shmem_wg_ctx_destroy(ctx);
|
||||
roc_shmem_wg_finalize();
|
||||
rocshmem_wg_ctx_destroy(ctx);
|
||||
rocshmem_wg_finalize();
|
||||
}
|
||||
|
||||
bool verify(int *outputKeys, int *keyBuffer2, size_t size)
|
||||
{
|
||||
int num_pes = roc_shmem_n_pes();
|
||||
int my_pe = roc_shmem_my_pe();
|
||||
int num_pes = rocshmem_n_pes();
|
||||
int my_pe = rocshmem_my_pe();
|
||||
|
||||
MPI_Status status;
|
||||
MPI_Request request;
|
||||
@@ -228,8 +228,8 @@ void initGPU()
|
||||
{
|
||||
// Calculation for local rank, taken from rccl-tests
|
||||
int localRank = 0;
|
||||
int proc = roc_shmem_my_pe();
|
||||
int nProcs = roc_shmem_n_pes();
|
||||
int proc = rocshmem_my_pe();
|
||||
int nProcs = rocshmem_n_pes();
|
||||
char hostname[1024];
|
||||
gethostname(hostname, 1024);
|
||||
for (int i=0; i< 1024; i++) {
|
||||
@@ -261,12 +261,12 @@ void initGPU()
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
// Init roc_shmem stuff
|
||||
// Init rocshmem stuff
|
||||
initGPU();
|
||||
roc_shmem_init(NUM_WGS);
|
||||
int n_pes = roc_shmem_team_n_pes(ROC_SHMEM_TEAM_WORLD);
|
||||
roc_shmem_team_t team_world_dup = ROC_SHMEM_TEAM_INVALID;
|
||||
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD,
|
||||
rocshmem_init(NUM_WGS);
|
||||
int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD);
|
||||
rocshmem_team_t team_world_dup = ROCSHMEM_TEAM_INVALID;
|
||||
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD,
|
||||
0,
|
||||
1,
|
||||
n_pes,
|
||||
@@ -278,8 +278,8 @@ int main(int argc, char *argv[])
|
||||
if(argc > 1)
|
||||
iterations = atoi(argv[1]);
|
||||
|
||||
int num_pes = roc_shmem_n_pes();
|
||||
int my_pe = roc_shmem_my_pe();
|
||||
int num_pes = rocshmem_n_pes();
|
||||
int my_pe = rocshmem_my_pe();
|
||||
|
||||
// Configure input and outputs
|
||||
size_t size = 1024; //atoi(argv[2]);
|
||||
@@ -298,17 +298,17 @@ int main(int argc, char *argv[])
|
||||
|
||||
// Init buffers
|
||||
int *keyBuffer1, *keyBuffer2;
|
||||
keyBuffer1 = (int*)roc_shmem_malloc(sizeof(int) * size);
|
||||
keyBuffer2 = (int*)roc_shmem_malloc(sizeof(int) * size * 4);
|
||||
keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
|
||||
keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
|
||||
|
||||
int *sendCount, *recvCount, *sendOffset, *recvOffset;
|
||||
sendCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvCount = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvOffset = (int*)roc_shmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
|
||||
|
||||
// Untimed run
|
||||
roc_shmem_barrier_all();
|
||||
rocshmem_barrier_all();
|
||||
sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2,
|
||||
sendCount, recvCount, sendOffset, recvOffset,
|
||||
outputKeys, size, team_world_dup, 1);
|
||||
@@ -321,7 +321,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
// Timed run
|
||||
roc_shmem_barrier_all();
|
||||
rocshmem_barrier_all();
|
||||
auto time_start = TIME_NOW;
|
||||
sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2,
|
||||
sendCount, recvCount, sendOffset, recvOffset,
|
||||
@@ -347,12 +347,12 @@ int main(int argc, char *argv[])
|
||||
// Clean up
|
||||
hipFree(keys);
|
||||
hipFree(outputKeys);
|
||||
roc_shmem_free(keyBuffer1);
|
||||
roc_shmem_free(keyBuffer2);
|
||||
roc_shmem_free(sendCount);
|
||||
roc_shmem_free(recvCount);
|
||||
roc_shmem_free(sendOffset);
|
||||
roc_shmem_free(recvOffset);
|
||||
roc_shmem_finalize();
|
||||
rocshmem_free(keyBuffer1);
|
||||
rocshmem_free(keyBuffer2);
|
||||
rocshmem_free(sendCount);
|
||||
rocshmem_free(recvCount);
|
||||
rocshmem_free(sendOffset);
|
||||
rocshmem_free(recvOffset);
|
||||
rocshmem_finalize();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
+130
-130
@@ -47,88 +47,88 @@ case $2 in
|
||||
###########################################################################
|
||||
*"serial")
|
||||
echo "get_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
|
||||
check get_n2_w1_z1_1MB
|
||||
echo "getnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
|
||||
check getnbi_n2_w1_z1_1MB
|
||||
echo "put_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
|
||||
check put_n2_w1_z1_1MB
|
||||
echo "putnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
|
||||
check putnbi_n2_w1_z1_1MB
|
||||
echo "wg_get_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 28 > $3/wg_get_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 28 > $3/wg_get_n2_w1_z64_1MB.log
|
||||
check wg_get_n2_w1_z1_1MB
|
||||
echo "wg_getnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_n2_w1_z64_1MB.log
|
||||
check wg_getnbi_n2_w1_z1_1MB
|
||||
echo "wg_put_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 30 > $3/wg_put_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 30 > $3/wg_put_n2_w1_z64_1MB.log
|
||||
check wg_put_n2_w1_z1_1MB
|
||||
echo "wg_putnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_n2_w1_z64_1MB.log
|
||||
check wg_putnbi_n2_w1_z1_1MB
|
||||
echo "wg_get_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 28 > $3/wg_get_tiled_n2_w2_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 28 > $3/wg_get_tiled_n2_w2_z64_1MB.log
|
||||
check wg_get_tiled_n2_w1_z1_1MB
|
||||
echo "wg_getnbi_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_tiled_n2_w2_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 29 > $3/wg_getnbi_tiled_n2_w2_z64_1MB.log
|
||||
check wg_getnbi_tiled_n2_w1_z1_1MB
|
||||
echo "wg_put_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 30 > $3/wg_put_tiled_n2_w2_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 30 > $3/wg_put_tiled_n2_w2_z64_1MB.log
|
||||
check wg_put_tiled_n2_w1_z1_1MB
|
||||
echo "wg_putnbi_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_tiled_n2_w2_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 64 -s 1048576 -a 31 > $3/wg_putnbi_tiled_n2_w2_z64_1MB.log
|
||||
check wg_putnbi_tiled_n2_w1_z1_1MB
|
||||
echo "wave_get_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 32 > $3/wave_get_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 32 > $3/wave_get_n2_w1_z64_1MB.log
|
||||
check wave_get_n2_w1_z1_1MB
|
||||
echo "wave_getnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 33 > $3/wave_getnbi_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 33 > $3/wave_getnbi_n2_w1_z64_1MB.log
|
||||
check wave_getnbi_n2_w1_z1_1MB
|
||||
echo "wave_put_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 34 > $3/wave_put_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 34 > $3/wave_put_n2_w1_z64_1MB.log
|
||||
check wave_put_n2_w1_z1_1MB
|
||||
echo "wave_putnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 35 > $3/wave_putnbi_n2_w1_z64_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 1048576 -a 35 > $3/wave_putnbi_n2_w1_z64_1MB.log
|
||||
check wave_putnbi_n2_w1_z1_1MB
|
||||
echo "wave_get_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 32 > $3/wave_get_tiled_n2_w2_z128_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 32 > $3/wave_get_tiled_n2_w2_z128_1MB.log
|
||||
check wave_get_tiled_n2_w1_z1_1MB
|
||||
echo "wave_getnbi_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 33 > $3/wave_getnbi_tiled_n2_w2_z128_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 33 > $3/wave_getnbi_tiled_n2_w2_z128_1MB.log
|
||||
check wave_getnbi_tiled_n2_w1_z1_1MB
|
||||
echo "wave_put_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 34 > $3/wave_put_tiled_n2_w2_z128_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 34 > $3/wave_put_tiled_n2_w2_z128_1MB.log
|
||||
check wave_put_tiled_n2_w1_z1_1MB
|
||||
echo "wave_putnbi_tiled_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 35 > $3/wave_putnbi_tiled_n2_w2_z128_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 2 $1 -w 2 -z 128 -s 1048576 -a 35 > $3/wave_putnbi_tiled_n2_w2_z128_1MB.log
|
||||
check wave_putnbi_tiled_n2_w1_z1_1MB
|
||||
echo "amofadd_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
|
||||
check amofadd_n2_w1_z1
|
||||
echo "amofinc_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
|
||||
check amofinc_n2_w1_z1
|
||||
echo "amofetch_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
|
||||
check amofetch_n2_w1_z1
|
||||
echo "amofcswap_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
|
||||
check amofcswap_n2_w1_z1
|
||||
echo "amoadd_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
|
||||
check amoadd_n2_w1_z1
|
||||
echo "amoinc_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
|
||||
check amoinc_n2_w1_z1
|
||||
# echo "pingpong_n2_w1"
|
||||
# ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
|
||||
# ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
|
||||
# check pingpong_n2_w1
|
||||
echo "amoset_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
|
||||
check amoset_n2_w1_z1
|
||||
;;
|
||||
|
||||
@@ -137,88 +137,88 @@ case $2 in
|
||||
###########################################################################
|
||||
*"short")
|
||||
echo "get_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
|
||||
check get_n2_w16_z128_8B
|
||||
echo "getnbi_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
|
||||
check getnbi_n2_w16_z128_8B
|
||||
echo "put_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
|
||||
check put_n2_w16_z128_8B
|
||||
echo "putnbi_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
|
||||
check putnbi_n2_w16_z128_8B
|
||||
echo "wg_get_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 28 > $3/wg_get_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 28 > $3/wg_get_n2_w1_z64_8B.log
|
||||
check wg_get_n2_w1_z64_8B
|
||||
echo "wg_getnbi_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 29 > $3/wg_getnbi_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 29 > $3/wg_getnbi_n2_w1_z64_8B.log
|
||||
check wg_getnbi_n2_w1_z64_8B
|
||||
echo "wg_put_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 30 > $3/wg_put_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 30 > $3/wg_put_n2_w1_z64_8B.log
|
||||
check wg_put_n2_w1_z64_8B
|
||||
echo "wg_putnbi_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 31 > $3/wg_putnbi_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 31 > $3/wg_putnbi_n2_w1_z64_8B.log
|
||||
check wg_putnbi_n2_w1_z64_8B
|
||||
echo "wg_get_tiled_n2_w16_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 28 > $3/wg_get_tiled_n2_w16_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 28 > $3/wg_get_tiled_n2_w16_z64_8B.log
|
||||
check wg_get_tiled_n2_w16_z64_8B
|
||||
echo "wg_getnbi_tiled_n2_w16_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 29 > $3/wg_getnbi_tiled_n2_w16_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 29 > $3/wg_getnbi_tiled_n2_w16_z64_8B.log
|
||||
check wg_getnbi_tiled_n2_w16_z64_8B
|
||||
echo "wg_put_tiled_n2_w16_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 30 > $3/wg_put_tiled_n2_w16_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 30 > $3/wg_put_tiled_n2_w16_z64_8B.log
|
||||
check wg_put_tiled_n2_w16_z64_8B
|
||||
echo "wg_putnbi_tiled_n2_w16_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 31 > $3/wg_putnbi_tiled_n2_w16_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 64 -s 8 -a 31 > $3/wg_putnbi_tiled_n2_w16_z64_8B.log
|
||||
check wg_putnbi_tiled_n2_w16_z64_8B
|
||||
echo "wave_get_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 32 > $3/wave_get_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 32 > $3/wave_get_n2_w1_z64_8B.log
|
||||
check wave_get_n2_w1_z64_8B
|
||||
echo "wave_getnbi_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 33 > $3/wave_getnbi_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 33 > $3/wave_getnbi_n2_w1_z64_8B.log
|
||||
check wave_getnbi_n2_w1_z64_8B
|
||||
echo "wave_put_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 34 > $3/wave_put_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 34 > $3/wave_put_n2_w1_z64_8B.log
|
||||
check wave_put_n2_w1_z64_8B
|
||||
echo "wave_putnbi_n2_w1_z64_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 35 > $3/wave_putnbi_n2_w1_z64_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 64 -s 8 -a 35 > $3/wave_putnbi_n2_w1_z64_8B.log
|
||||
check wave_putnbi_n2_w1_z64_8B
|
||||
echo "wave_get_tiled_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 32 > $3/wave_get_tiled_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 32 > $3/wave_get_tiled_n2_w16_z128_8B.log
|
||||
check wave_get_tiled_n2_w16_z128_8B
|
||||
echo "wave_getnbi_tiled_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 33 > $3/wave_getnbi_tiled_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 33 > $3/wave_getnbi_tiled_n2_w16_z128_8B.log
|
||||
check wave_getnbi_tiled_n2_w16_z128_8B
|
||||
echo "wave_put_tiled_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 34 > $3/wave_put_tiled_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 34 > $3/wave_put_tiled_n2_w16_z128_8B.log
|
||||
check wave_put_tiled_n2_w16_z128_8B
|
||||
echo "wave_putnbi_tiled_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 35 > $3/wave_putnbi_tiled_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 35 > $3/wave_putnbi_tiled_n2_w16_z128_8B.log
|
||||
check wave_putnbi_tiled_n2_w16_z128_8B
|
||||
echo "amofadd_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
|
||||
check amofadd_n2_w8_z1
|
||||
echo "amofinc_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
|
||||
check amofinc_n2_w8_z1
|
||||
echo "amofetch_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
|
||||
check amofetch_n2_w8_z1
|
||||
echo "amofcswap_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
|
||||
check amofcswap_n2_w8_z1
|
||||
echo "amoadd_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
|
||||
check amoadd_n2_w8_z1
|
||||
echo "amoinc_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
|
||||
check amoinc_n2_w8_z1
|
||||
# echo "pingpong_n2_w1"
|
||||
# ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
|
||||
# ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
|
||||
# check pingpong_n2_w1
|
||||
echo "amoset_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
|
||||
check amoset_n2_w8_z1
|
||||
;;
|
||||
|
||||
@@ -228,246 +228,246 @@ case $2 in
|
||||
*"exhaustive")
|
||||
############################### GET ###################################
|
||||
echo "get_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 0 > $3/get_n2_w1_z1_1MB.log
|
||||
check get_n2_w1_z1_1MB
|
||||
echo "get_n2_w1_z1024_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 0 > $3/get_n2_w1_z1024_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 0 > $3/get_n2_w1_z1024_512B.log
|
||||
check get_n2_w1_z1024_512B
|
||||
echo "get_n2_w8_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 0 > $3/get_n2_w8_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 0 > $3/get_n2_w8_z1_1MB.log
|
||||
check get_n2_w8_z1_1MB
|
||||
echo "get_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 0 > $3/get_n2_w16_z128_8B.log
|
||||
check get_n2_w16_z128_8B
|
||||
echo "get_n2_w32_z256_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 0 > $3/get_n2_w32_z256_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 0 > $3/get_n2_w32_z256_512B.log
|
||||
check get_n2_w32_z256_512B
|
||||
echo "get_n2_w64_z1024_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 0 > $3/get_n2_w64_z1024_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 0 > $3/get_n2_w64_z1024_8B.log
|
||||
check get_n2_w64_z1024_8B
|
||||
############################### GETNBI ################################
|
||||
echo "getnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w1_z1_1MB.log
|
||||
check getnbi_n2_w1_z1_1MB
|
||||
echo "getnbi_n2_w1_z1024_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 1 > $3/getnbi_n2_w1_z1024_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 1 > $3/getnbi_n2_w1_z1024_512B.log
|
||||
check getnbi_n2_w1_z1024_512B
|
||||
echo "getnbi_n2_w8_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w8_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 1 > $3/getnbi_n2_w8_z1_1MB.log
|
||||
check getnbi_n2_w8_z1_1MB
|
||||
echo "getnbi_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 1 > $3/getnbi_n2_w16_z128_8B.log
|
||||
check getnbi_n2_w16_z128_8B
|
||||
echo "getnbi_n2_w32_z256_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 1 > $3/getnbi_n2_w32_z256_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 1 > $3/getnbi_n2_w32_z256_512B.log
|
||||
check getnbi_n2_w32_z256_512B
|
||||
echo "getnbi_n2_w64_z1024_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 1 > $3/getnbi_n2_w64_z1024_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 1 > $3/getnbi_n2_w64_z1024_8B.log
|
||||
check getnbi_n2_w64_z1024_8B
|
||||
############################### PUT ###################################
|
||||
echo "put_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 2 > $3/put_n2_w1_z1_1MB.log
|
||||
check put_n2_w1_z1_1MB
|
||||
echo "put_n2_w1_z1024_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 2 > $3/put_n2_w1_z1024_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 2 > $3/put_n2_w1_z1024_512B.log
|
||||
check put_n2_w1_z1024_512B
|
||||
echo "put_n2_w8_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 2 > $3/put_n2_w8_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 2 > $3/put_n2_w8_z1_1MB.log
|
||||
check put_n2_w8_z1_1MB
|
||||
echo "put_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 2 > $3/put_n2_w16_z128_8B.log
|
||||
check put_n2_w16_z128_8B
|
||||
echo "put_n2_w32_z256_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 2 > $3/put_n2_w32_z256_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 2 > $3/put_n2_w32_z256_512B.log
|
||||
check put_n2_w32_z256_512B
|
||||
echo "put_n2_w64_z1024_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 2 > $3/put_n2_w64_z1024_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 2 > $3/put_n2_w64_z1024_8B.log
|
||||
check put_n2_w64_z1024_8B
|
||||
############################### PUTNBI ################################
|
||||
echo "putnbi_n2_w1_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w1_z1_1MB.log
|
||||
check putnbi_n2_w1_z1_1MB
|
||||
echo "putnbi_n2_w1_z1024_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 3 > $3/putnbi_n2_w1_z1024_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -s 512 -a 3 > $3/putnbi_n2_w1_z1024_512B.log
|
||||
check putnbi_n2_w1_z1024_512B
|
||||
echo "putnbi_n2_w8_z1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w8_z1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 1048576 -a 3 > $3/putnbi_n2_w8_z1_1MB.log
|
||||
check putnbi_n2_w8_z1_1MB
|
||||
echo "putnbi_n2_w16_z128_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=16 mpirun -np 2 $1 -w 16 -z 128 -s 8 -a 3 > $3/putnbi_n2_w16_z128_8B.log
|
||||
check putnbi_n2_w16_z128_8B
|
||||
echo "putnbi_n2_w32_z256_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 3 > $3/putnbi_n2_w32_z256_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 256 -s 512 -a 3 > $3/putnbi_n2_w32_z256_512B.log
|
||||
check putnbi_n2_w32_z256_512B
|
||||
echo "putnbi_n2_w64_z1024_8B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 3 > $3/putnbi_n2_w64_z1024_8B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=64 mpirun -np 2 $1 -w 64 -z 1024 -s 8 -a 3 > $3/putnbi_n2_w64_z1024_8B.log
|
||||
check putnbi_n2_w64_z1024_8B
|
||||
############################# REDUCTION ##############################
|
||||
echo "reduction_n2_w1_z1_32K"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 32768 -a 5 > $3/reduction_n2_w1_z1_32K.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -s 32768 -a 5 > $3/reduction_n2_w1_z1_32K.log
|
||||
check reduction_n2_w1_z1_32K
|
||||
echo "reduction_n2_w8_z1_32K"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 32768 -a 5 > $3/reduction_n2_w8_z1_32K.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -s 32768 -a 5 > $3/reduction_n2_w8_z1_32K.log
|
||||
check reduction_n2_w8_z1_32K
|
||||
echo "reduction_n2_w32_z1_32K"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -s 32768 -a 5 > $3/reduction_n2_w32_z1_32K.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -s 32768 -a 5 > $3/reduction_n2_w32_z1_32K.log
|
||||
check reduction_n2_w32_z1_32K
|
||||
############################## AMOFADD ###############################
|
||||
echo "amofadd_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 6 > $3/amofadd_n2_w1_z1.log
|
||||
check amofadd_n2_w1_z1
|
||||
echo "amofadd_n2_w1_z1024"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 6 > $3/amofadd_n2_w1_z1024.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 6 > $3/amofadd_n2_w1_z1024.log
|
||||
check amofadd_n2_w1_z1024
|
||||
echo "amofadd_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 6 > $3/amofadd_n2_w8_z1.log
|
||||
check amofadd_n2_w8_z1
|
||||
echo "amofadd_n2_w32_z128"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 6 > $3/amofadd_n2_w32_z128.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 6 > $3/amofadd_n2_w32_z128.log
|
||||
check amofadd_n2_w32_z128
|
||||
############################## AMOFINC ###############################
|
||||
echo "amofinc_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 7 > $3/amofinc_n2_w1_z1.log
|
||||
check amofinc_n2_w1_z1
|
||||
echo "amofinc_n2_w1_z1024"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 7 > $3/amofinc_n2_w1_z1024.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 7 > $3/amofinc_n2_w1_z1024.log
|
||||
check amofinc_n2_w1_z1024
|
||||
echo "amofinc_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 7 > $3/amofinc_n2_w8_z1.log
|
||||
check amofinc_n2_w8_z1
|
||||
echo "amofinc_n2_w32_z128"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 7 > $3/amofinc_n2_w32_z128.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 7 > $3/amofinc_n2_w32_z128.log
|
||||
check amofinc_n2_w32_z128
|
||||
############################ AMOFETCH ################################
|
||||
echo "amofetch_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 8 > $3/amofetch_n2_w1_z1.log
|
||||
check amofetch_n2_w1_z1
|
||||
echo "amofetch_n2_w1_z1024"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 8 > $3/amofetch_n2_w1_z1024.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 8 > $3/amofetch_n2_w1_z1024.log
|
||||
check amofetch_n2_w1_z1024
|
||||
echo "amofetch_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 8 > $3/amofetch_n2_w8_z1.log
|
||||
check amofetch_n2_w8_z1
|
||||
echo "amofetch_n2_w32_z128"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 8 > $3/amofetch_n2_w32_z128.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 8 > $3/amofetch_n2_w32_z128.log
|
||||
check amofetch_n2_w32_z128
|
||||
########################### AMOFCSWAP ################################
|
||||
echo "amofcswap_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 9 > $3/amofcswap_n2_w1_z1.log
|
||||
check amofcswap_n2_w1_z1
|
||||
echo "amofcswap_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 9 > $3/amofcswap_n2_w8_z1.log
|
||||
check amofcswap_n2_w8_z1
|
||||
echo "amofcswap_n2_w32_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 9 > $3/amofcswap_n2_w32_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 9 > $3/amofcswap_n2_w32_z1.log
|
||||
check amofcswap_n2_w32_z1
|
||||
############################# AMOADD ################################
|
||||
echo "amoadd_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 10 > $3/amoadd_n2_w1_z1.log
|
||||
check amoadd_n2_w1_z1
|
||||
echo "amoadd_n2_w1_z1024"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 10 > $3/amoadd_n2_w1_z1024.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 10 > $3/amoadd_n2_w1_z1024.log
|
||||
check amoadd_n2_w1_z1024
|
||||
echo "amoadd_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 10 > $3/amoadd_n2_w8_z1.log
|
||||
check amoadd_n2_w8_z1
|
||||
echo "amoadd_n2_w32_z128"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 10 > $3/amoadd_n2_w32_z128.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 10 > $3/amoadd_n2_w32_z128.log
|
||||
check amoadd_n2_w32_z128
|
||||
############################# AMOINC ################################
|
||||
echo "amoinc_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 11 > $3/amoinc_n2_w1_z1.log
|
||||
check amoinc_n2_w1_z1
|
||||
echo "amoinc_n2_w1_z1024"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 11 > $3/amoinc_n2_w1_z1024.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1024 -a 11 > $3/amoinc_n2_w1_z1024.log
|
||||
check amoinc_n2_w1_z1024
|
||||
echo "amoinc_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 11 > $3/amoinc_n2_w8_z1.log
|
||||
check amoinc_n2_w8_z1
|
||||
echo "amoinc_n2_w32_z128"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 11 > $3/amoinc_n2_w32_z128.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 128 -a 11 > $3/amoinc_n2_w32_z128.log
|
||||
check amoinc_n2_w32_z128
|
||||
############################## INIT #################################
|
||||
echo "init_n2"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -a 13 > $3/init_n2.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -a 13 > $3/init_n2.log
|
||||
check init_n2
|
||||
########################### PINGPONG ################################
|
||||
echo "pingpong_n2_w1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 14 > $3/pingpong_n2_w1.log
|
||||
check pingpong_n2_w1
|
||||
echo "pingpong_n2_w8"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 14 > $3/pingpong_n2_w8.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 14 > $3/pingpong_n2_w8.log
|
||||
check pingpong_n2_w8
|
||||
echo "pingpong_n2_w32"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 14 > $3/pingpong_n2_w32.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 14 > $3/pingpong_n2_w32.log
|
||||
check pingpong_n2_w32
|
||||
############################ BARRIER ################################
|
||||
echo "barrier_n2_w1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 17 > $3/barrier_n2_w1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 17 > $3/barrier_n2_w1.log
|
||||
check barrier_n2_w1
|
||||
echo "barrier_n2_w8"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 17 > $3/barrier_n2_w8.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 17 > $3/barrier_n2_w8.log
|
||||
check barrier_n2_w8
|
||||
echo "barrier_n2_w32"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 17 > $3/barrier_n2_w32.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 17 > $3/barrier_n2_w32.log
|
||||
check barrier_n2_w32
|
||||
############################ SYNCALL ################################
|
||||
echo "syncall_n2_w1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 18 > $3/syncall_n2_w1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 18 > $3/syncall_n2_w1.log
|
||||
check syncall_n2_w1
|
||||
echo "syncall_n2_w8"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 18 > $3/syncall_n2_w8.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 18 > $3/syncall_n2_w8.log
|
||||
check syncall_n2_w8
|
||||
echo "syncall_n2_w32"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 18 > $3/syncall_n2_w32.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 18 > $3/syncall_n2_w32.log
|
||||
check syncall_n2_w32
|
||||
############################# SYNC ##################################
|
||||
echo "sync_n2_w1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 19 > $3/sync_n2_w1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -a 19 > $3/sync_n2_w1.log
|
||||
check sync_n2_w1
|
||||
echo "sync_n2_w8"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 19 > $3/sync_n2_w8.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -a 19 > $3/sync_n2_w8.log
|
||||
check sync_n2_w8
|
||||
echo "sync_n2_w32"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 19 > $3/sync_n2_w32.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -a 19 > $3/sync_n2_w32.log
|
||||
check sync_n2_w32
|
||||
########################### FCOLLECT ################################
|
||||
echo "fcollect_n2_w1_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 22 > $3/fcollect_n2_w1_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 22 > $3/fcollect_n2_w1_512B.log
|
||||
check fcollect_n2_w1_512B
|
||||
echo "fcollect_n2_w8_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 22 > $3/fcollect_n2_w8_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 22 > $3/fcollect_n2_w8_512B.log
|
||||
check fcollect_n2_w8_512B
|
||||
echo "fcollect_n2_w32_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 22 > $3/fcollect_n2_w32_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 22 > $3/fcollect_n2_w32_512B.log
|
||||
check fcollect_n2_w32_512B
|
||||
########################### ALLTOALL ################################
|
||||
echo "alltoall_n2_w1_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 23 > $3/alltoall_n2_w1_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 512 -a 23 > $3/alltoall_n2_w1_512B.log
|
||||
check alltoall_n2_w1_512B
|
||||
echo "alltoall_n2_w8_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 23 > $3/alltoall_n2_w8_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -s 512 -a 23 > $3/alltoall_n2_w8_512B.log
|
||||
check alltoall_n2_w8_512B
|
||||
echo "alltoall_n2_w32_512B"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 23 > $3/alltoall_n2_w32_512B.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -s 512 -a 23 > $3/alltoall_n2_w32_512B.log
|
||||
check alltoall_n2_w32_512B
|
||||
########################## TEAMGETNBI ###############################
|
||||
echo "teamgetnbi_n2_w1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 39 > $3/teamgetnbi_n2_w1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 39 > $3/teamgetnbi_n2_w1_1MB.log
|
||||
check teamgetnbi_n2_w1_1MB
|
||||
########################## TEAMPUTNBI ###############################
|
||||
echo "teamputnbi_n2_w1_1MB"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 41 > $3/teamputnbi_n2_w1_1MB.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -s 1048576 -a 41 > $3/teamputnbi_n2_w1_1MB.log
|
||||
check teamputnbi_n2_w1_1MB
|
||||
############################ AMOSET #################################
|
||||
echo "amoset_n2_w1_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=1 mpirun -np 2 $1 -w 1 -z 1 -a 44 > $3/amoset_n2_w1_z1.log
|
||||
check amoset_n2_w1_z1
|
||||
echo "amoset_n2_w8_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=8 mpirun -np 2 $1 -w 8 -z 1 -a 44 > $3/amoset_n2_w8_z1.log
|
||||
check amoset_n2_w8_z1
|
||||
echo "amoset_n2_w32_z1"
|
||||
ROC_SHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 44 > $3/amoset_n2_w32_z1.log
|
||||
ROCSHMEM_MAX_NUM_CONTEXTS=32 mpirun -np 2 $1 -w 32 -z 1 -a 44 > $3/amoset_n2_w32_z1.log
|
||||
check amoset_n2_w32_z1
|
||||
;;
|
||||
|
||||
|
||||
@@ -82,10 +82,10 @@ do
|
||||
# test exeuction based on lib
|
||||
if [ "$libnm" == "ro_net" ]
|
||||
then
|
||||
ROC_SHMEM_RO=1
|
||||
ROCSHMEM_RO=1
|
||||
ROC_NET_CPU_QUEUE=1
|
||||
UCX_TLS=rc
|
||||
#echo $ROC_SHMEM_RO"--"$ROC_NET_CPU_QUEUE "--"$UCX_TLS
|
||||
#echo $ROCSHMEM_RO"--"$ROC_NET_CPU_QUEUE "--"$UCX_TLS
|
||||
../scripts/functional_tests/driver.sh tests/functional_tests/rocshmem_example_driver $threadType .
|
||||
else
|
||||
../scripts/functional_tests/driver.sh tests/functional_tests/rocshmem_example_driver $threadType .
|
||||
|
||||
+2
-2
@@ -31,8 +31,8 @@ target_sources(
|
||||
context_host.cpp
|
||||
context_device.cpp
|
||||
mpi_init_singleton.cpp
|
||||
roc_shmem_gpu.cpp
|
||||
roc_shmem.cpp
|
||||
rocshmem_gpu.cpp
|
||||
rocshmem.cpp
|
||||
team.cpp
|
||||
team_tracker.cpp
|
||||
util.cpp
|
||||
|
||||
+2
-2
@@ -202,7 +202,7 @@ void Backend::reset_stats() {
|
||||
reset_backend_stats();
|
||||
}
|
||||
|
||||
__device__ bool Backend::create_ctx(int64_t option, roc_shmem_ctx_t* ctx) {
|
||||
__device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
|
||||
#ifdef USE_GPU_IB
|
||||
return static_cast<GPUIBBackend*>(this)->create_ctx(option, ctx);
|
||||
#elif defined(USE_RO)
|
||||
@@ -212,7 +212,7 @@ __device__ bool Backend::create_ctx(int64_t option, roc_shmem_ctx_t* ctx) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void Backend::destroy_ctx(roc_shmem_ctx_t* ctx) {
|
||||
__device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) {
|
||||
#ifdef USE_GPU_IB
|
||||
static_cast<GPUIBBackend*>(this)->destroy_ctx(ctx);
|
||||
#elif defined(USE_RO)
|
||||
|
||||
+6
-6
@@ -36,7 +36,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "backend_type.hpp"
|
||||
#include "ipc_policy.hpp"
|
||||
#include "memory/symmetric_heap.hpp"
|
||||
@@ -56,7 +56,7 @@ class TeamInfo;
|
||||
* It uses this state to populate Context objects which the GPU may use to
|
||||
* perform networking operations.
|
||||
*
|
||||
* The roc_shmem.cpp implementation file wraps many the Backend public
|
||||
* The rocshmem.cpp implementation file wraps many the Backend public
|
||||
* members to implement the library's public API.
|
||||
*/
|
||||
class Backend {
|
||||
@@ -74,8 +74,8 @@ class Backend {
|
||||
*/
|
||||
virtual ~Backend();
|
||||
|
||||
__device__ bool create_ctx(int64_t option, roc_shmem_ctx_t* ctx);
|
||||
__device__ void destroy_ctx(roc_shmem_ctx_t* ctx);
|
||||
__device__ bool create_ctx(int64_t option, rocshmem_ctx_t* ctx);
|
||||
__device__ void destroy_ctx(rocshmem_ctx_t* ctx);
|
||||
|
||||
/**
|
||||
* @brief Create a new team object and initialize it.
|
||||
@@ -93,14 +93,14 @@ class Backend {
|
||||
TeamInfo* team_info_wrt_parent,
|
||||
TeamInfo* team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t* new_team) = 0;
|
||||
rocshmem_team_t* new_team) = 0;
|
||||
|
||||
/**
|
||||
* @brief Destruct a team
|
||||
*
|
||||
* @param[in] team Handle to the team to destroy.
|
||||
*/
|
||||
virtual void team_destroy(roc_shmem_team_t team) = 0;
|
||||
virtual void team_destroy(rocshmem_team_t team) = 0;
|
||||
|
||||
/**
|
||||
* @brief Reports processing element number id.
|
||||
|
||||
+1
-1
@@ -26,7 +26,7 @@
|
||||
/**
|
||||
* @file constants.hpp
|
||||
*
|
||||
* @brief Contains global constants for ROCSHMEM library
|
||||
* @brief Contains global constants for rocSHMEM library
|
||||
*/
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -30,11 +30,11 @@
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
|
||||
#define BARRIER() rocshmem::roc_shmem_wg_barrier_all()
|
||||
#define RANK rocshmem::roc_shmem_my_pe()
|
||||
#define NPES rocshmem::roc_shmem_n_pes()
|
||||
#define BARRIER() rocshmem::rocshmem_wg_barrier_all()
|
||||
#define RANK rocshmem::rocshmem_my_pe()
|
||||
#define NPES rocshmem::rocshmem_n_pes()
|
||||
|
||||
#define PE_BITS ((uint64_t)ceil(log(NPES) / log(2)))
|
||||
#define PE_OF(X) ((X) >> (64 - PE_BITS))
|
||||
|
||||
+12
-12
@@ -67,7 +67,7 @@ class Context {
|
||||
* just removing the dispatch implementations.
|
||||
*
|
||||
* No comments for these guys since its basically the same as in the
|
||||
* roc_shmem.hpp public header.
|
||||
* rocshmem.hpp public header.
|
||||
*/
|
||||
|
||||
/**************************************************************************
|
||||
@@ -139,7 +139,7 @@ class Context {
|
||||
|
||||
__device__ void sync_all();
|
||||
|
||||
__device__ void sync(roc_shmem_team_t team);
|
||||
__device__ void sync(rocshmem_team_t team);
|
||||
|
||||
template <typename T>
|
||||
__device__ T amo_fetch(void* dst, T value, T cond, int pe, uint8_t atomic_op);
|
||||
@@ -186,13 +186,13 @@ class Context {
|
||||
template <typename T>
|
||||
__device__ T g(T* source, int pe);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void to_all(T* dest, const T* source, int nreduce, int PE_start,
|
||||
int logPE_stride, int PE_size, T* pWrk,
|
||||
long* pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce);
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce);
|
||||
|
||||
template <typename T>
|
||||
__device__ void put(T* dest, const T* source, size_t nelems, int pe);
|
||||
@@ -207,15 +207,15 @@ class Context {
|
||||
__device__ void get_nbi(T* dest, const T* source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall(roc_shmem_team_t team, T* dest, const T* source,
|
||||
__device__ void alltoall(rocshmem_team_t team, T* dest, const T* source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect(roc_shmem_team_t team, T* dest, const T* source,
|
||||
__device__ void fcollect(rocshmem_team_t team, T* dest, const T* source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void broadcast(roc_shmem_team_t team, T* dest, const T* source,
|
||||
__device__ void broadcast(rocshmem_team_t team, T* dest, const T* source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T>
|
||||
@@ -351,16 +351,16 @@ class Context {
|
||||
long* p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__host__ void broadcast(roc_shmem_team_t team, T* dest, const T* source,
|
||||
__host__ void broadcast(rocshmem_team_t team, T* dest, const T* source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(T* dest, const T* source, int nreduce, int PE_start,
|
||||
int logPE_stride, int PE_size, T* pWrk,
|
||||
long* pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce);
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce);
|
||||
|
||||
template <typename T>
|
||||
__host__ void wait_until(T *ivars, int cmp, T val);
|
||||
|
||||
@@ -154,7 +154,7 @@ __device__ void Context::sync_all() {
|
||||
DISPATCH(sync_all());
|
||||
}
|
||||
|
||||
__device__ void Context::sync(roc_shmem_team_t team) {
|
||||
__device__ void Context::sync(rocshmem_team_t team) {
|
||||
ctxStats.incStat(NUM_SYNC_ALL);
|
||||
|
||||
DISPATCH(sync(team));
|
||||
|
||||
+13
-13
@@ -62,7 +62,7 @@ __device__ T Context::g(T *source, int pe) {
|
||||
}
|
||||
|
||||
// The only way to get multi-arg templates to feed into a macro
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void Context::to_all(T *dest, const T *source, int nreduce,
|
||||
int PE_start, int logPE_stride, int PE_size,
|
||||
T *pWrk,
|
||||
@@ -79,11 +79,11 @@ __device__ void Context::to_all(T *dest, const T *source, int nreduce,
|
||||
PE_size, pWrk, pSync));
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ int Context::reduce(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce) {
|
||||
if (nreduce == 0) {
|
||||
return ROC_SHMEM_SUCCESS;
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
if (is_thread_zero_in_block()) {
|
||||
@@ -140,7 +140,7 @@ __device__ void Context::get_nbi(T *dest, const T *source, size_t nelems,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void Context::alltoall(roc_shmem_team_t team, T *dest,
|
||||
__device__ void Context::alltoall(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems) {
|
||||
if (nelems == 0) {
|
||||
return;
|
||||
@@ -154,7 +154,7 @@ __device__ void Context::alltoall(roc_shmem_team_t team, T *dest,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void Context::fcollect(roc_shmem_team_t team, T *dest,
|
||||
__device__ void Context::fcollect(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems) {
|
||||
if (nelems == 0) {
|
||||
return;
|
||||
@@ -168,7 +168,7 @@ __device__ void Context::fcollect(roc_shmem_team_t team, T *dest,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void Context::broadcast(roc_shmem_team_t team, T *dest,
|
||||
__device__ void Context::broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems, int pe_root) {
|
||||
if (nelems == 0) {
|
||||
return;
|
||||
@@ -340,32 +340,32 @@ __device__ __forceinline__ int Context::test(T *ivars, int cmp,
|
||||
int ret = 0;
|
||||
volatile T *vol_ivars = reinterpret_cast<T *>(ivars);
|
||||
switch (cmp) {
|
||||
case ROC_SHMEM_CMP_EQ:
|
||||
case ROCSHMEM_CMP_EQ:
|
||||
if (uncached_load(vol_ivars) == val) {
|
||||
ret = 1;
|
||||
}
|
||||
break;
|
||||
case ROC_SHMEM_CMP_NE:
|
||||
case ROCSHMEM_CMP_NE:
|
||||
if (uncached_load(vol_ivars) != val) {
|
||||
ret = 1;
|
||||
}
|
||||
break;
|
||||
case ROC_SHMEM_CMP_GT:
|
||||
case ROCSHMEM_CMP_GT:
|
||||
if (uncached_load(vol_ivars) > val) {
|
||||
ret = 1;
|
||||
}
|
||||
break;
|
||||
case ROC_SHMEM_CMP_GE:
|
||||
case ROCSHMEM_CMP_GE:
|
||||
if (uncached_load(vol_ivars) >= val) {
|
||||
ret = 1;
|
||||
}
|
||||
break;
|
||||
case ROC_SHMEM_CMP_LT:
|
||||
case ROCSHMEM_CMP_LT:
|
||||
if (uncached_load(vol_ivars) < val) {
|
||||
ret = 1;
|
||||
}
|
||||
break;
|
||||
case ROC_SHMEM_CMP_LE:
|
||||
case ROCSHMEM_CMP_LE:
|
||||
if (uncached_load(vol_ivars) <= val) {
|
||||
ret = 1;
|
||||
}
|
||||
|
||||
@@ -194,7 +194,7 @@ __host__ void Context::broadcast(T *dest, const T *source, int nelems,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void Context::broadcast(roc_shmem_team_t team, T *dest,
|
||||
__host__ void Context::broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems,
|
||||
int pe_root) { // NOLINT(runtime/int)
|
||||
if (nelems == 0) {
|
||||
@@ -206,7 +206,7 @@ __host__ void Context::broadcast(roc_shmem_team_t team, T *dest,
|
||||
HOST_DISPATCH(broadcast<T>(team, dest, source, nelems, pe_root));
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void Context::to_all(T *dest, const T *source, int nreduce,
|
||||
int PE_start, int logPE_stride, int PE_size,
|
||||
T *pWrk,
|
||||
@@ -221,11 +221,11 @@ __host__ void Context::to_all(T *dest, const T *source, int nreduce,
|
||||
logPE_stride, PE_size, pWrk, pSync));
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ int Context::reduce(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce) { // NOLINT(runtime/int)
|
||||
if (nreduce == 0) {
|
||||
return ROC_SHMEM_SUCCESS;
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
ctxHostStats.incStat(NUM_HOST_TO_ALL);
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_FENCE_POLICY_HPP_
|
||||
#define LIBRARY_SRC_FENCE_POLICY_HPP_
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -43,7 +43,7 @@ class Fence {
|
||||
* @param[in] options interpreted as a bitfield using bitwise operations
|
||||
*/
|
||||
__host__ __device__ Fence(long option) {
|
||||
if (option & ROC_SHMEM_CTX_NOSTORE) {
|
||||
if (option & ROCSHMEM_CTX_NOSTORE) {
|
||||
flush_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
+51
-51
@@ -30,7 +30,7 @@
|
||||
#include <cstdlib>
|
||||
#include <mutex> // NOLINT(build/c++11)
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "gpu_ib_team.hpp"
|
||||
@@ -47,10 +47,10 @@ namespace rocshmem {
|
||||
} \
|
||||
}
|
||||
|
||||
extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT;
|
||||
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
|
||||
|
||||
roc_shmem_team_t get_external_team(GPUIBTeam *team) {
|
||||
return reinterpret_cast<roc_shmem_team_t>(team);
|
||||
rocshmem_team_t get_external_team(GPUIBTeam *team) {
|
||||
return reinterpret_cast<rocshmem_team_t>(team);
|
||||
}
|
||||
|
||||
int get_ls_non_zero_bit(char *bitmask, int mask_length) {
|
||||
@@ -68,7 +68,7 @@ int get_ls_non_zero_bit(char *bitmask, int mask_length) {
|
||||
}
|
||||
|
||||
GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
|
||||
if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) {
|
||||
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
|
||||
std::stringstream sstream(maximum_num_contexts_str);
|
||||
sstream >> maximum_num_contexts_;
|
||||
}
|
||||
@@ -96,7 +96,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
|
||||
|
||||
setup_team_world();
|
||||
|
||||
roc_shmem_collective_init();
|
||||
rocshmem_collective_init();
|
||||
|
||||
teams_init();
|
||||
|
||||
@@ -109,7 +109,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
|
||||
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
hdp_gpu_cpu_flush_flag_ =
|
||||
static_cast<unsigned int *>(roc_shmem_malloc(sizeof(unsigned int)));
|
||||
static_cast<unsigned int *>(rocshmem_malloc(sizeof(unsigned int)));
|
||||
hdp_policy->set_flush_polling_ptr(hdp_gpu_cpu_flush_flag_);
|
||||
hdp_flush_worker_thread = std::thread(&GPUIBBackend::hdp_flush_poll, this);
|
||||
|
||||
@@ -125,7 +125,7 @@ GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
|
||||
}
|
||||
|
||||
__device__ bool GPUIBBackend::create_ctx(int64_t options,
|
||||
roc_shmem_ctx_t *ctx) {
|
||||
rocshmem_ctx_t *ctx) {
|
||||
GPUIBContext *ctx_;
|
||||
|
||||
auto pop_result = ctx_free_list.get()->pop_front();
|
||||
@@ -154,7 +154,7 @@ void GPUIBBackend::ctx_destroy(Context *ctx) {
|
||||
delete gpu_ib_host_ctx;
|
||||
}
|
||||
|
||||
__device__ void GPUIBBackend::destroy_ctx(roc_shmem_ctx_t *ctx) {
|
||||
__device__ void GPUIBBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
|
||||
ctx_free_list.get()->push_back(static_cast<GPUIBContext *>(ctx->ctx_opaque));
|
||||
}
|
||||
|
||||
@@ -167,7 +167,7 @@ GPUIBBackend::~GPUIBBackend() {
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
hdp_flush_worker_thread.join();
|
||||
hdp_policy->set_flush_polling_ptr(nullptr);
|
||||
roc_shmem_free(hdp_gpu_cpu_flush_flag_);
|
||||
rocshmem_free(hdp_gpu_cpu_flush_flag_);
|
||||
#endif
|
||||
|
||||
/**
|
||||
@@ -203,7 +203,7 @@ void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team,
|
||||
TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) {
|
||||
rocshmem_team_t *new_team) {
|
||||
/**
|
||||
* Read the bit mask and find out a common index into
|
||||
* the pool of available work arrays.
|
||||
@@ -237,7 +237,7 @@ void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team,
|
||||
*new_team = get_external_team(new_team_obj);
|
||||
}
|
||||
|
||||
void GPUIBBackend::team_destroy(roc_shmem_team_t team) {
|
||||
void GPUIBBackend::team_destroy(rocshmem_team_t team) {
|
||||
GPUIBTeam *team_obj = get_internal_gpu_ib_team(team);
|
||||
|
||||
/* Mark the pool as available */
|
||||
@@ -263,7 +263,7 @@ void GPUIBBackend::initialize_network() { networkImpl.networkHostSetup(this); }
|
||||
|
||||
void GPUIBBackend::setup_default_host_ctx() {
|
||||
default_host_ctx_ = new GPUIBHostContext(this, 0);
|
||||
ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_;
|
||||
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_;
|
||||
}
|
||||
|
||||
void GPUIBBackend::setup_ctxs() {
|
||||
@@ -288,19 +288,19 @@ void GPUIBBackend::setup_default_ctx() {
|
||||
new (default_ctx_) GPUIBContext(this, true, 0);
|
||||
|
||||
/*
|
||||
* Set the ROC_SHMEM_CTX_DEFAULT in constant memory.
|
||||
* Set the ROCSHMEM_CTX_DEFAULT in constant memory.
|
||||
*/
|
||||
int *symbol_address;
|
||||
CHECK_HIP(hipGetSymbolAddress(reinterpret_cast<void **>(&symbol_address),
|
||||
HIP_SYMBOL(ROC_SHMEM_CTX_DEFAULT)));
|
||||
HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT)));
|
||||
|
||||
TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;
|
||||
roc_shmem_ctx_t ctx_default_host{default_ctx_, tinfo};
|
||||
rocshmem_ctx_t ctx_default_host{default_ctx_, tinfo};
|
||||
|
||||
hipStream_t stream;
|
||||
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
CHECK_HIP(hipMemcpyAsync(symbol_address, &ctx_default_host,
|
||||
sizeof(roc_shmem_ctx_t), hipMemcpyDefault, stream));
|
||||
sizeof(rocshmem_ctx_t), hipMemcpyDefault, stream));
|
||||
CHECK_HIP(hipStreamSynchronize(stream));
|
||||
CHECK_HIP(hipStreamDestroy(stream));
|
||||
}
|
||||
@@ -328,9 +328,9 @@ void GPUIBBackend::setup_team_world() {
|
||||
team_tracker.set_team_world(team_world);
|
||||
|
||||
/**
|
||||
* Copy the address to ROC_SHMEM_TEAM_WORLD.
|
||||
* Copy the address to ROCSHMEM_TEAM_WORLD.
|
||||
*/
|
||||
ROC_SHMEM_TEAM_WORLD = reinterpret_cast<roc_shmem_team_t>(team_world);
|
||||
ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
|
||||
}
|
||||
|
||||
void GPUIBBackend::init_mpi_once(MPI_Comm comm) {
|
||||
@@ -381,19 +381,19 @@ void GPUIBBackend::teams_init() {
|
||||
* Allocate pools for the teams sync and work arrary from the SHEAP.
|
||||
*/
|
||||
auto max_num_teams{team_tracker.get_max_num_teams()};
|
||||
barrier_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
|
||||
sizeof(long) * ROC_SHMEM_BARRIER_SYNC_SIZE * max_num_teams));
|
||||
reduce_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
|
||||
sizeof(long) * ROC_SHMEM_REDUCE_SYNC_SIZE * max_num_teams));
|
||||
bcast_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
|
||||
sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE * max_num_teams));
|
||||
alltoall_pSync_pool = reinterpret_cast<long *>(roc_shmem_malloc(
|
||||
sizeof(long) * ROC_SHMEM_ALLTOALL_SYNC_SIZE * max_num_teams));
|
||||
barrier_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams));
|
||||
reduce_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams));
|
||||
bcast_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams));
|
||||
alltoall_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE * max_num_teams));
|
||||
|
||||
/* Accommodating for largest possible data type for pWrk */
|
||||
pWrk_pool = roc_shmem_malloc(
|
||||
sizeof(double) * ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams);
|
||||
pAta_pool = roc_shmem_malloc(sizeof(double) * ROC_SHMEM_ATA_MAX_WRKDATA_SIZE *
|
||||
pWrk_pool = rocshmem_malloc(
|
||||
sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams);
|
||||
pAta_pool = rocshmem_malloc(sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE *
|
||||
max_num_teams);
|
||||
|
||||
/**
|
||||
@@ -402,25 +402,25 @@ void GPUIBBackend::teams_init() {
|
||||
long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
|
||||
for (int team_i = 0; team_i < max_num_teams; team_i++) {
|
||||
barrier_pSync = reinterpret_cast<long *>(
|
||||
&barrier_pSync_pool[team_i * ROC_SHMEM_BARRIER_SYNC_SIZE]);
|
||||
&barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
|
||||
reduce_pSync = reinterpret_cast<long *>(
|
||||
&reduce_pSync_pool[team_i * ROC_SHMEM_REDUCE_SYNC_SIZE]);
|
||||
&reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = reinterpret_cast<long *>(
|
||||
&bcast_pSync_pool[team_i * ROC_SHMEM_BCAST_SYNC_SIZE]);
|
||||
&bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
|
||||
alltoall_pSync = reinterpret_cast<long *>(
|
||||
&alltoall_pSync_pool[team_i * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
|
||||
for (int i = 0; i < ROC_SHMEM_BARRIER_SYNC_SIZE; i++) {
|
||||
barrier_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
|
||||
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) {
|
||||
reduce_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
|
||||
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i++) {
|
||||
bcast_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
|
||||
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROC_SHMEM_ALLTOALL_SYNC_SIZE; i++) {
|
||||
alltoall_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
|
||||
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -457,30 +457,30 @@ void GPUIBBackend::teams_init() {
|
||||
}
|
||||
|
||||
void GPUIBBackend::teams_destroy() {
|
||||
roc_shmem_free(barrier_pSync_pool);
|
||||
roc_shmem_free(reduce_pSync_pool);
|
||||
roc_shmem_free(bcast_pSync_pool);
|
||||
roc_shmem_free(alltoall_pSync_pool);
|
||||
roc_shmem_free(pWrk_pool);
|
||||
roc_shmem_free(pAta_pool);
|
||||
rocshmem_free(barrier_pSync_pool);
|
||||
rocshmem_free(reduce_pSync_pool);
|
||||
rocshmem_free(bcast_pSync_pool);
|
||||
rocshmem_free(alltoall_pSync_pool);
|
||||
rocshmem_free(pWrk_pool);
|
||||
rocshmem_free(pAta_pool);
|
||||
|
||||
free(pool_bitmask_);
|
||||
free(reduced_bitmask_);
|
||||
}
|
||||
|
||||
void GPUIBBackend::roc_shmem_collective_init() {
|
||||
void GPUIBBackend::rocshmem_collective_init() {
|
||||
/*
|
||||
* Allocate heap space for barrier_sync
|
||||
*/
|
||||
size_t one_sync_size_bytes{sizeof(*barrier_sync)};
|
||||
size_t sync_size_bytes{one_sync_size_bytes * ROC_SHMEM_BARRIER_SYNC_SIZE};
|
||||
size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
|
||||
heap.malloc(reinterpret_cast<void **>(&barrier_sync), sync_size_bytes);
|
||||
|
||||
/*
|
||||
* Initialize the barrier synchronization array with default values.
|
||||
*/
|
||||
for (int i = 0; i < num_pes; i++) {
|
||||
barrier_sync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
+11
-11
@@ -72,19 +72,19 @@ class GPUIBBackend : public Backend {
|
||||
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) override;
|
||||
rocshmem_team_t *new_team) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::team_destroy(roc_shmem_team_t)
|
||||
* @copydoc Backend::team_destroy(rocshmem_team_t)
|
||||
*/
|
||||
void team_destroy(roc_shmem_team_t team) override;
|
||||
void team_destroy(rocshmem_team_t team) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::ctx_create
|
||||
*/
|
||||
void ctx_create(int64_t options, void **ctx) override;
|
||||
|
||||
__device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx);
|
||||
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
|
||||
|
||||
/**
|
||||
* @copydoc Backend::ctx_destroy
|
||||
@@ -94,7 +94,7 @@ class GPUIBBackend : public Backend {
|
||||
/**
|
||||
* @copydoc Backend::ctx_destroy
|
||||
*/
|
||||
__device__ void destroy_ctx(roc_shmem_ctx_t *ctx);
|
||||
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
|
||||
|
||||
protected:
|
||||
/**
|
||||
@@ -151,10 +151,10 @@ class GPUIBBackend : public Backend {
|
||||
void initialize_ipc();
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize the ROC_SHMEM_CTX_DEFAULT variable.
|
||||
* @brief Allocate and initialize the ROCSHMEM_CTX_DEFAULT variable.
|
||||
*
|
||||
* @todo The default_ctx member looks unused after it is copied into
|
||||
* the ROC_SHMEM_CTX_DEFAULT variable.
|
||||
* the ROCSHMEM_CTX_DEFAULT variable.
|
||||
*/
|
||||
void setup_default_ctx();
|
||||
void setup_ctxs();
|
||||
@@ -187,7 +187,7 @@ class GPUIBBackend : public Backend {
|
||||
* When this method completes, the barrier_sync member will be available
|
||||
* for use.
|
||||
*/
|
||||
void roc_shmem_collective_init();
|
||||
void rocshmem_collective_init();
|
||||
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
/**
|
||||
@@ -245,8 +245,8 @@ class GPUIBBackend : public Backend {
|
||||
void *pAta_pool{nullptr};
|
||||
|
||||
/**
|
||||
* @brief ROC_SHMEM's copy of MPI_COMM_WORLD (for interoperability
|
||||
* with orthogonal MPI usage in an MPI+ROC_SHMEM program).
|
||||
* @brief rocSHMEM's copy of MPI_COMM_WORLD (for interoperability
|
||||
* with orthogonal MPI usage in an MPI+rocSHMEM program).
|
||||
*/
|
||||
MPI_Comm gpu_ib_comm_world{};
|
||||
MPI_Comm backend_comm{};
|
||||
@@ -334,7 +334,7 @@ class GPUIBBackend : public Backend {
|
||||
* specification).
|
||||
*
|
||||
* @todo Remove this member from the backend class. There is another
|
||||
* copy stored in ROC_SHMEM_CTX_DEFAULT.
|
||||
* copy stored in ROCSHMEM_CTX_DEFAULT.
|
||||
*/
|
||||
GPUIBContext *default_ctx_{nullptr};
|
||||
|
||||
|
||||
@@ -39,19 +39,19 @@ int Connection::coherent_cq = 0;
|
||||
Connection::Connection(GPUIBBackend* b, int k) : backend(b), key_offset(k) {
|
||||
char* value = nullptr;
|
||||
|
||||
if ((value = getenv("ROC_SHMEM_USE_IB_HCA"))) {
|
||||
if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) {
|
||||
requested_dev = value;
|
||||
}
|
||||
|
||||
if ((value = getenv("ROC_SHMEM_SQ_SIZE"))) {
|
||||
if ((value = getenv("ROCSHMEM_SQ_SIZE"))) {
|
||||
sq_size = atoi(value);
|
||||
}
|
||||
|
||||
if ((value = getenv("ROC_SHMEM_USE_CQ_GPU_MEM")) != nullptr) {
|
||||
if ((value = getenv("ROCSHMEM_USE_CQ_GPU_MEM")) != nullptr) {
|
||||
cq_use_gpu_mem = atoi(value);
|
||||
}
|
||||
|
||||
if ((value = getenv("ROC_SHMEM_USE_SQ_GPU_MEM")) != nullptr) {
|
||||
if ((value = getenv("ROCSHMEM_USE_SQ_GPU_MEM")) != nullptr) {
|
||||
sq_use_gpu_mem = atoi(value);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ extern "C" {
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "connection_policy.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "backend_ib.hpp"
|
||||
|
||||
@@ -69,7 +69,7 @@ class GPUIBContext : public Context {
|
||||
|
||||
__device__ void sync_all();
|
||||
|
||||
__device__ void sync(roc_shmem_team_t team);
|
||||
__device__ void sync(rocshmem_team_t team);
|
||||
|
||||
template <typename T>
|
||||
__device__ void amo_add(void *dst, T value, int pe);
|
||||
@@ -113,13 +113,13 @@ class GPUIBContext : public Context {
|
||||
template <typename T>
|
||||
__device__ T g(const T *source, int pe);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void to_all(T *dest, const T *source, int nreduce, int PE_start,
|
||||
int logPE_stride, int PE_size, T *pWrk,
|
||||
long *pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void to_all(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce);
|
||||
|
||||
template <typename T>
|
||||
@@ -135,7 +135,7 @@ class GPUIBContext : public Context {
|
||||
__device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T>
|
||||
@@ -144,43 +144,43 @@ class GPUIBContext : public Context {
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_broadcast(roc_shmem_team_t team, T *dest,
|
||||
__device__ void alltoall_broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_brucks(roc_shmem_team_t team, T *dest,
|
||||
__device__ void alltoall_brucks(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_gcen(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_gcen2(roc_shmem_team_t team, T *dest,
|
||||
__device__ void alltoall_gcen2(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_broadcast(roc_shmem_team_t team, T *dest,
|
||||
__device__ void fcollect_broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_brucks(roc_shmem_team_t team, T *dest,
|
||||
__device__ void fcollect_brucks(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_gcen(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_gcen2(roc_shmem_team_t team, T *dest,
|
||||
__device__ void fcollect_gcen2(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
__device__ void putmem_wg(void *dest, const void *source, size_t nelems,
|
||||
@@ -232,13 +232,13 @@ class GPUIBContext : public Context {
|
||||
__device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
private:
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void internal_direct_allreduce(
|
||||
T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
|
||||
int PE_size, T *pWrk,
|
||||
long *pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void internal_ring_allreduce(T *dst, const T *src, int nelems,
|
||||
int PE_start, int logPE_stride,
|
||||
int PE_size, T *pWrk,
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "context_ib_tmpl_device.hpp"
|
||||
#include "../util.hpp"
|
||||
@@ -35,8 +35,8 @@ __device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start,
|
||||
// Go through all PE offsets (except current offset = 0)
|
||||
// and wait until they all reach
|
||||
for (size_t i = 1; i < n_pes; i++) {
|
||||
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
threadfence_system();
|
||||
// Announce to other PEs that all have reached
|
||||
@@ -48,8 +48,8 @@ __device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start,
|
||||
// Mark current PE offset as reached
|
||||
size_t pe_offset = (pe - PE_start) / stride;
|
||||
put_nbi(&pSync[pe_offset], &flag_val, 1, PE_start);
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
@@ -59,16 +59,16 @@ __device__ void GPUIBContext::internal_atomic_barrier(int pe, int PE_start,
|
||||
int64_t *pSync) {
|
||||
int64_t flag_val = 1;
|
||||
if (pe == PE_start) {
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(n_pes - 1));
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
put_nbi(&pSync[0], &flag_val, 1, j);
|
||||
}
|
||||
} else {
|
||||
amo_add<int64_t>(&pSync[0], flag_val, PE_start);
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
@@ -88,7 +88,7 @@ __device__ void GPUIBContext::internal_sync(int pe, int PE_start, int stride,
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::sync(roc_shmem_team_t team) {
|
||||
__device__ void GPUIBContext::sync(rocshmem_team_t team) {
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
|
||||
double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
|
||||
|
||||
@@ -86,16 +86,16 @@ class GPUIBHostContext : public Context {
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
|
||||
int log_pe_stride, int pe_size, T *p_wrk,
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce);
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -24,16 +24,16 @@
|
||||
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "context_ib_device.hpp"
|
||||
#include "gpu_ib_team.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
#include "../util.hpp"
|
||||
#include "../roc_shmem_calc.hpp"
|
||||
#include "../rocshmem_calc.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
|
||||
int wg_size) {
|
||||
for (size_t i = wg_id; i < size; i += wg_size) {
|
||||
@@ -47,7 +47,7 @@ __device__ void GPUIBContext::p(T *dest, T value, int pe) {
|
||||
putmem_nbi(dest, &value, sizeof(T), pe);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void GPUIBContext::internal_ring_allreduce(
|
||||
T *dst, const T *src, int nelems, [[maybe_unused]] int PE_start,
|
||||
[[maybe_unused]] int logPE_stride, [[maybe_unused]] int PE_size, T *pWrk,
|
||||
@@ -81,7 +81,7 @@ __device__ void GPUIBContext::internal_ring_allreduce(
|
||||
wait_val = seg + 100;
|
||||
p(&pSync[round], wait_val, send_pe);
|
||||
|
||||
wait_until(&pSync[round], ROC_SHMEM_CMP_EQ, wait_val);
|
||||
wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val);
|
||||
__threadfence();
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -99,19 +99,19 @@ __device__ void GPUIBContext::internal_ring_allreduce(
|
||||
fence();
|
||||
wait_val = seg + 100;
|
||||
p(&pSync[round], wait_val, send_pe);
|
||||
wait_until(&pSync[round], ROC_SHMEM_CMP_EQ, wait_val);
|
||||
wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
|
||||
pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void GPUIBContext::internal_direct_allreduce(
|
||||
T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
|
||||
int PE_size, T *pWrk,
|
||||
@@ -147,7 +147,7 @@ __device__ void GPUIBContext::internal_direct_allreduce(
|
||||
if (i != pe) {
|
||||
// Wait for leader thread to see that the buffer is ready.
|
||||
if (is_thread_zero_in_block()) {
|
||||
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, 1L);
|
||||
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, 1L);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -159,14 +159,14 @@ __device__ void GPUIBContext::internal_direct_allreduce(
|
||||
__syncthreads();
|
||||
|
||||
for (int i = wg_id; i < num_pes; i += wg_size) {
|
||||
pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ void GPUIBContext::to_all(roc_shmem_team_t team, T *dest,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void GPUIBContext::to_all(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nreduce) {
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
|
||||
@@ -189,7 +189,7 @@ __device__ void GPUIBContext::to_all(roc_shmem_team_t team, T *dest,
|
||||
p_sync);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce,
|
||||
int PE_start, int logPE_stride,
|
||||
int PE_size, T *pWrk,
|
||||
@@ -200,8 +200,8 @@ __device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce,
|
||||
size_t ring_pSync = 2 * num_pes;
|
||||
|
||||
size_t provided_pWrk =
|
||||
max(nreduce / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
|
||||
size_t provided_pSync = ROC_SHMEM_REDUCE_SYNC_SIZE;
|
||||
max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
|
||||
size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE;
|
||||
|
||||
// TODO(bpotter):
|
||||
// We basically do a direct reduce if pWrk is big enough, else we
|
||||
@@ -212,12 +212,12 @@ __device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce,
|
||||
internal_direct_allreduce<T, Op>(dest, source, nreduce, PE_start,
|
||||
logPE_stride, PE_size, pWrk, pSync);
|
||||
} else {
|
||||
if (ring_pSync <= ROC_SHMEM_REDUCE_SYNC_SIZE) {
|
||||
if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) {
|
||||
int chunk_size = 1024;
|
||||
size_t ring_pWrk = chunk_size * num_pes;
|
||||
if (provided_pWrk < ring_pWrk) {
|
||||
ring_pWrk = max(nreduce / 2, // NOLINT
|
||||
ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
|
||||
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
|
||||
chunk_size = ring_pWrk / num_pes;
|
||||
}
|
||||
int seg_size = ring_pWrk;
|
||||
@@ -434,7 +434,7 @@ __device__ void GPUIBContext::internal_get_broadcast(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::broadcast(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::broadcast(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems, int pe_root) {
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
|
||||
@@ -475,14 +475,14 @@ __device__ void GPUIBContext::broadcast(T *dst, const T *src, int nelems,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::alltoall(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::alltoall(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// Currently broadcast implementation performs the best
|
||||
alltoall_broadcast(team, dst, src, nelems);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::alltoall_broadcast(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::alltoall_broadcast(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// Broadcast implementation of alltoall collective
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
@@ -514,7 +514,7 @@ __device__ void GPUIBContext::alltoall_broadcast(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::alltoall_brucks(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// Brucks implementation of alltoall collective
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
@@ -537,7 +537,7 @@ __device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst,
|
||||
int blk_size = get_flat_block_size();
|
||||
|
||||
// Check if we have enough buffer space. If not, fail.
|
||||
if (pe_size * nelems * 2 > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
if (pe_size * nelems * 2 > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
|
||||
assert(false);
|
||||
}
|
||||
@@ -612,7 +612,7 @@ __device__ void GPUIBContext::alltoall_brucks(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::alltoall_gcen(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// GPU-centric implementation of alltoall collective
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
@@ -629,12 +629,12 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
|
||||
int stride = 1 << log_pe_stride;
|
||||
|
||||
long *pSync = team_obj->alltoall_pSync;
|
||||
int64_t *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE];
|
||||
int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
|
||||
int my_pe_in_team = team_obj->my_pe;
|
||||
|
||||
// Check if we have enough buffer space. If not, fail.
|
||||
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
|
||||
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
|
||||
assert(false);
|
||||
}
|
||||
@@ -677,29 +677,29 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
|
||||
if (dest_pe2 != my_pe) amo_add<int64_t>(&pSync[0], flag_val, dest_pe2);
|
||||
|
||||
if (my_pe == dest_pe) {
|
||||
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1));
|
||||
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
|
||||
pSync2[0] = ROCSHMEM_SYNC_VALUE;
|
||||
__threadfence_system();
|
||||
for (int i = 1; i < clust_size; ++i)
|
||||
put_nbi(&pSync2[0], &flag_val, 1,
|
||||
team_obj->get_pe_in_world(my_pe_in_team + i));
|
||||
} else {
|
||||
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync2[0] = ROCSHMEM_SYNC_VALUE;
|
||||
__threadfence_system();
|
||||
}
|
||||
|
||||
if (my_pe == dest_pe2) {
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(num_clust - 1));
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(num_clust - 1));
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
for (size_t i = 1, j = dest_pe2 + clust_size * stride; i < num_clust;
|
||||
++i, j += clust_size * stride) {
|
||||
put_nbi(&pSync[0], &flag_val, 1, j);
|
||||
}
|
||||
} else {
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
@@ -707,7 +707,7 @@ __device__ void GPUIBContext::alltoall_gcen(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::alltoall_gcen2(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// GPU-centric implementation of alltoall collective
|
||||
// Uses in-place blocking sync
|
||||
@@ -725,12 +725,12 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
|
||||
int stride = 1 << log_pe_stride;
|
||||
|
||||
long *pSync = team_obj->alltoall_pSync;
|
||||
int64_t *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE];
|
||||
int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
|
||||
int my_pe_in_team = team_obj->my_pe;
|
||||
|
||||
// Check if we have enough buffer space. If not, fail.
|
||||
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
|
||||
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
|
||||
assert(false);
|
||||
}
|
||||
@@ -771,15 +771,15 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
|
||||
if (is_thread_zero_in_block()) {
|
||||
quiet();
|
||||
if ((my_pe_in_team % clust_size) == 0) {
|
||||
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1));
|
||||
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
|
||||
pSync2[0] = ROCSHMEM_SYNC_VALUE;
|
||||
__threadfence_system();
|
||||
for (int i = 1; i < clust_size; ++i)
|
||||
put_nbi(&pSync2[0], &flag_val, 1,
|
||||
team_obj->get_pe_in_world(my_pe_in_team + i));
|
||||
} else {
|
||||
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync2[0] = ROCSHMEM_SYNC_VALUE;
|
||||
__threadfence_system();
|
||||
}
|
||||
}
|
||||
@@ -790,7 +790,7 @@ __device__ void GPUIBContext::alltoall_gcen2(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::fcollect(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::fcollect(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// Main function for fcollect
|
||||
// Broadcast version performs moderately well
|
||||
@@ -799,7 +799,7 @@ __device__ void GPUIBContext::fcollect(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::fcollect_broadcast(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::fcollect_broadcast(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// Broadcast implementation of fcollect collective
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
@@ -832,7 +832,7 @@ __device__ void GPUIBContext::fcollect_broadcast(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::fcollect_brucks(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// Brucks implementation of fcollect collective
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
@@ -855,7 +855,7 @@ __device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst,
|
||||
int blk_size = get_flat_block_size();
|
||||
|
||||
// Check if we have enough buffer space. If not, fail.
|
||||
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
|
||||
assert(false);
|
||||
}
|
||||
@@ -895,7 +895,7 @@ __device__ void GPUIBContext::fcollect_brucks(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::fcollect_gcen(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// GPU-centric implementation of fcollect collective
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
@@ -912,12 +912,12 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
|
||||
int stride = 1 << log_pe_stride;
|
||||
|
||||
long *pSync = team_obj->alltoall_pSync;
|
||||
long *pSync2 = &team_obj->alltoall_pSync[ROC_SHMEM_BARRIER_SYNC_SIZE];
|
||||
long *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
|
||||
int my_pe_in_team = team_obj->my_pe;
|
||||
|
||||
// Check if we have enough buffer space. If not, fail.
|
||||
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
|
||||
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
|
||||
assert(false);
|
||||
}
|
||||
@@ -957,15 +957,15 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
|
||||
if (is_thread_zero_in_block()) {
|
||||
quiet();
|
||||
if ((my_pe_in_team % clust_size) == 0) {
|
||||
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val * (clust_size - 1));
|
||||
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
|
||||
pSync2[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
for (int i = 1; i < clust_size; ++i)
|
||||
put_nbi(&pSync2[0], &flag_val, 1,
|
||||
team_obj->get_pe_in_world(my_pe_in_team + i));
|
||||
} else {
|
||||
wait_until(pSync2, ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync2[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync2[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
@@ -976,7 +976,7 @@ __device__ void GPUIBContext::fcollect_gcen(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void GPUIBContext::fcollect_gcen2(roc_shmem_team_t team, T *dst,
|
||||
__device__ void GPUIBContext::fcollect_gcen2(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
// GPU-centric implementation of fcollect collective
|
||||
// Uses in-place blocking sync
|
||||
@@ -998,7 +998,7 @@ __device__ void GPUIBContext::fcollect_gcen2(roc_shmem_team_t team, T *dst,
|
||||
|
||||
// Check if we have enough buffer space. If not, fail.
|
||||
T *pAta = reinterpret_cast<T *>(team_obj->pAta);
|
||||
if (pe_size * nelems > ROC_SHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
|
||||
GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
|
||||
assert(false);
|
||||
}
|
||||
|
||||
@@ -93,13 +93,13 @@ __host__ void GPUIBHostContext::broadcast(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::broadcast(roc_shmem_team_t team, T *dest,
|
||||
__host__ void GPUIBHostContext::broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems,
|
||||
int pe_root) {
|
||||
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce,
|
||||
int pe_start, int log_pe_stride,
|
||||
int pe_size, T *p_wrk,
|
||||
@@ -108,8 +108,8 @@ __host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce,
|
||||
pe_size, p_wrk, p_sync);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ void GPUIBHostContext::to_all(roc_shmem_team_t team, T *dest,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void GPUIBHostContext::to_all(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nreduce) {
|
||||
host_interface->to_all<T, Op>(team, dest, source, nreduce);
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "roc_shmem/debug.hpp"
|
||||
#include "rocshmem/debug.hpp"
|
||||
|
||||
#include "qe_dumper.hpp"
|
||||
|
||||
|
||||
@@ -31,11 +31,11 @@ namespace rocshmem {
|
||||
DynamicConnection::DynamicConnection(GPUIBBackend* b) : Connection(b, 4) {
|
||||
char* value = nullptr;
|
||||
|
||||
if ((value = getenv("ROC_SHMEM_NUM_DCIs"))) {
|
||||
if ((value = getenv("ROCSHMEM_NUM_DCIs"))) {
|
||||
num_dcis = atoi(value);
|
||||
}
|
||||
|
||||
if ((value = getenv("ROC_SHMEM_NUM_DCT"))) {
|
||||
if ((value = getenv("ROCSHMEM_NUM_DCT"))) {
|
||||
num_dct = atoi(value);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,17 +38,17 @@ GPUIBTeam::GPUIBTeam(Backend *backend, TeamInfo *team_info_parent,
|
||||
pool_index_ = pool_index;
|
||||
|
||||
barrier_pSync =
|
||||
&(b->barrier_pSync_pool[pool_index * ROC_SHMEM_BARRIER_SYNC_SIZE]);
|
||||
&(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
|
||||
reduce_pSync =
|
||||
&(b->reduce_pSync_pool[pool_index * ROC_SHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROC_SHMEM_BCAST_SYNC_SIZE]);
|
||||
&(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
|
||||
alltoall_pSync =
|
||||
&(b->alltoall_pSync_pool[pool_index * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
&(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
|
||||
pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
|
||||
ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
pAta = reinterpret_cast<char *>(b->pAta_pool) +
|
||||
ROC_SHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
}
|
||||
|
||||
GPUIBTeam::~GPUIBTeam() {}
|
||||
|
||||
@@ -297,7 +297,7 @@ void NetworkOnImpl::setup_gpu_qps(GPUIBBackend *B) {
|
||||
}
|
||||
}
|
||||
|
||||
void NetworkOnImpl::roc_shmem_g_init(SymmetricHeap *heap_handle,
|
||||
void NetworkOnImpl::rocshmem_g_init(SymmetricHeap *heap_handle,
|
||||
MPI_Comm thread_comm) {
|
||||
init_g_ret(heap_handle, thread_comm, num_blocks, &g_ret);
|
||||
}
|
||||
@@ -327,7 +327,7 @@ __host__ void NetworkOnImpl::networkHostSetup(GPUIBBackend *B) {
|
||||
|
||||
connection->initialize_gpu_policy(&connection_policy, heap_rkey);
|
||||
|
||||
roc_shmem_g_init(&B->heap, B->thread_comm);
|
||||
rocshmem_g_init(&B->heap, B->thread_comm);
|
||||
|
||||
connection->post_wqes();
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
#include <mpi.h>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "connection_policy.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
#include "../hdp_policy.hpp"
|
||||
@@ -154,7 +154,7 @@ class NetworkOnImpl {
|
||||
* @brief Allocate and initialize device-side memory that will be used for
|
||||
* the return of g shmem ops (eg: shmem_int_g)
|
||||
*/
|
||||
void roc_shmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm);
|
||||
void rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm);
|
||||
|
||||
/**
|
||||
* @brief The backend delegates some InfiniBand connection setup to
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
*
|
||||
* @section DESCRIPTION
|
||||
* An IB QueuePair (SQ and CQ) that the device can use to perform network
|
||||
* operations. Most important ROC_SHMEM operations are performed by this
|
||||
* operations. Most important rocSHMEM operations are performed by this
|
||||
* class.
|
||||
*/
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ class QueuePair;
|
||||
|
||||
/*
|
||||
* GPU single-thread policy class. Only a single work-item per work-group
|
||||
* is allowed to call into a ROC_SHMEM function (unless it is specifically
|
||||
* is allowed to call into a rocSHMEM function (unless it is specifically
|
||||
* called out as a collective API. This thread policy is the fastest but
|
||||
* is not as flexible.
|
||||
*/
|
||||
@@ -59,7 +59,7 @@ class SingleThreadImpl {
|
||||
|
||||
/*
|
||||
* GPU multi-thread policy class. Multiple work-items per work-group are
|
||||
* allowed to call into a ROC_SHMEM function. A bit slower than its
|
||||
* allowed to call into a rocSHMEM function. A bit slower than its
|
||||
* single-thread counterpart but it enables a much more flexible user-facing
|
||||
* API.
|
||||
*/
|
||||
|
||||
+3
-3
@@ -83,13 +83,13 @@ int HostInterface::find_win_info_in_pool(WindowInfo* window_info) {
|
||||
}
|
||||
|
||||
__host__ HostInterface::HostInterface(HdpPolicy* hdp_policy,
|
||||
MPI_Comm roc_shmem_comm,
|
||||
MPI_Comm rocshmem_comm,
|
||||
SymmetricHeap* heap) {
|
||||
/*
|
||||
* Duplicate a communicator from roc_shem's comm
|
||||
* world for the host interface
|
||||
*/
|
||||
MPI_Comm_dup(roc_shmem_comm, &host_comm_world_);
|
||||
MPI_Comm_dup(rocshmem_comm, &host_comm_world_);
|
||||
MPI_Comm_rank(host_comm_world_, &my_pe_);
|
||||
MPI_Comm_rank(host_comm_world_, &num_pes_);
|
||||
|
||||
@@ -103,7 +103,7 @@ __host__ HostInterface::HostInterface(HdpPolicy* hdp_policy,
|
||||
* Allocate and initialize pool of windows for contexts
|
||||
*/
|
||||
char* value{nullptr};
|
||||
if ((value = getenv("ROC_SHMEM_MAX_NUM_HOST_CONTEXTS"))) {
|
||||
if ((value = getenv("ROCSHMEM_MAX_NUM_HOST_CONTEXTS"))) {
|
||||
max_num_ctxs_ = atoi(value);
|
||||
}
|
||||
|
||||
|
||||
+8
-8
@@ -36,7 +36,7 @@
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../hdp_policy.hpp"
|
||||
#include "../memory/symmetric_heap.hpp"
|
||||
#include "../memory/window_info.hpp"
|
||||
@@ -104,7 +104,7 @@ class HostInterface {
|
||||
/**
|
||||
* @brief Primary constructor
|
||||
*/
|
||||
__host__ HostInterface(HdpPolicy* hdp_policy, MPI_Comm roc_shmem_comm,
|
||||
__host__ HostInterface(HdpPolicy* hdp_policy, MPI_Comm rocshmem_comm,
|
||||
SymmetricHeap* heap);
|
||||
|
||||
/**
|
||||
@@ -198,16 +198,16 @@ class HostInterface {
|
||||
long* p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__host__ void broadcast(roc_shmem_team_t team, T* dest, const T* source,
|
||||
__host__ void broadcast(rocshmem_team_t team, T* dest, const T* source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(T* dest, const T* source, int nreduce, int pe_start,
|
||||
int log_pe_stride, int pe_size, T* p_wrk,
|
||||
long* p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce);
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ int reduce(rocshmem_team_t team, T* dest, const T* source, int nreduce);
|
||||
|
||||
template <typename T>
|
||||
__host__ void wait_until(T *ivars, int cmp, T val,
|
||||
@@ -288,7 +288,7 @@ class HostInterface {
|
||||
|
||||
__host__ MPI_Comm get_mpi_comm(int pe_start, int log_pe_stride, int pe_size);
|
||||
|
||||
__host__ MPI_Op get_mpi_op(ROC_SHMEM_OP Op);
|
||||
__host__ MPI_Op get_mpi_op(ROCSHMEM_OP Op);
|
||||
|
||||
template <typename T>
|
||||
__host__ MPI_Datatype get_mpi_type();
|
||||
@@ -300,7 +300,7 @@ class HostInterface {
|
||||
__host__ int test_and_compare(MPI_Aint offset, MPI_Datatype mpi_type,
|
||||
int cmp, T val, MPI_Win win);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all_internal(MPI_Comm mpi_comm, T* dest, const T* source,
|
||||
int nreduce);
|
||||
|
||||
|
||||
+22
-22
@@ -200,7 +200,7 @@ __host__ void HostInterface::broadcast(T* dest, const T* source, int nelems,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void HostInterface::broadcast(roc_shmem_team_t team, T* dest,
|
||||
__host__ void HostInterface::broadcast(rocshmem_team_t team, T* dest,
|
||||
const T* source, int nelems,
|
||||
int pe_root) {
|
||||
DPRINTF("Function: Team-based host_broadcast\n");
|
||||
@@ -216,24 +216,24 @@ __host__ void HostInterface::broadcast(roc_shmem_team_t team, T* dest,
|
||||
return;
|
||||
}
|
||||
|
||||
__host__ inline MPI_Op HostInterface::get_mpi_op(ROC_SHMEM_OP Op) {
|
||||
__host__ inline MPI_Op HostInterface::get_mpi_op(ROCSHMEM_OP Op) {
|
||||
switch (Op) {
|
||||
case ROC_SHMEM_SUM:
|
||||
case ROCSHMEM_SUM:
|
||||
return MPI_SUM;
|
||||
case ROC_SHMEM_MAX:
|
||||
case ROCSHMEM_MAX:
|
||||
return MPI_MAX;
|
||||
case ROC_SHMEM_MIN:
|
||||
case ROCSHMEM_MIN:
|
||||
return MPI_MIN;
|
||||
case ROC_SHMEM_PROD:
|
||||
case ROCSHMEM_PROD:
|
||||
return MPI_PROD;
|
||||
case ROC_SHMEM_AND:
|
||||
case ROCSHMEM_AND:
|
||||
return MPI_BAND;
|
||||
case ROC_SHMEM_OR:
|
||||
case ROCSHMEM_OR:
|
||||
return MPI_BOR;
|
||||
case ROC_SHMEM_XOR:
|
||||
case ROCSHMEM_XOR:
|
||||
return MPI_BXOR;
|
||||
default:
|
||||
fprintf(stderr, "Unknown ROC_SHMEM op MPI conversion %d\n", Op);
|
||||
fprintf(stderr, "Unknown rocSHMEM op MPI conversion %d\n", Op);
|
||||
abort();
|
||||
return 0;
|
||||
}
|
||||
@@ -330,7 +330,7 @@ __host__ T HostInterface::amo_fetch_cas(void* dst, T value, T cond, int pe,
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void HostInterface::to_all_internal(MPI_Comm mpi_comm, T* dest,
|
||||
const T* source, int nreduce) {
|
||||
DPRINTF("Function: host_to_all_internal\n");
|
||||
@@ -356,7 +356,7 @@ __host__ void HostInterface::to_all_internal(MPI_Comm mpi_comm, T* dest,
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void HostInterface::to_all(T* dest, const T* source, int nreduce,
|
||||
int pe_start, int log_pe_stride,
|
||||
int pe_size, [[maybe_unused]] T* p_wrk,
|
||||
@@ -375,8 +375,8 @@ __host__ void HostInterface::to_all(T* dest, const T* source, int nreduce,
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ int HostInterface::reduce(roc_shmem_team_t team, T* dest,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ int HostInterface::reduce(rocshmem_team_t team, T* dest,
|
||||
const T* source, int nreduce) {
|
||||
DPRINTF("Function: Team-based host_reduce\n");
|
||||
|
||||
@@ -388,7 +388,7 @@ __host__ int HostInterface::reduce(roc_shmem_team_t team, T* dest,
|
||||
|
||||
to_all_internal<T, Op>(mpi_comm, dest, source, nreduce);
|
||||
|
||||
return ROC_SHMEM_SUCCESS;
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -397,26 +397,26 @@ __host__ inline int HostInterface::compare(int cmp, T input_val,
|
||||
int cond_satisfied{0};
|
||||
|
||||
switch (cmp) {
|
||||
case ROC_SHMEM_CMP_EQ:
|
||||
case ROCSHMEM_CMP_EQ:
|
||||
cond_satisfied = (input_val == target_val) ? 1 : 0;
|
||||
break;
|
||||
case ROC_SHMEM_CMP_NE:
|
||||
case ROCSHMEM_CMP_NE:
|
||||
cond_satisfied = (input_val != target_val) ? 1 : 0;
|
||||
break;
|
||||
case ROC_SHMEM_CMP_GT:
|
||||
case ROCSHMEM_CMP_GT:
|
||||
cond_satisfied = (input_val > target_val) ? 1 : 0;
|
||||
break;
|
||||
case ROC_SHMEM_CMP_GE:
|
||||
case ROCSHMEM_CMP_GE:
|
||||
cond_satisfied = (input_val >= target_val) ? 1 : 0;
|
||||
break;
|
||||
case ROC_SHMEM_CMP_LT:
|
||||
case ROCSHMEM_CMP_LT:
|
||||
cond_satisfied = (input_val < target_val) ? 1 : 0;
|
||||
break;
|
||||
case ROC_SHMEM_CMP_LE:
|
||||
case ROCSHMEM_CMP_LE:
|
||||
cond_satisfied = (input_val <= target_val) ? 1 : 0;
|
||||
break;
|
||||
default:
|
||||
assert(cmp >= ROC_SHMEM_CMP_EQ && cmp <= ROC_SHMEM_CMP_LE);
|
||||
assert(cmp >= ROCSHMEM_CMP_EQ && cmp <= ROCSHMEM_CMP_LE);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
+40
-40
@@ -33,10 +33,10 @@ namespace rocshmem {
|
||||
} \
|
||||
}
|
||||
|
||||
extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT;
|
||||
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
|
||||
|
||||
roc_shmem_team_t get_external_team(GPUIBTeam *team) {
|
||||
return reinterpret_cast<roc_shmem_team_t>(team);
|
||||
rocshmem_team_t get_external_team(GPUIBTeam *team) {
|
||||
return reinterpret_cast<rocshmem_team_t>(team);
|
||||
}
|
||||
|
||||
int get_ls_non_zero_bit(char *bitmask, int mask_length) {
|
||||
@@ -57,7 +57,7 @@ IPCBackend::IPCBackend(MPI_Comm comm)
|
||||
: Backend() {
|
||||
type = BackendType::IPC_BACKEND;
|
||||
|
||||
if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) {
|
||||
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
|
||||
std::stringstream sstream(maximum_num_contexts_str);
|
||||
sstream >> maximum_num_contexts_;
|
||||
}
|
||||
@@ -82,7 +82,7 @@ IPCBackend::IPCBackend(MPI_Comm comm)
|
||||
|
||||
default_host_ctx = std::make_unique<IPCHostContext>(this, 0);
|
||||
|
||||
ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
|
||||
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
|
||||
|
||||
init_g_ret(&heap, thread_comm, MAX_NUM_BLOCKS, &bp->g_ret);
|
||||
|
||||
@@ -92,7 +92,7 @@ IPCBackend::IPCBackend(MPI_Comm comm)
|
||||
|
||||
init_wrk_sync_buffer();
|
||||
|
||||
roc_shmem_collective_init();
|
||||
rocshmem_collective_init();
|
||||
|
||||
setup_fence_buffer();
|
||||
|
||||
@@ -143,7 +143,7 @@ void IPCBackend::setup_ctxs() {
|
||||
}
|
||||
}
|
||||
|
||||
__device__ bool IPCBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
|
||||
__device__ bool IPCBackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) {
|
||||
IPCContext *ctx_{nullptr};
|
||||
|
||||
auto pop_result = ctx_free_list.get()->pop_front();
|
||||
@@ -158,7 +158,7 @@ __device__ bool IPCBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
|
||||
return true;
|
||||
}
|
||||
|
||||
__device__ void IPCBackend::destroy_ctx(roc_shmem_ctx_t *ctx) {
|
||||
__device__ void IPCBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
|
||||
ctx_free_list.get()->push_back(static_cast<IPCContext *>(ctx->ctx_opaque));
|
||||
}
|
||||
|
||||
@@ -182,9 +182,9 @@ void IPCBackend::setup_team_world() {
|
||||
team_tracker.set_team_world(team_world);
|
||||
|
||||
/**
|
||||
* Copy the address to ROC_SHMEM_TEAM_WORLD.
|
||||
* Copy the address to ROCSHMEM_TEAM_WORLD.
|
||||
*/
|
||||
ROC_SHMEM_TEAM_WORLD = reinterpret_cast<roc_shmem_team_t>(team_world);
|
||||
ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
|
||||
}
|
||||
|
||||
void IPCBackend::init_mpi_once(MPI_Comm comm) {
|
||||
@@ -205,7 +205,7 @@ void IPCBackend::init_mpi_once(MPI_Comm comm) {
|
||||
NET_CHECK(MPI_Comm_rank(thread_comm, &my_pe));
|
||||
}
|
||||
|
||||
void IPCBackend::team_destroy(roc_shmem_team_t team) {
|
||||
void IPCBackend::team_destroy(rocshmem_team_t team) {
|
||||
IPCTeam *team_obj = get_internal_ipc_team(team);
|
||||
|
||||
/* Mark the pool as available */
|
||||
@@ -221,7 +221,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team,
|
||||
TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) {
|
||||
rocshmem_team_t *new_team) {
|
||||
/**
|
||||
* Read the bit mask and find out a common index into
|
||||
* the pool of available work arrays.
|
||||
@@ -303,24 +303,24 @@ void IPCBackend::init_wrk_sync_buffer() {
|
||||
/**
|
||||
* size of barrier sync
|
||||
*/
|
||||
Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROC_SHMEM_BARRIER_SYNC_SIZE;
|
||||
Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE;
|
||||
|
||||
/**
|
||||
* Size of sync arrays for the teams
|
||||
*/
|
||||
Wrk_Sync_buffer_size_ += sizeof(long) * max_num_teams *
|
||||
(ROC_SHMEM_BARRIER_SYNC_SIZE +
|
||||
ROC_SHMEM_REDUCE_SYNC_SIZE +
|
||||
ROC_SHMEM_BCAST_SYNC_SIZE +
|
||||
ROC_SHMEM_ALLTOALL_SYNC_SIZE);
|
||||
(ROCSHMEM_BARRIER_SYNC_SIZE +
|
||||
ROCSHMEM_REDUCE_SYNC_SIZE +
|
||||
ROCSHMEM_BCAST_SYNC_SIZE +
|
||||
ROCSHMEM_ALLTOALL_SYNC_SIZE);
|
||||
|
||||
/**
|
||||
* Size of work arrays for the teams
|
||||
* Accommodate largest possible data type for pWrk
|
||||
*/
|
||||
Wrk_Sync_buffer_size_ += sizeof(double) * max_num_teams *
|
||||
(ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE +
|
||||
ROC_SHMEM_ATA_MAX_WRKDATA_SIZE);
|
||||
(ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE +
|
||||
ROCSHMEM_ATA_MAX_WRKDATA_SIZE);
|
||||
|
||||
/**
|
||||
* Size of fence array
|
||||
@@ -397,12 +397,12 @@ void IPCBackend::setup_fence_buffer() {
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(int) * num_pes;
|
||||
}
|
||||
|
||||
void IPCBackend::roc_shmem_collective_init() {
|
||||
void IPCBackend::rocshmem_collective_init() {
|
||||
/*
|
||||
* Allocate heap space for barrier_sync
|
||||
*/
|
||||
size_t one_sync_size_bytes{sizeof(*barrier_sync)};
|
||||
size_t sync_size_bytes{one_sync_size_bytes * ROC_SHMEM_BARRIER_SYNC_SIZE};
|
||||
size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
|
||||
barrier_sync = reinterpret_cast<int64_t*>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sync_size_bytes;
|
||||
|
||||
@@ -410,7 +410,7 @@ void IPCBackend::roc_shmem_collective_init() {
|
||||
* Initialize the barrier synchronization array with default values.
|
||||
*/
|
||||
for (int i = 0; i < num_pes; i++) {
|
||||
barrier_sync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -427,29 +427,29 @@ void IPCBackend::teams_init() {
|
||||
auto max_num_teams{team_tracker.get_max_num_teams()};
|
||||
|
||||
barrier_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BARRIER_SYNC_SIZE
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE
|
||||
* max_num_teams;
|
||||
|
||||
reduce_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_REDUCE_SYNC_SIZE
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE
|
||||
* max_num_teams;
|
||||
|
||||
bcast_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
|
||||
* max_num_teams;
|
||||
|
||||
alltoall_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROC_SHMEM_BCAST_SYNC_SIZE
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
|
||||
* max_num_teams;
|
||||
|
||||
/* Accommodating for largest possible data type for pWrk */
|
||||
pWrk_pool = reinterpret_cast<void *>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
|
||||
* max_num_teams;
|
||||
|
||||
|
||||
pAta_pool = reinterpret_cast<void *>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROC_SHMEM_ATA_MAX_WRKDATA_SIZE
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE
|
||||
* max_num_teams;
|
||||
|
||||
/**
|
||||
@@ -458,25 +458,25 @@ void IPCBackend::teams_init() {
|
||||
long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
|
||||
for (int team_i = 0; team_i < max_num_teams; team_i++) {
|
||||
barrier_pSync = reinterpret_cast<long *>(
|
||||
&barrier_pSync_pool[team_i * ROC_SHMEM_BARRIER_SYNC_SIZE]);
|
||||
&barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
|
||||
reduce_pSync = reinterpret_cast<long *>(
|
||||
&reduce_pSync_pool[team_i * ROC_SHMEM_REDUCE_SYNC_SIZE]);
|
||||
&reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = reinterpret_cast<long *>(
|
||||
&bcast_pSync_pool[team_i * ROC_SHMEM_BCAST_SYNC_SIZE]);
|
||||
&bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
|
||||
alltoall_pSync = reinterpret_cast<long *>(
|
||||
&alltoall_pSync_pool[team_i * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
|
||||
for (int i = 0; i < ROC_SHMEM_BARRIER_SYNC_SIZE; i++) {
|
||||
barrier_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
|
||||
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROC_SHMEM_REDUCE_SYNC_SIZE; i++) {
|
||||
reduce_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
|
||||
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROC_SHMEM_BCAST_SYNC_SIZE; i++) {
|
||||
bcast_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
|
||||
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROC_SHMEM_ALLTOALL_SYNC_SIZE; i++) {
|
||||
alltoall_pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
|
||||
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -48,13 +48,13 @@ class IPCBackend : public Backend {
|
||||
*/
|
||||
virtual ~IPCBackend();
|
||||
|
||||
__device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx);
|
||||
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
|
||||
|
||||
/**
|
||||
* @brief Destroy a `roc_shmem_ctx_t` context and returns it back to the
|
||||
* @brief Destroy a `rocshmem_ctx_t` context and returns it back to the
|
||||
* context free list.
|
||||
*/
|
||||
__device__ void destroy_ctx(roc_shmem_ctx_t *ctx);
|
||||
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
|
||||
|
||||
/**
|
||||
* @copydoc Backend::ctx_create
|
||||
@@ -103,12 +103,12 @@ class IPCBackend : public Backend {
|
||||
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) override;
|
||||
rocshmem_team_t *new_team) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::team_destroy(roc_shmem_team_t)
|
||||
* @copydoc Backend::team_destroy(rocshmem_team_t)
|
||||
*/
|
||||
void team_destroy(roc_shmem_team_t team) override;
|
||||
void team_destroy(rocshmem_team_t team) override;
|
||||
|
||||
/**
|
||||
* @brief Accessor for work/sync bases
|
||||
@@ -213,7 +213,7 @@ class IPCBackend : public Backend {
|
||||
* When this method completes, the barrier_sync member will be available
|
||||
* for use.
|
||||
*/
|
||||
void roc_shmem_collective_init();
|
||||
void rocshmem_collective_init();
|
||||
|
||||
/**
|
||||
* @brief Allocate buffer for fence/quiet operation
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "backend_ipc.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -63,7 +63,7 @@ class IPCContext : public Context {
|
||||
|
||||
__device__ void sync_all();
|
||||
|
||||
__device__ void sync(roc_shmem_team_t team);
|
||||
__device__ void sync(rocshmem_team_t team);
|
||||
|
||||
template <typename T>
|
||||
__device__ void p(T *dest, T value, int pe);
|
||||
@@ -121,18 +121,18 @@ class IPCContext : public Context {
|
||||
__device__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
|
||||
|
||||
// Collectives
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce);
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce);
|
||||
|
||||
template <typename T>
|
||||
__device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
template <typename T>
|
||||
__device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
|
||||
@@ -211,11 +211,11 @@ class IPCContext : public Context {
|
||||
int pe_root); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_linear(roc_shmem_team_t team, T *dest,
|
||||
__device__ void fcollect_linear(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_linear(roc_shmem_team_t team, T *dest,
|
||||
__device__ void alltoall_linear(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
__device__ void internal_sync(int pe, int PE_start, int stride, int PE_size,
|
||||
@@ -227,10 +227,10 @@ class IPCContext : public Context {
|
||||
__device__ void internal_atomic_barrier(int pe, int PE_start, int stride,
|
||||
int n_pes, int64_t *pSync);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void internal_direct_allreduce(T *dst, const T *src,
|
||||
int nelems, IPCTeam *team_obj);
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void internal_ring_allreduce(T *dst, const T *src,
|
||||
int nelems, IPCTeam *team_obj,
|
||||
int n_seg, int seg_size, int chunk_size);
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "context_ipc_tmpl_device.hpp"
|
||||
#include "../util.hpp"
|
||||
@@ -39,8 +39,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
|
||||
__threadfence_system();
|
||||
#endif /* __gfx90a__ */
|
||||
for (size_t i = 1; i < n_pes; i++) {
|
||||
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
threadfence_system();
|
||||
|
||||
@@ -58,8 +58,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#endif /* __gfx90a__ */
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
@@ -69,8 +69,8 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start,
|
||||
int64_t *pSync) {
|
||||
int64_t flag_val = 1;
|
||||
if (pe == PE_start) {
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, (int64_t)(n_pes - 1));
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
|
||||
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
@@ -78,8 +78,8 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start,
|
||||
}
|
||||
} else {
|
||||
amo_add<int64_t>(&pSync[0], flag_val, PE_start);
|
||||
wait_until(&pSync[0], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROC_SHMEM_SYNC_VALUE;
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
@@ -98,7 +98,7 @@ __device__ void IPCContext::internal_sync(int pe, int PE_start, int stride,
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ void IPCContext::sync(roc_shmem_team_t team) {
|
||||
__device__ void IPCContext::sync(rocshmem_team_t team) {
|
||||
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
|
||||
|
||||
int pe = team_obj->my_pe_in_world;
|
||||
|
||||
@@ -86,16 +86,16 @@ class IPCHostContext : public Context {
|
||||
long *p_sync);
|
||||
|
||||
template <typename T>
|
||||
__host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
|
||||
int log_pe_stride, int pe_size, T *p_wrk,
|
||||
long *p_sync);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce);
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce);
|
||||
|
||||
template <typename T>
|
||||
__host__ void wait_until(T *ivars, int cmp, T val);
|
||||
|
||||
@@ -24,11 +24,11 @@
|
||||
#define LIBRARY_SRC_IPC_CONTEXT_TMPL_DEVICE_HPP_
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "context_ipc_device.hpp"
|
||||
#include "../util.hpp"
|
||||
#include "ipc_team.hpp"
|
||||
#include "../roc_shmem_calc.hpp"
|
||||
#include "../rocshmem_calc.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -153,7 +153,7 @@ __device__ T IPCContext::amo_fetch_cas(void *dest, T value, T cond, int pe) {
|
||||
}
|
||||
|
||||
// Collectives
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
|
||||
int wg_size) {
|
||||
for (size_t i = wg_id; i < size; i += wg_size) {
|
||||
@@ -162,7 +162,7 @@ __device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void IPCContext::internal_direct_allreduce(
|
||||
T *dst, const T *src, int nelems, IPCTeam *team_obj) { // NOLINT(runtime/int)
|
||||
|
||||
@@ -203,7 +203,7 @@ __device__ void IPCContext::internal_direct_allreduce(
|
||||
if (i != pe) {
|
||||
// Wait for leader thread to see that the buffer is ready.
|
||||
if (is_thread_zero_in_block()) {
|
||||
wait_until(&pSync[i], ROC_SHMEM_CMP_EQ, flag_val);
|
||||
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -216,7 +216,7 @@ __device__ void IPCContext::internal_direct_allreduce(
|
||||
__syncthreads();
|
||||
|
||||
for (int i = wg_id; i < num_pes; i += wg_size) {
|
||||
pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
threadfence_system();
|
||||
__syncthreads();
|
||||
@@ -278,7 +278,7 @@ __device__ void IPCContext::internal_direct_allreduce(
|
||||
* [02+12+22+32] [02+12+22+32] [02+12+22+32] [02+12+22+32]
|
||||
* [03+13+23+33] [03+13+23+33] [03+13+23+33] [03+13+23+33]
|
||||
*/
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void IPCContext::internal_ring_allreduce(
|
||||
T *dst, const T *src, int nelems, IPCTeam *team_obj, // NOLINT(runtime/int)
|
||||
int n_seg, int seg_size, int chunk_size) {
|
||||
@@ -323,7 +323,7 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#endif /* __gfx90a__ */
|
||||
wait_until(&pSync[iter], ROC_SHMEM_CMP_EQ, wait_val);
|
||||
wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
|
||||
}
|
||||
__syncthreads();
|
||||
compute_reduce<T, Op>(&pWrk[off_recv], &dst[off_seg + off_recv],
|
||||
@@ -344,7 +344,7 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#endif /* __gfx90a__ */
|
||||
wait_until(&pSync[iter], ROC_SHMEM_CMP_EQ, wait_val);
|
||||
wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -352,13 +352,13 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
__syncthreads();
|
||||
|
||||
for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
|
||||
pSync[i] = ROC_SHMEM_SYNC_VALUE;
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ int IPCContext::reduce(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nreduce) {
|
||||
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
|
||||
|
||||
@@ -367,14 +367,14 @@ __device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest,
|
||||
size_t direct_pWrk = PE_size * nreduce;
|
||||
size_t direct_pSync = PE_size;
|
||||
size_t ring_pSync = 2 * PE_size;
|
||||
size_t provided_pWrk = max(nreduce / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
|
||||
size_t provided_pSync = ROC_SHMEM_REDUCE_SYNC_SIZE;
|
||||
size_t provided_pWrk = max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
|
||||
size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE;
|
||||
|
||||
if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) {
|
||||
internal_direct_allreduce<T, Op>(dest, source, nreduce, team_obj);
|
||||
} else {
|
||||
if (ring_pSync <= ROC_SHMEM_REDUCE_SYNC_SIZE) {
|
||||
size_t ring_pWrk = ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE;
|
||||
if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) {
|
||||
size_t ring_pWrk = ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE;
|
||||
// integer division truncating value
|
||||
int chunk_size = ring_pWrk / PE_size;
|
||||
int seg_size = chunk_size * PE_size;
|
||||
@@ -410,10 +410,10 @@ __device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest,
|
||||
}
|
||||
} else {
|
||||
GPU_DPRINTF("Unsupported reduction size for IPC conduit.\n");
|
||||
return ROC_SHMEM_ERROR;
|
||||
return ROCSHMEM_ERROR;
|
||||
}
|
||||
}
|
||||
return ROC_SHMEM_SUCCESS;
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -439,7 +439,7 @@ __device__ void IPCContext::internal_get_broadcast(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void IPCContext::broadcast(roc_shmem_team_t team, T *dst,
|
||||
__device__ void IPCContext::broadcast(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems, int pe_root) {
|
||||
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
|
||||
|
||||
@@ -471,13 +471,13 @@ __device__ void IPCContext::internal_broadcast(T *dst, const T *src, int nelems,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void IPCContext::alltoall(roc_shmem_team_t team, T *dst,
|
||||
__device__ void IPCContext::alltoall(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
alltoall_linear(team, dst, src, nelems);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void IPCContext::alltoall_linear(roc_shmem_team_t team, T *dst,
|
||||
__device__ void IPCContext::alltoall_linear(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
|
||||
|
||||
@@ -500,13 +500,13 @@ __device__ void IPCContext::alltoall_linear(roc_shmem_team_t team, T *dst,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void IPCContext::fcollect(roc_shmem_team_t team, T *dst,
|
||||
__device__ void IPCContext::fcollect(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
fcollect_linear(team, dst, src, nelems);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void IPCContext::fcollect_linear(roc_shmem_team_t team, T *dst,
|
||||
__device__ void IPCContext::fcollect_linear(rocshmem_team_t team, T *dst,
|
||||
const T *src, int nelems) {
|
||||
IPCTeam *team_obj = reinterpret_cast<IPCTeam *>(team);
|
||||
|
||||
|
||||
@@ -93,13 +93,13 @@ __host__ void IPCHostContext::broadcast(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void IPCHostContext::broadcast(roc_shmem_team_t team, T *dest,
|
||||
__host__ void IPCHostContext::broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems,
|
||||
int pe_root) {
|
||||
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce,
|
||||
int pe_start, int log_pe_stride,
|
||||
int pe_size, T *p_wrk,
|
||||
@@ -108,8 +108,8 @@ __host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce,
|
||||
pe_size, p_wrk, p_sync);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ int IPCHostContext::reduce(roc_shmem_team_t team, T *dest,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ int IPCHostContext::reduce(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nreduce) {
|
||||
return host_interface->reduce<T, Op>(team, dest, source, nreduce);
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ class IPCDefaultContextProxy {
|
||||
auto ctx{proxy_.get()};
|
||||
new (ctx) IPCContext(reinterpret_cast<Backend*>(backend));
|
||||
ctx->tinfo = tinfo;
|
||||
roc_shmem_ctx_t local{ctx, tinfo};
|
||||
rocshmem_ctx_t local{ctx, tinfo};
|
||||
set_internal_ctx(&local);
|
||||
}
|
||||
|
||||
@@ -89,4 +89,4 @@ using IPCDefaultContextProxyT = IPCDefaultContextProxy<HIPAllocator>;
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_
|
||||
#endif // LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_
|
||||
|
||||
@@ -38,17 +38,17 @@ IPCTeam::IPCTeam(Backend *backend, TeamInfo *team_info_parent,
|
||||
pool_index_ = pool_index;
|
||||
|
||||
barrier_pSync =
|
||||
&(b->barrier_pSync_pool[pool_index * ROC_SHMEM_BARRIER_SYNC_SIZE]);
|
||||
&(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
|
||||
reduce_pSync =
|
||||
&(b->reduce_pSync_pool[pool_index * ROC_SHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROC_SHMEM_BCAST_SYNC_SIZE]);
|
||||
&(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
|
||||
alltoall_pSync =
|
||||
&(b->alltoall_pSync_pool[pool_index * ROC_SHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
&(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
|
||||
pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
|
||||
ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
pAta = reinterpret_cast<char *>(b->pAta_pool) +
|
||||
ROC_SHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
}
|
||||
|
||||
IPCTeam::~IPCTeam() {}
|
||||
|
||||
+1
-1
@@ -123,7 +123,7 @@ class IpcOffImpl {
|
||||
|
||||
__device__ bool isIpcAvailable(int my_pe, int target_pe) { return false; }
|
||||
|
||||
__device__ void ipcGpuInit(Backend *roc_shmem_handle, Context *ctx,
|
||||
__device__ void ipcGpuInit(Backend *rocshmem_handle, Context *ctx,
|
||||
int thread_id) {}
|
||||
|
||||
__device__ void ipcCopy(void *dst, void *src, size_t size) {}
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
namespace rocshmem {
|
||||
|
||||
SingleHeap::SingleHeap() {
|
||||
if (auto heap_size_cstr = getenv("ROC_SHMEM_HEAP_SIZE")) {
|
||||
if (auto heap_size_cstr = getenv("ROCSHMEM_HEAP_SIZE")) {
|
||||
std::stringstream sstream(heap_size_cstr);
|
||||
size_t heap_size;
|
||||
sstream >> heap_size;
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
namespace rocshmem {
|
||||
|
||||
SlabHeap::SlabHeap() {
|
||||
if (auto slab_size_cstr = getenv("ROC_SHMEM_SLAB_SIZE")) {
|
||||
if (auto slab_size_cstr = getenv("ROCSHMEM_SLAB_SIZE")) {
|
||||
std::stringstream sstream(slab_size_cstr);
|
||||
size_t slab_size;
|
||||
sstream >> slab_size;
|
||||
|
||||
@@ -81,7 +81,7 @@ class MPIInitSingleton {
|
||||
int nprocs_{-1};
|
||||
|
||||
/**
|
||||
* @brief Was MPI initialized before ROCSHMEM_init call
|
||||
* @brief Was MPI initialized before rocshmem_init call
|
||||
*/
|
||||
int pre_init_done{0};
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
#include <memory>
|
||||
#include <thread> // NOLINT
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../atomic_return.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
@@ -42,13 +42,13 @@
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
extern roc_shmem_ctx_t ROC_SHMEM_HOST_CTX_DEFAULT;
|
||||
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
|
||||
|
||||
ROBackend::ROBackend(MPI_Comm comm)
|
||||
: profiler_proxy_(MAX_NUM_BLOCKS), Backend() {
|
||||
type = BackendType::RO_BACKEND;
|
||||
|
||||
if (auto maximum_num_contexts_str = getenv("ROC_SHMEM_MAX_NUM_CONTEXTS")) {
|
||||
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
|
||||
std::stringstream sstream(maximum_num_contexts_str);
|
||||
sstream >> maximum_num_contexts_;
|
||||
}
|
||||
@@ -83,14 +83,14 @@ ROBackend::ROBackend(MPI_Comm comm)
|
||||
|
||||
default_host_ctx = std::make_unique<ROHostContext>(this, 0);
|
||||
|
||||
ROC_SHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
|
||||
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
|
||||
|
||||
team_world_proxy_ = new ROTeamProxy<HIPAllocator>(
|
||||
this, transport_->get_world_comm(), my_pe, num_pes);
|
||||
team_tracker.set_team_world(team_world_proxy_->get());
|
||||
|
||||
ROC_SHMEM_TEAM_WORLD =
|
||||
reinterpret_cast<roc_shmem_team_t>(team_world_proxy_->get());
|
||||
ROCSHMEM_TEAM_WORLD =
|
||||
reinterpret_cast<rocshmem_team_t>(team_world_proxy_->get());
|
||||
|
||||
default_block_handle_proxy_ = DefaultBlockHandleProxyT(
|
||||
bp->g_ret, bp->atomic_ret, &queue_, &ipcImpl, hdp_proxy_.get());
|
||||
@@ -120,7 +120,7 @@ ROBackend::~ROBackend() {
|
||||
CHECK_HIP(hipFree(ctx_array));
|
||||
}
|
||||
|
||||
__device__ bool ROBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
|
||||
__device__ bool ROBackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) {
|
||||
ROContext *ctx_;
|
||||
|
||||
auto pop_result = ctx_free_list.get()->pop_front();
|
||||
@@ -133,11 +133,11 @@ __device__ bool ROBackend::create_ctx(int64_t options, roc_shmem_ctx_t *ctx) {
|
||||
return true;
|
||||
}
|
||||
|
||||
__device__ void ROBackend::destroy_ctx(roc_shmem_ctx_t *ctx) {
|
||||
__device__ void ROBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
|
||||
ctx_free_list.get()->push_back(static_cast<ROContext *>(ctx->ctx_opaque));
|
||||
}
|
||||
|
||||
void ROBackend::team_destroy(roc_shmem_team_t team) {
|
||||
void ROBackend::team_destroy(rocshmem_team_t team) {
|
||||
ROTeam *team_obj{get_internal_ro_team(team)};
|
||||
|
||||
team_obj->~ROTeam();
|
||||
@@ -148,7 +148,7 @@ void ROBackend::create_new_team(Team *parent_team,
|
||||
TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) {
|
||||
rocshmem_team_t *new_team) {
|
||||
transport_->createNewTeam(this, parent_team, team_info_wrt_parent,
|
||||
team_info_wrt_world, num_pes, my_pe_in_new_team,
|
||||
team_comm, new_team);
|
||||
|
||||
@@ -85,20 +85,20 @@ class ROBackend : public Backend {
|
||||
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) override;
|
||||
rocshmem_team_t *new_team) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::team_destroy(roc_shmem_team_t)
|
||||
* @copydoc Backend::team_destroy(rocshmem_team_t)
|
||||
*/
|
||||
void team_destroy(roc_shmem_team_t team) override;
|
||||
void team_destroy(rocshmem_team_t team) override;
|
||||
|
||||
__device__ bool create_ctx(int64_t options, roc_shmem_ctx_t *ctx);
|
||||
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
|
||||
|
||||
/**
|
||||
* @brief Destroy a `roc_shmem_ctx_t` context and returns it back to the
|
||||
* @brief Destroy a `rocshmem_ctx_t` context and returns it back to the
|
||||
* context free list.
|
||||
*/
|
||||
__device__ void destroy_ctx(roc_shmem_ctx_t *ctx);
|
||||
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
|
||||
|
||||
/**
|
||||
* @copydoc Backend::ctx_create
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#ifndef LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_
|
||||
#define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../device_proxy.hpp"
|
||||
#include "../memory/hip_allocator.hpp"
|
||||
#include "context_ro_device.hpp"
|
||||
@@ -46,7 +46,7 @@ class DefaultContextProxy {
|
||||
: constructed_{true} {
|
||||
auto ctx{proxy_.get()};
|
||||
new (ctx) ROContext(reinterpret_cast<Backend*>(backend), -1);
|
||||
roc_shmem_ctx_t local{ctx, tinfo};
|
||||
rocshmem_ctx_t local{ctx, tinfo};
|
||||
set_internal_ctx(&local);
|
||||
}
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include "config.h" // NOLINT(build/include_subdir)
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../hdp_policy.hpp"
|
||||
#include "backend_proxy.hpp"
|
||||
@@ -176,7 +176,7 @@ __device__ void ROContext::sync_all() {
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ void ROContext::sync(roc_shmem_team_t team) {
|
||||
__device__ void ROContext::sync(rocshmem_team_t team) {
|
||||
ROTeam *team_obj = reinterpret_cast<ROTeam *>(team);
|
||||
if (is_thread_zero_in_block()) {
|
||||
build_queue_element(RO_NET_SYNC, nullptr, nullptr, 0, 0, 0, 0, 0, nullptr,
|
||||
@@ -472,7 +472,7 @@ __device__ void build_queue_element(
|
||||
ro_net_cmds type, void *dst, void *src, size_t size, int pe,
|
||||
int logPE_stride, int PE_size, int PE_root, void *pWrk, long *pSync,
|
||||
MPI_Comm team_comm, int ro_net_win_id, BlockHandle *handle,
|
||||
bool blocking, ROC_SHMEM_OP op, ro_net_types datatype) {
|
||||
bool blocking, ROCSHMEM_OP op, ro_net_types datatype) {
|
||||
auto write_slot{next_write_slot(handle)};
|
||||
auto queue_element = &handle->queue[write_slot];
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ __device__ void build_queue_element(
|
||||
ro_net_cmds type, void *dst, void *src, size_t size, int pe,
|
||||
int logPE_stride, int PE_size, int PE_root, void *pWrk, long *pSync,
|
||||
MPI_Comm team_comm, int ro_net_win_id, BlockHandle *handle,
|
||||
bool blocking, ROC_SHMEM_OP op = ROC_SHMEM_SUM,
|
||||
bool blocking, ROCSHMEM_OP op = ROCSHMEM_SUM,
|
||||
ro_net_types datatype = RO_NET_INT);
|
||||
|
||||
class ROContext : public Context {
|
||||
@@ -67,7 +67,7 @@ class ROContext : public Context {
|
||||
|
||||
__device__ void sync_all();
|
||||
|
||||
__device__ void sync(roc_shmem_team_t team);
|
||||
__device__ void sync(rocshmem_team_t team);
|
||||
|
||||
template <typename T>
|
||||
__device__ void p(T *dest, T value, int pe);
|
||||
@@ -75,13 +75,13 @@ class ROContext : public Context {
|
||||
template <typename T>
|
||||
__device__ T g(const T *source, int pe);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void to_all(T *dest, const T *source, int nreduce, int PE_start,
|
||||
int logPE_stride, int PE_size, T *pWrk,
|
||||
long *pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void to_all(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce);
|
||||
|
||||
template <typename T>
|
||||
@@ -133,7 +133,7 @@ class ROContext : public Context {
|
||||
__device__ void amo_xor(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T>
|
||||
@@ -142,43 +142,43 @@ class ROContext : public Context {
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_broadcast(roc_shmem_team_t team, T *dest,
|
||||
__device__ void alltoall_broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_mpi(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void alltoall_mpi(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_gcen(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_gcen2(roc_shmem_team_t team, T *dest,
|
||||
__device__ void alltoall_gcen2(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_broadcast(roc_shmem_team_t team, T *dest,
|
||||
__device__ void fcollect_broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_mpi(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void fcollect_mpi(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_gcen(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_gcen2(roc_shmem_team_t team, T *dest,
|
||||
__device__ void fcollect_gcen2(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
__device__ void putmem_wg(void *dest, const void *source, size_t nelems,
|
||||
|
||||
@@ -135,16 +135,16 @@ class ROHostContext : public Context {
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__host__ void broadcast(roc_shmem_team_t team, T *dest, const T *source,
|
||||
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
|
||||
int log_pe_stride, int pe_size, T *p_wrk,
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ void to_all(roc_shmem_team_t team, T *dest, const T *source,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce);
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -108,8 +108,8 @@ struct GetROType<long double> {
|
||||
********************************* DEVICE API *********************************
|
||||
*****************************************************************************/
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__device__ void ROContext::to_all(roc_shmem_team_t team, T *dest,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void ROContext::to_all(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nreduce) {
|
||||
if (!is_thread_zero_in_block()) {
|
||||
__syncthreads();
|
||||
@@ -125,7 +125,7 @@ __device__ void ROContext::to_all(roc_shmem_team_t team, T *dest,
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void ROContext::to_all(T *dest, const T *source, int nreduce,
|
||||
int PE_start, int logPE_stride, int PE_size,
|
||||
T *pWrk, long *pSync) {
|
||||
@@ -210,7 +210,7 @@ __device__ T ROContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
|
||||
value, pe, 0, 0, 0,
|
||||
reinterpret_cast<void *>(static_cast<long long>(cond)),
|
||||
nullptr, (MPI_Comm)NULL, ro_net_win_id, block_handle, true,
|
||||
ROC_SHMEM_SUM, GetROType<T>::Type);
|
||||
ROCSHMEM_SUM, GetROType<T>::Type);
|
||||
__threadfence();
|
||||
return *source;
|
||||
}
|
||||
@@ -225,7 +225,7 @@ __device__ T ROContext::amo_fetch_add(void *dst, T value, int pe) {
|
||||
auto source{get_unused_atomic()};
|
||||
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<T *>(source), value,
|
||||
pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
|
||||
ro_net_win_id, block_handle, true, ROC_SHMEM_SUM,
|
||||
ro_net_win_id, block_handle, true, ROCSHMEM_SUM,
|
||||
GetROType<T>::Type);
|
||||
__threadfence();
|
||||
return *source;
|
||||
@@ -241,7 +241,7 @@ __device__ T ROContext::amo_swap(void *dst, T value, int pe) {
|
||||
auto source{get_unused_atomic()};
|
||||
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
|
||||
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
|
||||
ro_net_win_id, block_handle, true, ROC_SHMEM_REPLACE,
|
||||
ro_net_win_id, block_handle, true, ROCSHMEM_REPLACE,
|
||||
GetROType<T>::Type);
|
||||
__threadfence();
|
||||
return *source;
|
||||
@@ -257,7 +257,7 @@ __device__ T ROContext::amo_fetch_and(void *dst, T value, int pe) {
|
||||
auto source{get_unused_atomic()};
|
||||
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
|
||||
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
|
||||
ro_net_win_id, block_handle, true, ROC_SHMEM_AND,
|
||||
ro_net_win_id, block_handle, true, ROCSHMEM_AND,
|
||||
GetROType<T>::Type);
|
||||
__threadfence();
|
||||
return *source;
|
||||
@@ -273,7 +273,7 @@ __device__ T ROContext::amo_fetch_or(void *dst, T value, int pe) {
|
||||
auto source{get_unused_atomic()};
|
||||
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
|
||||
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
|
||||
ro_net_win_id, block_handle, true, ROC_SHMEM_OR,
|
||||
ro_net_win_id, block_handle, true, ROCSHMEM_OR,
|
||||
GetROType<T>::Type);
|
||||
__threadfence();
|
||||
return *source;
|
||||
@@ -289,7 +289,7 @@ __device__ T ROContext::amo_fetch_xor(void *dst, T value, int pe) {
|
||||
auto source{get_unused_atomic()};
|
||||
build_queue_element(RO_NET_AMO_FOP, dst, reinterpret_cast<void *>(source),
|
||||
value, pe, 0, 0, 0, nullptr, nullptr, (MPI_Comm)NULL,
|
||||
ro_net_win_id, block_handle, true, ROC_SHMEM_XOR,
|
||||
ro_net_win_id, block_handle, true, ROCSHMEM_XOR,
|
||||
GetROType<T>::Type);
|
||||
__threadfence();
|
||||
return *source;
|
||||
@@ -301,7 +301,7 @@ __device__ void ROContext::amo_xor(void *dst, T value, int pe) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void ROContext::broadcast(roc_shmem_team_t team, T *dest,
|
||||
__device__ void ROContext::broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems, int pe_root) {
|
||||
if (!is_thread_zero_in_block()) {
|
||||
__syncthreads();
|
||||
@@ -313,7 +313,7 @@ __device__ void ROContext::broadcast(roc_shmem_team_t team, T *dest,
|
||||
build_queue_element(RO_NET_TEAM_BROADCAST, dest, const_cast<T *>(source),
|
||||
nelems, 0, 0, 0, pe_root, nullptr, nullptr,
|
||||
team_obj->mpi_comm, ro_net_win_id, block_handle, true,
|
||||
ROC_SHMEM_SUM, GetROType<T>::Type);
|
||||
ROCSHMEM_SUM, GetROType<T>::Type);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -331,13 +331,13 @@ __device__ void ROContext::broadcast(T *dest, const T *source, int nelems,
|
||||
build_queue_element(RO_NET_BROADCAST, dest, const_cast<T *>(source), nelems,
|
||||
pe_start, log_pe_stride, pe_size, pe_root, nullptr,
|
||||
p_sync, (MPI_Comm)NULL, ro_net_win_id, block_handle, true,
|
||||
ROC_SHMEM_SUM, GetROType<T>::Type);
|
||||
ROCSHMEM_SUM, GetROType<T>::Type);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void ROContext::alltoall(roc_shmem_team_t team, T *dest,
|
||||
__device__ void ROContext::alltoall(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems) {
|
||||
if (!is_thread_zero_in_block()) {
|
||||
__syncthreads();
|
||||
@@ -349,13 +349,13 @@ __device__ void ROContext::alltoall(roc_shmem_team_t team, T *dest,
|
||||
build_queue_element(RO_NET_ALLTOALL, dest, const_cast<T *>(source), nelems, 0,
|
||||
0, 0, 0, team_obj->ata_buffer, nullptr,
|
||||
team_obj->mpi_comm, ro_net_win_id, block_handle, true,
|
||||
ROC_SHMEM_SUM, GetROType<T>::Type);
|
||||
ROCSHMEM_SUM, GetROType<T>::Type);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void ROContext::fcollect(roc_shmem_team_t team, T *dest,
|
||||
__device__ void ROContext::fcollect(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems) {
|
||||
if (!is_thread_zero_in_block()) {
|
||||
__syncthreads();
|
||||
@@ -367,7 +367,7 @@ __device__ void ROContext::fcollect(roc_shmem_team_t team, T *dest,
|
||||
build_queue_element(RO_NET_FCOLLECT, dest, const_cast<T *>(source), nelems, 0,
|
||||
0, 0, 0, team_obj->ata_buffer, nullptr,
|
||||
team_obj->mpi_comm, ro_net_win_id, block_handle, true,
|
||||
ROC_SHMEM_SUM, GetROType<T>::Type);
|
||||
ROCSHMEM_SUM, GetROType<T>::Type);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ __host__ void ROHostContext::broadcast(T *dest, const T *source, int nelems,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void ROHostContext::broadcast(roc_shmem_team_t team, T *dest,
|
||||
__host__ void ROHostContext::broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems,
|
||||
int pe_root) {
|
||||
DPRINTF("Function: Team-based ro_net_host_broadcast\n");
|
||||
@@ -122,7 +122,7 @@ __host__ void ROHostContext::broadcast(roc_shmem_team_t team, T *dest,
|
||||
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void ROHostContext::to_all(T *dest, const T *source, int nreduce,
|
||||
int pe_start, int log_pe_stride,
|
||||
int pe_size, T *p_wrk, long *p_sync) {
|
||||
@@ -132,8 +132,8 @@ __host__ void ROHostContext::to_all(T *dest, const T *source, int nreduce,
|
||||
pe_size, p_wrk, p_sync);
|
||||
}
|
||||
|
||||
template <typename T, ROC_SHMEM_OP Op>
|
||||
__host__ void ROHostContext::to_all(roc_shmem_team_t team, T *dest,
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void ROHostContext::to_all(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nreduce) {
|
||||
DPRINTF("Function: Team-based ro_net_host_to_all\n");
|
||||
|
||||
|
||||
@@ -142,7 +142,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
const_cast<unsigned long long *>(&next_element.ol1.atomic_value),
|
||||
next_element.PE, next_element.ro_net_win_id, queue_idx,
|
||||
next_element.threadId, true,
|
||||
static_cast<ROC_SHMEM_OP>(next_element.op),
|
||||
static_cast<ROCSHMEM_OP>(next_element.op),
|
||||
static_cast<ro_net_types>(next_element.datatype));
|
||||
DPRINTF("Received AMO dst %p src %p Val %llu pe %d\n", next_element.dst,
|
||||
next_element.src, next_element.ol1.atomic_value, next_element.PE);
|
||||
@@ -163,7 +163,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
team_reduction(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.ro_net_win_id, queue_idx,
|
||||
next_element.team_comm,
|
||||
static_cast<ROC_SHMEM_OP>(next_element.op),
|
||||
static_cast<ROCSHMEM_OP>(next_element.op),
|
||||
static_cast<ro_net_types>(next_element.datatype),
|
||||
next_element.threadId, true);
|
||||
DPRINTF("Received FLOAT_SUM_TEAM_TO_ALL dst %p src %p size %lu team %d\n",
|
||||
@@ -175,7 +175,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
next_element.PE, next_element.ro_net_win_id, queue_idx,
|
||||
next_element.PE, next_element.logPE_stride,
|
||||
next_element.PE_size, next_element.ol2.pWrk, next_element.pSync,
|
||||
static_cast<ROC_SHMEM_OP>(next_element.op),
|
||||
static_cast<ROCSHMEM_OP>(next_element.op),
|
||||
static_cast<ro_net_types>(next_element.datatype),
|
||||
next_element.threadId, true);
|
||||
DPRINTF(
|
||||
@@ -275,15 +275,15 @@ void MPITransport::finalizeTransport() {
|
||||
delete host_interface;
|
||||
}
|
||||
|
||||
roc_shmem_team_t get_external_team(ROTeam *team) {
|
||||
return reinterpret_cast<roc_shmem_team_t>(team);
|
||||
rocshmem_team_t get_external_team(ROTeam *team) {
|
||||
return reinterpret_cast<rocshmem_team_t>(team);
|
||||
}
|
||||
|
||||
void MPITransport::createNewTeam(ROBackend *backend, Team *parent_team,
|
||||
TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) {
|
||||
rocshmem_team_t *new_team) {
|
||||
ROTeam *new_team_obj{nullptr};
|
||||
|
||||
CHECK_HIP(hipMalloc(&new_team_obj, sizeof(ROTeam)));
|
||||
@@ -342,26 +342,26 @@ void MPITransport::barrier(int blockId, int threadId, bool blocking,
|
||||
outstanding[blockId]++;
|
||||
}
|
||||
|
||||
MPI_Op MPITransport::get_mpi_op(ROC_SHMEM_OP op) {
|
||||
MPI_Op MPITransport::get_mpi_op(ROCSHMEM_OP op) {
|
||||
switch (op) {
|
||||
case ROC_SHMEM_SUM:
|
||||
case ROCSHMEM_SUM:
|
||||
return MPI_SUM;
|
||||
case ROC_SHMEM_MAX:
|
||||
case ROCSHMEM_MAX:
|
||||
return MPI_MAX;
|
||||
case ROC_SHMEM_MIN:
|
||||
case ROCSHMEM_MIN:
|
||||
return MPI_MIN;
|
||||
case ROC_SHMEM_PROD:
|
||||
case ROCSHMEM_PROD:
|
||||
return MPI_PROD;
|
||||
case ROC_SHMEM_AND:
|
||||
case ROCSHMEM_AND:
|
||||
return MPI_BAND;
|
||||
case ROC_SHMEM_OR:
|
||||
case ROCSHMEM_OR:
|
||||
return MPI_BOR;
|
||||
case ROC_SHMEM_XOR:
|
||||
case ROCSHMEM_XOR:
|
||||
return MPI_BXOR;
|
||||
case ROC_SHMEM_REPLACE:
|
||||
case ROCSHMEM_REPLACE:
|
||||
return MPI_REPLACE;
|
||||
default:
|
||||
fprintf(stderr, "Unknown ROC_SHMEM op MPI conversion %d\n", op);
|
||||
fprintf(stderr, "Unknown rocSHMEM op MPI conversion %d\n", op);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
@@ -383,7 +383,7 @@ static MPI_Datatype convertType(ro_net_types type) {
|
||||
case RO_NET_LONG_DOUBLE:
|
||||
return MPI_LONG_DOUBLE;
|
||||
default:
|
||||
fprintf(stderr, "Unknown ROC_SHMEM type MPI conversion %d\n", type);
|
||||
fprintf(stderr, "Unknown rocSHMEM type MPI conversion %d\n", type);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
@@ -391,7 +391,7 @@ static MPI_Datatype convertType(ro_net_types type) {
|
||||
void MPITransport::reduction(void *dst, void *src, int size, int pe,
|
||||
int win_id, int blockId, int start, int logPstride,
|
||||
int sizePE, void *pWrk, long *pSync,
|
||||
ROC_SHMEM_OP op, ro_net_types type, int threadId,
|
||||
ROCSHMEM_OP op, ro_net_types type, int threadId,
|
||||
bool blocking) {
|
||||
MPI_Request request{};
|
||||
MPI_Op mpi_op{get_mpi_op(op)};
|
||||
@@ -435,7 +435,7 @@ void MPITransport::broadcast(void *dst, void *src, int size, int pe,
|
||||
}
|
||||
|
||||
void MPITransport::team_reduction(void *dst, void *src, int size, int win_id,
|
||||
int blockId, MPI_Comm team, ROC_SHMEM_OP op,
|
||||
int blockId, MPI_Comm team, ROCSHMEM_OP op,
|
||||
ro_net_types type, int threadId,
|
||||
bool blocking) {
|
||||
MPI_Request request{};
|
||||
@@ -1046,7 +1046,7 @@ void MPITransport::putMem(void *dst, void *src, int size, int pe, int win_id,
|
||||
|
||||
void MPITransport::amoFOP(void *dst, void *src, void *val, int pe, int win_id,
|
||||
int blockId, int threadId, bool blocking,
|
||||
ROC_SHMEM_OP op, ro_net_types type) {
|
||||
ROCSHMEM_OP op, ro_net_types type) {
|
||||
queue->flush_hdp();
|
||||
|
||||
auto *bp{backend_proxy->get()};
|
||||
|
||||
@@ -49,18 +49,18 @@ class MPITransport : public Transport {
|
||||
TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) override;
|
||||
rocshmem_team_t *new_team) override;
|
||||
|
||||
void barrier(int blockId, int threadId, bool blocking,
|
||||
MPI_Comm team) override;
|
||||
|
||||
void reduction(void *dst, void *src, int size, int pe, int win_id,
|
||||
int blockId, int start, int logPstride, int sizePE, void *pWrk,
|
||||
long *pSync, ROC_SHMEM_OP op, ro_net_types type,
|
||||
long *pSync, ROCSHMEM_OP op, ro_net_types type,
|
||||
int threadId, bool blocking) override;
|
||||
|
||||
void team_reduction(void *dst, void *src, int size, int win_id, int blockId,
|
||||
MPI_Comm team, ROC_SHMEM_OP op, ro_net_types type,
|
||||
MPI_Comm team, ROCSHMEM_OP op, ro_net_types type,
|
||||
int threadId, bool blocking) override;
|
||||
|
||||
void broadcast(void *dst, void *src, int size, int pe, int win_id,
|
||||
@@ -116,7 +116,7 @@ class MPITransport : public Transport {
|
||||
int threadId, bool blocking, bool inline_data = false) override;
|
||||
|
||||
void amoFOP(void *dst, void *src, void *val, int pe, int win_id, int blockId,
|
||||
int threadId, bool blocking, ROC_SHMEM_OP op,
|
||||
int threadId, bool blocking, ROCSHMEM_OP op,
|
||||
ro_net_types type) override;
|
||||
|
||||
void amoFCAS(void *dst, void *src, void *val, int pe, int win_id, int blockId,
|
||||
@@ -193,7 +193,7 @@ class MPITransport : public Transport {
|
||||
|
||||
void submitRequestsToMPI();
|
||||
|
||||
MPI_Op get_mpi_op(ROC_SHMEM_OP op);
|
||||
MPI_Op get_mpi_op(ROCSHMEM_OP op);
|
||||
|
||||
Queue *queue{nullptr};
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "backend_proxy.hpp"
|
||||
#include "ro_net_team.hpp"
|
||||
|
||||
@@ -48,18 +48,18 @@ class Transport {
|
||||
TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
roc_shmem_team_t *new_team) = 0;
|
||||
rocshmem_team_t *new_team) = 0;
|
||||
|
||||
virtual void barrier(int wg_id, int threadId, bool blocking,
|
||||
MPI_Comm team) = 0;
|
||||
|
||||
virtual void reduction(void *dst, void *src, int size, int pe, int win_id,
|
||||
int wg_id, int start, int logPstride, int sizePE,
|
||||
void *pWrk, long *pSync, ROC_SHMEM_OP op,
|
||||
void *pWrk, long *pSync, ROCSHMEM_OP op,
|
||||
ro_net_types type, int threadId, bool blocking) = 0;
|
||||
|
||||
virtual void team_reduction(void *dst, void *src, int size, int win_id,
|
||||
int wg_id, MPI_Comm team, ROC_SHMEM_OP op,
|
||||
int wg_id, MPI_Comm team, ROCSHMEM_OP op,
|
||||
ro_net_types type, int threadId,
|
||||
bool blocking) = 0;
|
||||
|
||||
@@ -89,7 +89,7 @@ class Transport {
|
||||
int wg_id, int threadId, bool blocking) = 0;
|
||||
|
||||
virtual void amoFOP(void *dst, void *src, void *val, int pe, int win_id,
|
||||
int wg_id, int threadId, bool blocking, ROC_SHMEM_OP op,
|
||||
int wg_id, int threadId, bool blocking, ROCSHMEM_OP op,
|
||||
ro_net_types type) = 0;
|
||||
|
||||
virtual void amoFCAS(void *dst, void *src, void *val, int pe, int win_id,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -20,14 +20,14 @@
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_ROC_SHMEM_CALC_HPP_
|
||||
#define LIBRARY_SRC_ROC_SHMEM_CALC_HPP_
|
||||
#ifndef LIBRARY_SRC_ROCSHMEM_CALC_HPP_
|
||||
#define LIBRARY_SRC_ROCSHMEM_CALC_HPP_
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
// clang-format off
|
||||
NOWARN(-Wunused-parameter,
|
||||
template <ROC_SHMEM_OP Op>
|
||||
template <ROCSHMEM_OP Op>
|
||||
struct OpWrap {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
@@ -41,7 +41,7 @@ struct OpWrap {
|
||||
************************** TEMPLATE SPECIALIZATIONS **************************
|
||||
*****************************************************************************/
|
||||
template <>
|
||||
struct OpWrap<ROC_SHMEM_SUM> {
|
||||
struct OpWrap<ROCSHMEM_SUM> {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
dst[i] += src[i];
|
||||
@@ -49,7 +49,7 @@ struct OpWrap<ROC_SHMEM_SUM> {
|
||||
};
|
||||
|
||||
template <>
|
||||
struct OpWrap<ROC_SHMEM_MAX> {
|
||||
struct OpWrap<ROCSHMEM_MAX> {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
dst[i] = max(dst[i], src[i]);
|
||||
@@ -57,7 +57,7 @@ struct OpWrap<ROC_SHMEM_MAX> {
|
||||
};
|
||||
|
||||
template <>
|
||||
struct OpWrap<ROC_SHMEM_MIN> {
|
||||
struct OpWrap<ROCSHMEM_MIN> {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
dst[i] = min(dst[i], src[i]);
|
||||
@@ -65,7 +65,7 @@ struct OpWrap<ROC_SHMEM_MIN> {
|
||||
};
|
||||
|
||||
template <>
|
||||
struct OpWrap<ROC_SHMEM_PROD> {
|
||||
struct OpWrap<ROCSHMEM_PROD> {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
dst[i] *= src[i];
|
||||
@@ -73,7 +73,7 @@ struct OpWrap<ROC_SHMEM_PROD> {
|
||||
};
|
||||
|
||||
template <>
|
||||
struct OpWrap<ROC_SHMEM_AND> {
|
||||
struct OpWrap<ROCSHMEM_AND> {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
dst[i] &= src[i];
|
||||
@@ -81,7 +81,7 @@ struct OpWrap<ROC_SHMEM_AND> {
|
||||
};
|
||||
|
||||
template <>
|
||||
struct OpWrap<ROC_SHMEM_OR> {
|
||||
struct OpWrap<ROCSHMEM_OR> {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
dst[i] |= src[i];
|
||||
@@ -89,7 +89,7 @@ struct OpWrap<ROC_SHMEM_OR> {
|
||||
};
|
||||
|
||||
template <>
|
||||
struct OpWrap<ROC_SHMEM_XOR> {
|
||||
struct OpWrap<ROCSHMEM_XOR> {
|
||||
template <typename T>
|
||||
__device__ static void Calc(T *src, T *dst, int i) {
|
||||
dst[i] ^= src[i];
|
||||
@@ -97,4 +97,4 @@ struct OpWrap<ROC_SHMEM_XOR> {
|
||||
};
|
||||
|
||||
}
|
||||
#endif // LIBRARY_SRC_ROC_SHMEM_CALC_HPP_
|
||||
#endif // LIBRARY_SRC_ROCSHMEM_CALC_HPP_
|
||||
File diff suppressed because it is too large
Load Diff
+5
-5
@@ -27,12 +27,12 @@
|
||||
|
||||
#include <atomic>
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
enum roc_shmem_stats {
|
||||
enum rocshmem_stats {
|
||||
NUM_PUT = 0,
|
||||
NUM_PUT_NBI,
|
||||
NUM_P,
|
||||
@@ -85,7 +85,7 @@ enum roc_shmem_stats {
|
||||
NUM_STATS
|
||||
};
|
||||
|
||||
enum roc_shmem_host_stats {
|
||||
enum rocshmem_host_stats {
|
||||
NUM_HOST_PUT = 0,
|
||||
NUM_HOST_PUT_NBI,
|
||||
NUM_HOST_P,
|
||||
@@ -135,10 +135,10 @@ class Stats {
|
||||
StatType stats[I] = {0};
|
||||
|
||||
public:
|
||||
__device__ uint64_t startTimer() const { return roc_shmem_timer(); }
|
||||
__device__ uint64_t startTimer() const { return rocshmem_timer(); }
|
||||
|
||||
__device__ void endTimer(uint64_t start, int index) {
|
||||
incStat(index, roc_shmem_timer() - start);
|
||||
incStat(index, rocshmem_timer() - start);
|
||||
}
|
||||
|
||||
__device__ void incStat(int index, int value = 1) {
|
||||
|
||||
+10
-10
@@ -24,34 +24,34 @@
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "roc_shmem/roc_shmem.hpp"
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "backend_bc.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
roc_shmem_team_t ROC_SHMEM_TEAM_WORLD;
|
||||
rocshmem_team_t ROCSHMEM_TEAM_WORLD;
|
||||
|
||||
__host__ __device__ Team* get_internal_team(roc_shmem_team_t team) {
|
||||
__host__ __device__ Team* get_internal_team(rocshmem_team_t team) {
|
||||
return reinterpret_cast<Team*>(team);
|
||||
}
|
||||
|
||||
GPUIBTeam* get_internal_gpu_ib_team(roc_shmem_team_t team) {
|
||||
GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team) {
|
||||
return reinterpret_cast<GPUIBTeam*>(team);
|
||||
}
|
||||
|
||||
ROTeam* get_internal_ro_team(roc_shmem_team_t team) {
|
||||
ROTeam* get_internal_ro_team(rocshmem_team_t team) {
|
||||
return reinterpret_cast<ROTeam*>(team);
|
||||
}
|
||||
|
||||
IPCTeam* get_internal_ipc_team(roc_shmem_team_t team) {
|
||||
IPCTeam* get_internal_ipc_team(rocshmem_team_t team) {
|
||||
return reinterpret_cast<IPCTeam*>(team);
|
||||
}
|
||||
|
||||
__host__ __device__ int team_translate_pe(roc_shmem_team_t src_team, int src_pe,
|
||||
roc_shmem_team_t dst_team) {
|
||||
if (src_team == ROC_SHMEM_TEAM_INVALID ||
|
||||
dst_team == ROC_SHMEM_TEAM_INVALID) {
|
||||
__host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe,
|
||||
rocshmem_team_t dst_team) {
|
||||
if (src_team == ROCSHMEM_TEAM_INVALID ||
|
||||
dst_team == ROCSHMEM_TEAM_INVALID) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user