Removed GPU_IB (#59)
This commit is contained in:
@@ -39,9 +39,7 @@ endif()
|
||||
###############################################################################
|
||||
option(DEBUG "Enable debug trace" OFF)
|
||||
option(PROFILE "Enable statistics and timing support" OFF)
|
||||
option(USE_GPU_IB "Enable GPU_IB conduit." ON)
|
||||
option(USE_RO "Enable RO conduit." ON)
|
||||
option(USE_DC "Enable IB dynamically connected transport (DC)" OFF)
|
||||
option(USE_IPC "Enable IPC support (using HIP)" OFF)
|
||||
option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
|
||||
option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
|
||||
@@ -158,10 +156,6 @@ if (NOT BUILD_TESTS_ONLY)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hsa-runtime64 REQUIRED)
|
||||
|
||||
if (USE_GPU_IB)
|
||||
find_package(Ibverbs REQUIRED)
|
||||
endif()
|
||||
|
||||
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
|
||||
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
|
||||
find_package(Threads REQUIRED)
|
||||
@@ -188,20 +182,6 @@ if (NOT BUILD_TESTS_ONLY)
|
||||
hip::host
|
||||
hsa-runtime64::hsa-runtime64
|
||||
)
|
||||
|
||||
if (USE_GPU_IB)
|
||||
target_include_directories(
|
||||
${PROJECT_NAME}
|
||||
PUBLIC
|
||||
${IBVERBS_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
target_link_libraries(
|
||||
${PROJECT_NAME}
|
||||
PUBLIC
|
||||
${IBVERBS_LIBRARIES}
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
###############################################################################
|
||||
|
||||
@@ -7,11 +7,11 @@ code complexity and enables more fine-grained communication/computation
|
||||
overlap than traditional host-driven networking.
|
||||
rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories.
|
||||
|
||||
There are currently three backends for rocSHMEM;
|
||||
IPC, Reverse Offload (RO), and GPU-IB.
|
||||
There are currently two backends for rocSHMEM;
|
||||
IPC and Reverse Offload (RO).
|
||||
The backends primarily differ in their implementations of intra-kernel networking.
|
||||
Currently, only the IPC backend is supported.
|
||||
The RO and GPU-IB backends are provided as-is with
|
||||
The RO backend is provided as-is with
|
||||
no guarantees of support from AMD or AMD Research.
|
||||
|
||||
The IPC backend implements communication primitives using load/store operations issued from the GPU.
|
||||
@@ -21,10 +21,6 @@ to the host-side runtime, which calls into a traditional MPI or OpenSHMEM
|
||||
implementation. This forwarding of requests is transparent to the
|
||||
programmer, who only sees the GPU-side interface.
|
||||
|
||||
The GPU InfiniBand (GPU-IB) backend implements a lightweight InfiniBand verbs interface
|
||||
on the GPU. The GPU itself is responsible for building commands and ringing
|
||||
the doorbell on the NIC to send network commands.
|
||||
|
||||
## Requirements
|
||||
|
||||
rocSHMEM base requirements:
|
||||
|
||||
@@ -1,62 +0,0 @@
|
||||
###############################################################################
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to
|
||||
# deal in the Software without restriction, including without limitation the
|
||||
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
# sell copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
###############################################################################
|
||||
|
||||
find_package(PkgConfig REQUIRED QUIET)
|
||||
pkg_check_modules(PC_IBVERBS QUIET libibverbs)
|
||||
|
||||
find_path(
|
||||
IBVERBS_INCLUDE_DIR infiniband/verbs.h
|
||||
HINTS ${PC_IBVERBS_INCLUDEDIR} ${PC_IBVERBS_INCLUDE_DIRS}
|
||||
PATH_SUFFIXES include
|
||||
)
|
||||
|
||||
find_library(
|
||||
IBVERBS_LIBRARY
|
||||
NAMES ibverbs libibverbs
|
||||
HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS}
|
||||
PATH_SUFFIXES lib lib64
|
||||
)
|
||||
|
||||
find_library(
|
||||
MLX5_LIBRARY
|
||||
NAMES mlx5 libmlx5
|
||||
HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS}
|
||||
PATH_SUFFIXES lib lib64
|
||||
)
|
||||
|
||||
set(
|
||||
IBVERBS_LIBRARIES
|
||||
${IBVERBS_LIBRARY} ${MLX5_LIBRARY}
|
||||
CACHE INTERNAL ""
|
||||
)
|
||||
|
||||
set(
|
||||
IBVERBS_INCLUDE_DIRS
|
||||
${IBVERBS_INCLUDE_DIR}
|
||||
CACHE INTERNAL ""
|
||||
)
|
||||
|
||||
find_package_handle_standard_args(
|
||||
Ibverbs DEFAULT_MSG IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR
|
||||
)
|
||||
|
||||
mark_as_advanced(IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR)
|
||||
@@ -1,8 +1,6 @@
|
||||
#cmakedefine DEBUG
|
||||
#cmakedefine PROFILE
|
||||
#cmakedefine USE_GPU_IB
|
||||
#cmakedefine USE_RO
|
||||
#cmakedefine USE_DC
|
||||
#cmakedefine USE_IPC
|
||||
#cmakedefine USE_THREADS
|
||||
#cmakedefine USE_SHARED_CTX
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=ON \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DDEBUG=ON \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=ON \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=ON \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=ON \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=ON \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=ON \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -17,9 +17,7 @@ cmake \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=OFF \
|
||||
-DUSE_RO=OFF \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
|
||||
@@ -22,9 +22,7 @@ cmake \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=OFF \
|
||||
-DUSE_RO=OFF \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=ON \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,28 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_COHERENT_HEAP=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DDEBUG=ON \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,28 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_MANAGED_HEAP=ON \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,28 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DDEBUG=ON \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_MANAGED_HEAP=ON \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=ON \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,30 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_SINGLE_NODE=ON \
|
||||
-DUSE_HOST_SIDE_HDP_FLUSH=ON\
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -1,30 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
set -e
|
||||
|
||||
if [ -z $1 ]
|
||||
then
|
||||
install_path=~/rocshmem
|
||||
else
|
||||
install_path=$1
|
||||
fi
|
||||
|
||||
src_path=$(dirname "$(realpath $0)")/../../
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DCMAKE_INSTALL_PREFIX=$install_path \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=ON \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=OFF \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_SINGLE_NODE=ON \
|
||||
-DUSE_HOST_SIDE_HDP_FLUSH=ON\
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
@@ -17,8 +17,6 @@ cmake \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=OFF \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
|
||||
@@ -17,8 +17,6 @@ cmake \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=OFF \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
|
||||
@@ -17,8 +17,6 @@ cmake \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=OFF \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_GPU_IB=OFF \
|
||||
-DUSE_DC=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
|
||||
@@ -145,12 +145,9 @@ def determine_algos_from_library_config_type(config):
|
||||
if config['algorithms']:
|
||||
return config
|
||||
|
||||
gpu_ib = re.match('^[rd]c_', config['library_build_config_type'])
|
||||
thread_single = re.match('.*single.*', config['library_build_config_type'])
|
||||
|
||||
if not gpu_ib:
|
||||
config['algorithms'] = reverse_offload_algorithms
|
||||
elif thread_single:
|
||||
if thread_single:
|
||||
config['algorithms'] = single_thread_algorithms
|
||||
else:
|
||||
config['algorithms'] = multi_thread_algorithms
|
||||
|
||||
+1
-3
@@ -58,9 +58,7 @@ target_compile_options(
|
||||
###############################################################################
|
||||
# ROCSHMEM TARGET FOR BACKENDS
|
||||
###############################################################################
|
||||
IF (USE_GPU_IB)
|
||||
add_subdirectory(gpu_ib)
|
||||
ELSEIF(USE_RO)
|
||||
IF (USE_RO)
|
||||
add_subdirectory(reverse_offload)
|
||||
ELSE()
|
||||
add_subdirectory(ipc)
|
||||
|
||||
+3
-9
@@ -25,9 +25,7 @@
|
||||
#include "backend_type.hpp"
|
||||
#include "context_incl.hpp"
|
||||
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/backend_ib.hpp"
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#include "reverse_offload/backend_ro.hpp"
|
||||
#else
|
||||
#include "ipc/backend_ipc.hpp"
|
||||
@@ -203,9 +201,7 @@ void Backend::reset_stats() {
|
||||
}
|
||||
|
||||
__device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
|
||||
#ifdef USE_GPU_IB
|
||||
return static_cast<GPUIBBackend*>(this)->create_ctx(option, ctx);
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
return static_cast<ROBackend*>(this)->create_ctx(option, ctx);
|
||||
#else
|
||||
return static_cast<IPCBackend*>(this)->create_ctx(option, ctx);
|
||||
@@ -213,9 +209,7 @@ __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
|
||||
}
|
||||
|
||||
__device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) {
|
||||
#ifdef USE_GPU_IB
|
||||
static_cast<GPUIBBackend*>(this)->destroy_ctx(ctx);
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
static_cast<ROBackend*>(this)->destroy_ctx(ctx);
|
||||
#else
|
||||
static_cast<IPCBackend*>(this)->destroy_ctx(ctx);
|
||||
|
||||
+1
-1
@@ -278,7 +278,7 @@ class Backend {
|
||||
* rely on the normal inheritance mechanism to tailor behavior for
|
||||
* derived backend types.
|
||||
*/
|
||||
BackendType type{BackendType::GPU_IB_BACKEND};
|
||||
BackendType type{BackendType::RO_BACKEND};
|
||||
|
||||
/**
|
||||
* @brief Dumps derived class statistics.
|
||||
|
||||
+6
-24
@@ -44,7 +44,7 @@ namespace rocshmem {
|
||||
* @note Derived classes which use Backend as a base class must add
|
||||
* themselves to this enum class to support static polymorphism.
|
||||
*/
|
||||
enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
|
||||
enum class BackendType { RO_BACKEND, IPC_BACKEND };
|
||||
|
||||
/**
|
||||
* @brief Helper macro for some dispatch calls
|
||||
@@ -54,10 +54,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
|
||||
/**
|
||||
* @brief Device static dispatch method call.
|
||||
*/
|
||||
#ifdef USE_GPU_IB
|
||||
#define DISPATCH(Func) \
|
||||
static_cast<GPUIBContext *>(this)->Func;
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#define DISPATCH(Func) \
|
||||
static_cast<ROContext *>(this)->Func;
|
||||
#else
|
||||
@@ -68,11 +65,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
|
||||
/**
|
||||
* @brief Device static dispatch method call with a return value.
|
||||
*/
|
||||
#ifdef USE_GPU_IB
|
||||
#define DISPATCH_RET(Func) \
|
||||
auto ret_val = static_cast<GPUIBContext *>(this)->Func; \
|
||||
return ret_val;
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#define DISPATCH_RET(Func) \
|
||||
auto ret_val = static_cast<ROContext *>(this)->Func; \
|
||||
return ret_val;
|
||||
@@ -85,12 +78,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
|
||||
/**
|
||||
* @brief Device static dispatch method call with a return type of pointer.
|
||||
*/
|
||||
#ifdef USE_GPU_IB
|
||||
#define DISPATCH_RET_PTR(Func) \
|
||||
void *ret_val{nullptr}; \
|
||||
ret_val = static_cast<GPUIBContext *>(this)->Func; \
|
||||
return ret_val;
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#define DISPATCH_RET_PTR(Func) \
|
||||
void *ret_val{nullptr}; \
|
||||
ret_val = static_cast<ROContext *>(this)->Func; \
|
||||
@@ -109,9 +97,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
|
||||
* MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and
|
||||
* threading semantics of collectives in OpenSHMEM match those of MPI.
|
||||
*/
|
||||
#ifdef USE_GPU_IB
|
||||
#define HOST_DISPATCH(Func) static_cast<GPUIBHostContext *>(this)->Func;
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#define HOST_DISPATCH(Func) static_cast<ROHostContext *>(this)->Func;
|
||||
#else
|
||||
#define HOST_DISPATCH(Func) static_cast<IPCHostContext *>(this)->Func;
|
||||
@@ -124,11 +110,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
|
||||
* threading semantics of collectives in OpenSHMEM match those of MPI.
|
||||
*/
|
||||
|
||||
#ifdef USE_GPU_IB
|
||||
#define HOST_DISPATCH_RET(Func) \
|
||||
auto ret_val = static_cast<GPUIBHostContext *>(this)->Func; \
|
||||
return ret_val;
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#define HOST_DISPATCH_RET(Func) \
|
||||
auto ret_val = static_cast<ROHostContext *>(this)->Func; \
|
||||
return ret_val;
|
||||
|
||||
@@ -26,10 +26,7 @@
|
||||
#include "context.hpp"
|
||||
#include "context_tmpl_device.hpp"
|
||||
#include "context_tmpl_host.hpp"
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/context_ib_device.hpp"
|
||||
#include "gpu_ib/context_ib_host.hpp"
|
||||
#elif defined (USE_RO)
|
||||
#ifdef USE_RO
|
||||
#include "reverse_offload/context_ro_device.hpp"
|
||||
#include "reverse_offload/context_ro_host.hpp"
|
||||
#else
|
||||
|
||||
@@ -25,9 +25,7 @@
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_type.hpp"
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/context_ib_device.hpp"
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#include "reverse_offload/context_ro_device.hpp"
|
||||
#else
|
||||
#include "ipc/context_ipc_device.hpp"
|
||||
|
||||
@@ -25,9 +25,7 @@
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_type.hpp"
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/context_ib_host.hpp"
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#include "reverse_offload/context_ro_host.hpp"
|
||||
#else
|
||||
#include "ipc/context_ipc_host.hpp"
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
###############################################################################
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to
|
||||
# deal in the Software without restriction, including without limitation the
|
||||
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
# sell copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
###############################################################################
|
||||
|
||||
###############################################################################
|
||||
# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY
|
||||
###############################################################################
|
||||
target_sources(
|
||||
${PROJECT_NAME}
|
||||
PRIVATE
|
||||
backend_ib.cpp
|
||||
connection.cpp
|
||||
connection_policy.cpp
|
||||
context_ib_device.cpp
|
||||
context_ib_device_coll.cpp
|
||||
context_ib_host.cpp
|
||||
debug.cpp
|
||||
dynamic_connection.cpp
|
||||
endian.cpp
|
||||
gpu_ib_team.cpp
|
||||
network_policy.cpp
|
||||
qe_dumper.cpp
|
||||
queue_pair.cpp
|
||||
reliable_connection.cpp
|
||||
segment_builder.cpp
|
||||
thread_policy.cpp
|
||||
)
|
||||
@@ -1,493 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "backend_ib.hpp"
|
||||
|
||||
#include <endian.h>
|
||||
#include <mpi.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <mutex> // NOLINT(build/c++11)
|
||||
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "gpu_ib_team.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
#include "../host/host.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
#define NET_CHECK(cmd) \
|
||||
{ \
|
||||
if (cmd != MPI_SUCCESS) { \
|
||||
fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
|
||||
|
||||
rocshmem_team_t get_external_team(GPUIBTeam *team) {
|
||||
return reinterpret_cast<rocshmem_team_t>(team);
|
||||
}
|
||||
|
||||
int get_ls_non_zero_bit(char *bitmask, int mask_length) {
|
||||
int position = -1;
|
||||
|
||||
for (int bit_i = 0; bit_i < mask_length; bit_i++) {
|
||||
int byte_i = bit_i / CHAR_BIT;
|
||||
if (bitmask[byte_i] & (1 << (bit_i % CHAR_BIT))) {
|
||||
position = bit_i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return position;
|
||||
}
|
||||
|
||||
GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
|
||||
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
|
||||
std::stringstream sstream(maximum_num_contexts_str);
|
||||
sstream >> maximum_num_contexts_;
|
||||
}
|
||||
num_blocks_ = maximum_num_contexts_;
|
||||
|
||||
init_mpi_once(comm);
|
||||
|
||||
type = BackendType::GPU_IB_BACKEND;
|
||||
|
||||
NET_CHECK(MPI_Comm_dup(backend_comm, &gpu_ib_comm_world));
|
||||
NET_CHECK(MPI_Comm_size(gpu_ib_comm_world, &num_pes));
|
||||
NET_CHECK(MPI_Comm_rank(gpu_ib_comm_world, &my_pe));
|
||||
|
||||
/* Initialize the host interface */
|
||||
host_interface =
|
||||
new HostInterface(hdp_proxy_.get(), gpu_ib_comm_world, &heap);
|
||||
|
||||
/*
|
||||
* Construct default host context independently of the
|
||||
* default device context (done in the async thread)
|
||||
* so that host operations can execute regardless of
|
||||
* device operations.
|
||||
*/
|
||||
setup_default_host_ctx();
|
||||
|
||||
setup_team_world();
|
||||
|
||||
rocshmem_collective_init();
|
||||
|
||||
teams_init();
|
||||
|
||||
// MPI_Comm_dup(gpu_ib_comm_world, &thread_comm);
|
||||
thread_comm = gpu_ib_comm_world;
|
||||
|
||||
NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
|
||||
|
||||
worker_thread_exit = false;
|
||||
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
hdp_gpu_cpu_flush_flag_ =
|
||||
static_cast<unsigned int *>(rocshmem_malloc(sizeof(unsigned int)));
|
||||
hdp_policy->set_flush_polling_ptr(hdp_gpu_cpu_flush_flag_);
|
||||
hdp_flush_worker_thread = std::thread(&GPUIBBackend::hdp_flush_poll, this);
|
||||
|
||||
// We can now initialize and set the HDP window in the host interface
|
||||
host_interface->create_hdp_window();
|
||||
|
||||
#endif
|
||||
|
||||
// commenting out the async thread as there is some issues with ROCm
|
||||
// this makes the CPU init blocking
|
||||
// async_thread_ = thread_spawn(this);
|
||||
thread_func_internal(this);
|
||||
}
|
||||
|
||||
__device__ bool GPUIBBackend::create_ctx(int64_t options,
|
||||
rocshmem_ctx_t *ctx) {
|
||||
GPUIBContext *ctx_;
|
||||
|
||||
auto pop_result = ctx_free_list.get()->pop_front();
|
||||
if (!pop_result.success) {
|
||||
return false;
|
||||
}
|
||||
ctx_ = pop_result.value;
|
||||
ctx->ctx_opaque = ctx_;
|
||||
return true;
|
||||
}
|
||||
|
||||
void GPUIBBackend::ctx_create(int64_t options, void **ctx) {
|
||||
GPUIBHostContext *new_ctx = nullptr;
|
||||
|
||||
new_ctx = new GPUIBHostContext(this, options);
|
||||
|
||||
*ctx = new_ctx;
|
||||
}
|
||||
|
||||
GPUIBHostContext *get_internal_gpu_ib_ctx(Context *ctx) {
|
||||
return reinterpret_cast<GPUIBHostContext *>(ctx);
|
||||
}
|
||||
|
||||
void GPUIBBackend::ctx_destroy(Context *ctx) {
|
||||
GPUIBHostContext *gpu_ib_host_ctx = get_internal_gpu_ib_ctx(ctx);
|
||||
delete gpu_ib_host_ctx;
|
||||
}
|
||||
|
||||
__device__ void GPUIBBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
|
||||
ctx_free_list.get()->push_back(static_cast<GPUIBContext *>(ctx->ctx_opaque));
|
||||
}
|
||||
|
||||
GPUIBBackend::~GPUIBBackend() {
|
||||
// need to get this back once ROCm is fixed
|
||||
// async_thread_.join();
|
||||
|
||||
worker_thread_exit = true;
|
||||
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
hdp_flush_worker_thread.join();
|
||||
hdp_policy->set_flush_polling_ptr(nullptr);
|
||||
rocshmem_free(hdp_gpu_cpu_flush_flag_);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Destroy teams infrastructure
|
||||
* and team world
|
||||
*/
|
||||
teams_destroy();
|
||||
auto *team_world{team_tracker.get_team_world()};
|
||||
team_world->~Team();
|
||||
CHECK_HIP(hipFree(team_world));
|
||||
|
||||
delete default_host_ctx_;
|
||||
|
||||
NET_CHECK(MPI_Comm_free(&gpu_ib_comm_world));
|
||||
|
||||
CHECK_HIP(hipFree(default_ctx_->device_qp_proxy));
|
||||
CHECK_HIP(hipFree(default_ctx_));
|
||||
default_ctx_ = nullptr;
|
||||
|
||||
delete host_interface;
|
||||
host_interface = nullptr;
|
||||
|
||||
networkImpl.networkHostFinalize();
|
||||
|
||||
CHECK_HIP(hipFree(ctx_array));
|
||||
}
|
||||
|
||||
__host__ void GPUIBBackend::global_exit(int status) {
|
||||
MPI_Abort(gpu_ib_comm_world, status);
|
||||
}
|
||||
|
||||
void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team,
|
||||
TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
rocshmem_team_t *new_team) {
|
||||
/**
|
||||
* Read the bit mask and find out a common index into
|
||||
* the pool of available work arrays.
|
||||
*/
|
||||
NET_CHECK(MPI_Allreduce(pool_bitmask_, reduced_bitmask_, bitmask_size_,
|
||||
MPI_CHAR, MPI_BAND, team_comm));
|
||||
|
||||
/* Pick the least significant non-zero bit (logical layout) in the reduced
|
||||
* bitmask */
|
||||
auto max_num_teams{team_tracker.get_max_num_teams()};
|
||||
int common_index = get_ls_non_zero_bit(reduced_bitmask_, max_num_teams);
|
||||
if (common_index < 0) {
|
||||
/* No team available */
|
||||
abort();
|
||||
}
|
||||
|
||||
/* Mark the team as taken (by unsetting the bit in the pool bitmask) */
|
||||
int byte = common_index / CHAR_BIT;
|
||||
pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT));
|
||||
|
||||
/**
|
||||
* Allocate device-side memory for team_world and
|
||||
* construct a GPU_IB team in it
|
||||
*/
|
||||
GPUIBTeam *new_team_obj;
|
||||
CHECK_HIP(hipMalloc(&new_team_obj, sizeof(GPUIBTeam)));
|
||||
new (new_team_obj)
|
||||
GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes,
|
||||
my_pe_in_new_team, team_comm, common_index);
|
||||
|
||||
*new_team = get_external_team(new_team_obj);
|
||||
}
|
||||
|
||||
void GPUIBBackend::team_destroy(rocshmem_team_t team) {
|
||||
GPUIBTeam *team_obj = get_internal_gpu_ib_team(team);
|
||||
|
||||
/* Mark the pool as available */
|
||||
int bit = team_obj->pool_index_;
|
||||
int byte_i = bit / CHAR_BIT;
|
||||
pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT);
|
||||
|
||||
team_obj->~GPUIBTeam();
|
||||
CHECK_HIP(hipFree(team_obj));
|
||||
}
|
||||
|
||||
void GPUIBBackend::dump_backend_stats() {
|
||||
networkImpl.dump_backend_stats(&globalStats);
|
||||
}
|
||||
|
||||
void GPUIBBackend::reset_backend_stats() { networkImpl.reset_backend_stats(); }
|
||||
|
||||
void GPUIBBackend::initialize_ipc() {
|
||||
ipcImpl.ipcHostInit(my_pe, heap.get_heap_bases(), thread_comm);
|
||||
}
|
||||
|
||||
void GPUIBBackend::initialize_network() { networkImpl.networkHostSetup(this); }
|
||||
|
||||
void GPUIBBackend::setup_default_host_ctx() {
|
||||
default_host_ctx_ = new GPUIBHostContext(this, 0);
|
||||
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_;
|
||||
}
|
||||
|
||||
void GPUIBBackend::setup_ctxs() {
|
||||
/*
|
||||
* Allocate device-side memory for all context and construct an
|
||||
* InfiniBand context in it.
|
||||
*/
|
||||
CHECK_HIP(
|
||||
hipMalloc(&ctx_array, sizeof(GPUIBContext) * maximum_num_contexts_));
|
||||
for (int i = 0; i < maximum_num_contexts_; i++) {
|
||||
new (&ctx_array[i]) GPUIBContext(this, false, i);
|
||||
ctx_free_list.get()->push_back(ctx_array + i);
|
||||
}
|
||||
}
|
||||
|
||||
void GPUIBBackend::setup_default_ctx() {
|
||||
/*
|
||||
* Allocate device-side memory for default context and construct an
|
||||
* InfiniBand context in it.
|
||||
*/
|
||||
CHECK_HIP(hipMalloc(&default_ctx_, sizeof(GPUIBContext)));
|
||||
new (default_ctx_) GPUIBContext(this, true, 0);
|
||||
|
||||
/*
|
||||
* Set the ROCSHMEM_CTX_DEFAULT in constant memory.
|
||||
*/
|
||||
int *symbol_address;
|
||||
CHECK_HIP(hipGetSymbolAddress(reinterpret_cast<void **>(&symbol_address),
|
||||
HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT)));
|
||||
|
||||
TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;
|
||||
rocshmem_ctx_t ctx_default_host{default_ctx_, tinfo};
|
||||
|
||||
hipStream_t stream;
|
||||
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
CHECK_HIP(hipMemcpyAsync(symbol_address, &ctx_default_host,
|
||||
sizeof(rocshmem_ctx_t), hipMemcpyDefault, stream));
|
||||
CHECK_HIP(hipStreamSynchronize(stream));
|
||||
CHECK_HIP(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void GPUIBBackend::setup_team_world() {
|
||||
TeamInfo *team_info_wrt_parent, *team_info_wrt_world;
|
||||
|
||||
/**
|
||||
* Allocate device-side memory for team_world and construct a
|
||||
* GPU_IB team in it.
|
||||
*/
|
||||
CHECK_HIP(hipMalloc(&team_info_wrt_parent, sizeof(TeamInfo)));
|
||||
CHECK_HIP(hipMalloc(&team_info_wrt_world, sizeof(TeamInfo)));
|
||||
|
||||
new (team_info_wrt_parent) TeamInfo(nullptr, 0, 1, num_pes);
|
||||
new (team_info_wrt_world) TeamInfo(nullptr, 0, 1, num_pes);
|
||||
|
||||
MPI_Comm team_world_comm;
|
||||
NET_CHECK(MPI_Comm_dup(gpu_ib_comm_world, &team_world_comm));
|
||||
|
||||
GPUIBTeam *team_world{nullptr};
|
||||
CHECK_HIP(hipMalloc(&team_world, sizeof(GPUIBTeam)));
|
||||
new (team_world) GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world,
|
||||
num_pes, my_pe, team_world_comm, 0);
|
||||
team_tracker.set_team_world(team_world);
|
||||
|
||||
/**
|
||||
* Copy the address to ROCSHMEM_TEAM_WORLD.
|
||||
*/
|
||||
ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
|
||||
}
|
||||
|
||||
void GPUIBBackend::init_mpi_once(MPI_Comm comm) {
|
||||
static std::mutex init_mutex;
|
||||
const std::lock_guard<std::mutex> lock(init_mutex);
|
||||
|
||||
int init_done = 0;
|
||||
NET_CHECK(MPI_Initialized(&init_done));
|
||||
if (init_done == 0) {
|
||||
int provided;
|
||||
NET_CHECK(
|
||||
MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided));
|
||||
}
|
||||
if (comm == MPI_COMM_NULL) {
|
||||
NET_CHECK(MPI_Comm_dup(MPI_COMM_WORLD, &backend_comm));
|
||||
} else {
|
||||
NET_CHECK(MPI_Comm_dup(comm, &backend_comm));
|
||||
}
|
||||
}
|
||||
|
||||
std::thread GPUIBBackend::thread_spawn(GPUIBBackend *b) {
|
||||
return std::thread(&GPUIBBackend::thread_func_internal, this, b);
|
||||
}
|
||||
|
||||
void GPUIBBackend::thread_func_internal(GPUIBBackend *b) {
|
||||
CHECK_HIP(hipSetDevice(hip_dev_id));
|
||||
|
||||
b->initialize_ipc();
|
||||
b->initialize_network();
|
||||
b->setup_ctxs();
|
||||
b->setup_default_ctx();
|
||||
*(b->done_init) = 1;
|
||||
}
|
||||
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
void GPUIBBackend::hdp_flush_poll() {
|
||||
while (!worker_thread_exit) {
|
||||
if (hdp_policy->has_active_flush_request()) {
|
||||
hdp_policy->hdp_flush();
|
||||
hdp_policy->clear_active_flush_flag();
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void GPUIBBackend::teams_init() {
|
||||
/**
|
||||
* Allocate pools for the teams sync and work arrary from the SHEAP.
|
||||
*/
|
||||
auto max_num_teams{team_tracker.get_max_num_teams()};
|
||||
barrier_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams));
|
||||
reduce_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams));
|
||||
bcast_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams));
|
||||
alltoall_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
|
||||
sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE * max_num_teams));
|
||||
|
||||
/* Accommodating for largest possible data type for pWrk */
|
||||
pWrk_pool = rocshmem_malloc(
|
||||
sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams);
|
||||
pAta_pool = rocshmem_malloc(sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE *
|
||||
max_num_teams);
|
||||
|
||||
/**
|
||||
* Initialize the sync arrays in the pool with default values.
|
||||
*/
|
||||
long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
|
||||
for (int team_i = 0; team_i < max_num_teams; team_i++) {
|
||||
barrier_pSync = reinterpret_cast<long *>(
|
||||
&barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
|
||||
reduce_pSync = reinterpret_cast<long *>(
|
||||
&reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = reinterpret_cast<long *>(
|
||||
&bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
|
||||
alltoall_pSync = reinterpret_cast<long *>(
|
||||
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
|
||||
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
|
||||
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
|
||||
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
|
||||
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
|
||||
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize bit mask
|
||||
*
|
||||
* Logical:
|
||||
* MSB..........................................................................LSB
|
||||
* Physical: MSB...1st least significant 8 bits...LSB MSB...2nd least
|
||||
* signifant 8 bits...LSB
|
||||
*
|
||||
* Description shows only a 2-byte long mask but idea extends to any
|
||||
* arbitrary size.
|
||||
*/
|
||||
bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1)
|
||||
: (max_num_teams / CHAR_BIT);
|
||||
pool_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
|
||||
reduced_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
|
||||
|
||||
memset(pool_bitmask_, 0, bitmask_size_);
|
||||
memset(reduced_bitmask_, 0, bitmask_size_);
|
||||
/* Set all to available except the 0th one (reserved for TEAM_WORLD) */
|
||||
for (int bit_i = 1; bit_i < max_num_teams; bit_i++) {
|
||||
int byte_i = bit_i / CHAR_BIT;
|
||||
|
||||
pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure that all processing elements have done this before
|
||||
* continuing.
|
||||
*/
|
||||
NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
|
||||
}
|
||||
|
||||
void GPUIBBackend::teams_destroy() {
|
||||
rocshmem_free(barrier_pSync_pool);
|
||||
rocshmem_free(reduce_pSync_pool);
|
||||
rocshmem_free(bcast_pSync_pool);
|
||||
rocshmem_free(alltoall_pSync_pool);
|
||||
rocshmem_free(pWrk_pool);
|
||||
rocshmem_free(pAta_pool);
|
||||
|
||||
free(pool_bitmask_);
|
||||
free(reduced_bitmask_);
|
||||
}
|
||||
|
||||
void GPUIBBackend::rocshmem_collective_init() {
|
||||
/*
|
||||
* Allocate heap space for barrier_sync
|
||||
*/
|
||||
size_t one_sync_size_bytes{sizeof(*barrier_sync)};
|
||||
size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
|
||||
heap.malloc(reinterpret_cast<void **>(&barrier_sync), sync_size_bytes);
|
||||
|
||||
/*
|
||||
* Initialize the barrier synchronization array with default values.
|
||||
*/
|
||||
for (int i = 0; i < num_pes; i++) {
|
||||
barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that all processing elements have done this before
|
||||
* continuing.
|
||||
*/
|
||||
NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,351 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
|
||||
|
||||
#include "../backend_bc.hpp"
|
||||
#include "../containers/free_list_impl.hpp"
|
||||
#include "network_policy.hpp"
|
||||
#include "../hdp_policy.hpp"
|
||||
#include "../hdp_proxy.hpp"
|
||||
#include "../memory/hip_allocator.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class HostInterface;
|
||||
|
||||
/**
|
||||
* @class GPUIBBackend backend.hpp
|
||||
* @brief InfiniBand specific backend.
|
||||
*
|
||||
* The InfiniBand (GPUIB) backend enables the device to enqueue network
|
||||
* requests to InfiniBand queues (with minimal host intervention). The setup
|
||||
* requires some effort from the host, but the device is able to craft
|
||||
* InfiniBand requests and send them on its own.
|
||||
*/
|
||||
class GPUIBBackend : public Backend {
|
||||
public:
|
||||
/**
|
||||
* @copydoc Backend::Backend(unsigned)
|
||||
*/
|
||||
explicit GPUIBBackend(MPI_Comm comm);
|
||||
|
||||
/**
|
||||
* @copydoc Backend::~Backend()
|
||||
*/
|
||||
virtual ~GPUIBBackend();
|
||||
|
||||
/**
|
||||
* @brief Abort the application.
|
||||
*
|
||||
* @param[in] status Exit code.
|
||||
*
|
||||
* @return void.
|
||||
*
|
||||
* @note This routine terminates the entire application.
|
||||
*/
|
||||
void global_exit(int status) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::create_new_team
|
||||
*/
|
||||
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
|
||||
TeamInfo *team_info_wrt_world, int num_pes,
|
||||
int my_pe_in_new_team, MPI_Comm team_comm,
|
||||
rocshmem_team_t *new_team) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::team_destroy(rocshmem_team_t)
|
||||
*/
|
||||
void team_destroy(rocshmem_team_t team) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::ctx_create
|
||||
*/
|
||||
void ctx_create(int64_t options, void **ctx) override;
|
||||
|
||||
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
|
||||
|
||||
/**
|
||||
* @copydoc Backend::ctx_destroy
|
||||
*/
|
||||
void ctx_destroy(Context *ctx) override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::ctx_destroy
|
||||
*/
|
||||
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* @copydoc Backend::dump_backend_stats()
|
||||
*/
|
||||
void dump_backend_stats() override;
|
||||
|
||||
/**
|
||||
* @copydoc Backend::reset_backend_stats()
|
||||
*/
|
||||
void reset_backend_stats() override;
|
||||
|
||||
/**
|
||||
* @brief spawn a new thread to perform the rest of initialization
|
||||
*/
|
||||
std::thread thread_spawn(GPUIBBackend *b);
|
||||
|
||||
/**
|
||||
* @brief overheads for helper thread to run
|
||||
*
|
||||
* @param[in] the thread needs access to the class
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
void thread_func_internal(GPUIBBackend *b);
|
||||
|
||||
/**
|
||||
* @brief initialize MPI.
|
||||
*
|
||||
* GPUIB relies on MPI just to exchange the connection information.
|
||||
*
|
||||
* todo: remove the dependency on MPI and make it generic to PMI-X or just
|
||||
* to OpenSHMEM to have support for both CPU and GPU
|
||||
*/
|
||||
void init_mpi_once(MPI_Comm comm);
|
||||
|
||||
/**
|
||||
* @brief init the network support
|
||||
*/
|
||||
void initialize_network();
|
||||
|
||||
/**
|
||||
* @brief Invokes the IPC policy class initialization method.
|
||||
*
|
||||
* This method delegates Inter Process Communication (IPC)
|
||||
* initialization to the appropriate policy class. The initialization
|
||||
* needs to be exposed to the Backed due to initialization ordering
|
||||
* constraints. (The symmetric heaps needs to be allocated and
|
||||
* initialized before this method can be called.)
|
||||
*
|
||||
* The policy class encapsulates what the initialization process so
|
||||
* refer to that class for more details.
|
||||
*/
|
||||
void initialize_ipc();
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize the ROCSHMEM_CTX_DEFAULT variable.
|
||||
*
|
||||
* @todo The default_ctx member looks unused after it is copied into
|
||||
* the ROCSHMEM_CTX_DEFAULT variable.
|
||||
*/
|
||||
void setup_default_ctx();
|
||||
void setup_ctxs();
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize the default context for host
|
||||
* operations.
|
||||
*/
|
||||
void setup_default_host_ctx();
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize team world.
|
||||
*/
|
||||
void setup_team_world();
|
||||
|
||||
/**
|
||||
* @brief Initialize the resources required to support teams
|
||||
*/
|
||||
void teams_init();
|
||||
|
||||
/**
|
||||
* @brief Destruct the resources required to support teams
|
||||
*/
|
||||
void teams_destroy();
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize barrier operation addresses on
|
||||
* symmetric heap.
|
||||
*
|
||||
* When this method completes, the barrier_sync member will be available
|
||||
* for use.
|
||||
*/
|
||||
void rocshmem_collective_init();
|
||||
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
/**
|
||||
* @brief A service thread routine that flushes the hdp cache on behalf of the
|
||||
* GPU.
|
||||
*/
|
||||
void hdp_flush_poll();
|
||||
|
||||
/**
|
||||
* @brief Workers used to poll on the device hdp flush request.
|
||||
*/
|
||||
std::thread hdp_flush_worker_thread{};
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Signals to the worker threads to exist
|
||||
*/
|
||||
std::atomic<bool> worker_thread_exit{false};
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief The host-facing interface that will be used
|
||||
* by all contexts of the GPUIBBackend
|
||||
*/
|
||||
HostInterface *host_interface{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for raw memory for barrier sync
|
||||
*/
|
||||
long *barrier_pSync_pool{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for raw memory for reduce sync
|
||||
*/
|
||||
long *reduce_pSync_pool{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for raw memory for broadcast sync
|
||||
*/
|
||||
long *bcast_pSync_pool{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for raw memory for alltoall sync
|
||||
*/
|
||||
long *alltoall_pSync_pool{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for raw memory for work
|
||||
*/
|
||||
void *pWrk_pool{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for raw memory for alltoall
|
||||
*/
|
||||
void *pAta_pool{nullptr};
|
||||
|
||||
/**
|
||||
* @brief rocSHMEM's copy of MPI_COMM_WORLD (for interoperability
|
||||
* with orthogonal MPI usage in an MPI+rocSHMEM program).
|
||||
*/
|
||||
MPI_Comm gpu_ib_comm_world{};
|
||||
MPI_Comm backend_comm{};
|
||||
|
||||
/**
|
||||
* @brief Holds number of blocks used in library
|
||||
*/
|
||||
size_t num_blocks_{1};
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Allocates cacheable, device memory for the hdp policy.
|
||||
*
|
||||
* @note Internal data ownership is managed by the proxy
|
||||
*/
|
||||
HdpProxy<HIPAllocator> hdp_proxy_{};
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Policy choice for two HDP implementations.
|
||||
*
|
||||
* @todo Combine HDP related stuff together into a class with a
|
||||
* reasonable interface. The functionality does not need to exist in
|
||||
* multiple pieces in the Backend and QueuePair classes. The hdp_rkey,
|
||||
* hdp_addresses, and hdp_policy fields should all live in the class.
|
||||
*/
|
||||
HdpPolicy *hdp_policy{hdp_proxy_.get()};
|
||||
|
||||
/**
|
||||
* @brief Scratchpad for the internal barrier algorithms.
|
||||
*/
|
||||
int64_t *barrier_sync{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Compile-time configuration policy for network (IB)
|
||||
*
|
||||
*
|
||||
* The configuration option "USE_SINGLE_NODE" can be enabled to not build
|
||||
* with network support.
|
||||
*/
|
||||
NetworkImpl networkImpl{};
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief An array of @ref ROContexts that backs the context FreeList.
|
||||
*/
|
||||
GPUIBContext *ctx_array{nullptr};
|
||||
|
||||
/**
|
||||
* @brief A free-list containing contexts.
|
||||
*/
|
||||
FreeListProxy<HIPAllocator, GPUIBContext *> ctx_free_list{};
|
||||
|
||||
/**
|
||||
* @brief Holds maximum number of contexts used in library
|
||||
*/
|
||||
size_t maximum_num_contexts_{1024};
|
||||
|
||||
/**
|
||||
* @brief The bitmask representing the availability of teams in the pool
|
||||
*/
|
||||
char *pool_bitmask_{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Bitmask to store the reduced result of bitmasks on pariticipating
|
||||
* PEs
|
||||
*
|
||||
* With no thread-safety for this bitmask, multithreaded creation of teams is
|
||||
* not supported.
|
||||
*/
|
||||
char *reduced_bitmask_{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Size of the bitmask
|
||||
*/
|
||||
int bitmask_size_{-1};
|
||||
|
||||
/**
|
||||
* @brief a helper thread to perform the initialization (non-blocking init)
|
||||
*/
|
||||
std::thread async_thread_{};
|
||||
|
||||
/**
|
||||
* @brief Holds a copy of the default context (see OpenSHMEM
|
||||
* specification).
|
||||
*
|
||||
* @todo Remove this member from the backend class. There is another
|
||||
* copy stored in ROCSHMEM_CTX_DEFAULT.
|
||||
*/
|
||||
GPUIBContext *default_ctx_{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Holds a copy of the default context for host functions
|
||||
*/
|
||||
GPUIBHostContext *default_host_ctx_{nullptr};
|
||||
|
||||
unsigned int* hdp_gpu_cpu_flush_flag_;
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
|
||||
@@ -1,54 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#include "dynamic_connection.hpp"
|
||||
#include "reliable_connection.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
// test with different use_ib_hca
|
||||
// test with different heap size
|
||||
// test with different sleep
|
||||
// test with different sq_size
|
||||
|
||||
TEST(DynamicConnect, ToNothing) {
|
||||
DynamicConnection connect;
|
||||
connect.construct_init(1);
|
||||
}
|
||||
|
||||
// test with different num_dcis
|
||||
// test with different num_dcts
|
||||
|
||||
TEST(ReliableConnect, ToNothing) {
|
||||
ReliableConnection connect;
|
||||
connect.construct_init(1);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,431 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "connection.hpp"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include <mutex> // NOLINT(build/c++11)
|
||||
#include <vector>
|
||||
|
||||
#include "backend_ib.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
#include "../util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
int Connection::use_gpu_mem = 0;
|
||||
int Connection::coherent_cq = 0;
|
||||
|
||||
Connection::Connection(GPUIBBackend* b, int k) : backend(b), key_offset(k) {
|
||||
char* value = nullptr;
|
||||
|
||||
if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) {
|
||||
requested_dev = value;
|
||||
}
|
||||
|
||||
if ((value = getenv("ROCSHMEM_SQ_SIZE"))) {
|
||||
sq_size = atoi(value);
|
||||
}
|
||||
|
||||
if ((value = getenv("ROCSHMEM_USE_CQ_GPU_MEM")) != nullptr) {
|
||||
cq_use_gpu_mem = atoi(value);
|
||||
}
|
||||
|
||||
if ((value = getenv("ROCSHMEM_USE_SQ_GPU_MEM")) != nullptr) {
|
||||
sq_use_gpu_mem = atoi(value);
|
||||
}
|
||||
}
|
||||
|
||||
Connection::~Connection() { delete ib_state; }
|
||||
|
||||
void Connection::reg_mr(void* ptr, size_t size, ibv_mr** mr, bool managed) {
|
||||
int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
|
||||
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
|
||||
if (managed) {
|
||||
access |= IBV_ACCESS_ON_DEMAND;
|
||||
}
|
||||
|
||||
*mr = ibv_reg_mr(ib_state->pd, ptr, size, access);
|
||||
|
||||
if (*mr == nullptr) {
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
unsigned Connection::total_number_connections() {
|
||||
int connections;
|
||||
get_remote_conn(&connections);
|
||||
return backend->num_blocks_ * connections;
|
||||
}
|
||||
|
||||
void Connection::initialize(int num_block) {
|
||||
allocate_dynamic_members(num_block);
|
||||
|
||||
int ib_devices{0};
|
||||
dev_list = ibv_get_device_list(&ib_devices);
|
||||
if (dev_list == nullptr) {
|
||||
abort();
|
||||
}
|
||||
|
||||
struct ibv_device* ib_dev = dev_list[0];
|
||||
if (requested_dev != nullptr) {
|
||||
for (int i = 0; i < ib_devices; i++) {
|
||||
const char* select_dev = ibv_get_device_name(dev_list[i]);
|
||||
if (strstr(select_dev, requested_dev) != nullptr) {
|
||||
ib_dev = dev_list[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t port = 1;
|
||||
ib_init(ib_dev, port);
|
||||
|
||||
int hip_dev_id = 0;
|
||||
CHECK_HIP(hipGetDevice(&hip_dev_id));
|
||||
|
||||
int ib_fork_err = ibv_fork_init();
|
||||
if (ib_fork_err != 0) printf("error: ibv_fork_init failed \n");
|
||||
|
||||
sq_post_dv = static_cast<sq_post_dv_t*>(
|
||||
malloc(sizeof(sq_post_dv_t) * total_number_connections()));
|
||||
|
||||
if (sq_post_dv == nullptr) {
|
||||
abort();
|
||||
}
|
||||
|
||||
create_qps(port, backend->my_pe, &ib_state->portinfo);
|
||||
initialize_1(port, num_block);
|
||||
|
||||
MPI_Barrier(backend->thread_comm);
|
||||
free_dynamic_members();
|
||||
}
|
||||
|
||||
void Connection::finalize() {
|
||||
ibv_free_device_list(dev_list);
|
||||
|
||||
int ret = ibv_dereg_mr(backend->networkImpl.heap_mr);
|
||||
if (ret) {
|
||||
abort();
|
||||
}
|
||||
// comment until rocm 4.5
|
||||
// ibv_dereg_mr(backend->networkImpl.hdp_mr);
|
||||
ibv_dereg_mr(backend->networkImpl.mr);
|
||||
}
|
||||
|
||||
void Connection::ib_init(struct ibv_device* ib_dev, uint8_t port) {
|
||||
ib_state = new ib_state_t;
|
||||
if (!ib_state) {
|
||||
abort();
|
||||
}
|
||||
|
||||
ib_state->context = ibv_open_device(ib_dev);
|
||||
if (!ib_state->context) {
|
||||
delete ib_state;
|
||||
abort();
|
||||
}
|
||||
|
||||
ib_state->pd = ibv_alloc_pd(ib_state->context);
|
||||
if (!ib_state->pd) {
|
||||
delete ib_state;
|
||||
abort();
|
||||
}
|
||||
|
||||
ibv_parent_domain_init_attr pattr;
|
||||
init_parent_domain_attr(&pattr);
|
||||
ib_state->pd = ibv_alloc_parent_domain(ib_state->context, &pattr);
|
||||
|
||||
ibv_query_port(ib_state->context, port, &ib_state->portinfo);
|
||||
}
|
||||
|
||||
template <typename StateType>
|
||||
void Connection::try_to_modify_qp(ibv_qp* qp, StateType state) {
|
||||
ibv_modify_qp(qp, &state.exp_qp_attr, state.exp_attr_mask);
|
||||
}
|
||||
|
||||
void Connection::init_qp_status(ibv_qp* qp, uint8_t port) {
|
||||
try_to_modify_qp<InitQPState>(qp, initqp(port));
|
||||
}
|
||||
|
||||
/**
|
||||
* rtr stands for 'ready to receive'
|
||||
*/
|
||||
void Connection::change_status_rtr(ibv_qp* qp, dest_info_t* dest,
|
||||
uint8_t port) {
|
||||
try_to_modify_qp<RtrState>(qp, rtr(dest, port));
|
||||
}
|
||||
|
||||
/**
|
||||
* rts stands for 'ready to send'
|
||||
*/
|
||||
void Connection::change_status_rts(ibv_qp* qp, dest_info_t* dest) {
|
||||
try_to_modify_qp<RtsState>(qp, rts(dest));
|
||||
}
|
||||
|
||||
void Connection::create_qps(uint8_t port, int my_rank,
|
||||
ibv_port_attr* ib_port_att) {
|
||||
create_qps_1();
|
||||
|
||||
ibv_qp_cap cap{};
|
||||
cap.max_send_wr = sq_size;
|
||||
cap.max_send_sge = 1;
|
||||
cap.max_inline_data = 4;
|
||||
|
||||
QPInitAttr qp_init_attr = qpattr(cap);
|
||||
|
||||
size_t qp_size = total_number_connections();
|
||||
cqs.resize(qp_size);
|
||||
qps.resize(qp_size);
|
||||
|
||||
int cqe = qp_init_attr.attr.cap.max_send_wr;
|
||||
for (auto& entry : cqs) {
|
||||
entry = create_cq(ib_state->context, ib_state->pd, cqe);
|
||||
if (!entry) {
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
create_qps_2(port, my_rank, ib_port_att);
|
||||
|
||||
for (int i = 0; i < qps.size(); i++) {
|
||||
qps[i] =
|
||||
create_qp(ib_state->pd, ib_state->context, &qp_init_attr.attr, cqs[i]);
|
||||
if (!qps[i]) {
|
||||
abort();
|
||||
}
|
||||
|
||||
create_qps_3(port, qps[i], i, ib_port_att);
|
||||
}
|
||||
}
|
||||
|
||||
void Connection::initialize_gpu_policy(ConnectionImpl** conn,
|
||||
uint32_t* heap_rkey) {
|
||||
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(conn), sizeof(ConnectionImpl)));
|
||||
new (*conn) ConnectionImpl(this, heap_rkey);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create and write the rdma segment to the SQ
|
||||
*/
|
||||
void Connection::set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address,
|
||||
uint32_t rkey) {
|
||||
rdma->raddr = htobe64(address);
|
||||
rdma->rkey = htobe32(rkey);
|
||||
}
|
||||
|
||||
/*
|
||||
* Retrieve the address of a SQ.
|
||||
* We used this address to write the WQE directly to the SQ.
|
||||
*/
|
||||
uint64_t* Connection::get_address_sq(int i) {
|
||||
mlx5dv_obj mlx_obj;
|
||||
mlx5dv_qp qp_out;
|
||||
|
||||
mlx_obj.qp.in = qps[i];
|
||||
mlx_obj.qp.out = &qp_out;
|
||||
|
||||
mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP);
|
||||
|
||||
return reinterpret_cast<uint64_t*>(qp_out.sq.buf);
|
||||
}
|
||||
|
||||
void* Connection::buf_alloc([[maybe_unused]] struct ibv_pd* pd,
|
||||
[[maybe_unused]] void* pd_context, size_t size,
|
||||
[[maybe_unused]] size_t alignment,
|
||||
[[maybe_unused]] uint64_t resource_type) {
|
||||
if (use_gpu_mem) {
|
||||
void* dev_ptr;
|
||||
if (coherent_cq == 1) {
|
||||
#if defined USE_COHERENT_HEAP
|
||||
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&dev_ptr), size));
|
||||
#else
|
||||
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
|
||||
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
|
||||
hipDeviceMallocUncached));
|
||||
#else
|
||||
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
|
||||
hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
#endif
|
||||
} else {
|
||||
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
|
||||
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
|
||||
hipDeviceMallocUncached));
|
||||
#else
|
||||
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
|
||||
hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
|
||||
}
|
||||
memset(dev_ptr, 0, size);
|
||||
return dev_ptr;
|
||||
}
|
||||
return IBV_ALLOCATOR_USE_DEFAULT;
|
||||
}
|
||||
|
||||
void Connection::buf_release([[maybe_unused]] struct ibv_pd* pd,
|
||||
[[maybe_unused]] void* pd_context, void* ptr,
|
||||
[[maybe_unused]] uint64_t resource_type) {
|
||||
if (use_gpu_mem) {
|
||||
CHECK_HIP(hipFree(ptr));
|
||||
} else {
|
||||
free(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
void Connection::init_parent_domain_attr(ibv_parent_domain_init_attr* attr1) {
|
||||
attr1->pd = ib_state->pd;
|
||||
attr1->td = nullptr;
|
||||
attr1->comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS;
|
||||
attr1->alloc = Connection::buf_alloc;
|
||||
attr1->free = Connection::buf_release;
|
||||
attr1->pd_context = nullptr;
|
||||
}
|
||||
|
||||
ibv_cq* Connection::create_cq(ibv_context* context, ibv_pd* pd, int cqe) {
|
||||
use_gpu_mem = cq_use_gpu_mem;
|
||||
|
||||
ibv_cq_init_attr_ex cq_attr;
|
||||
memset(&cq_attr, 0, sizeof(ibv_cq_init_attr_ex));
|
||||
cq_attr.cqe = cqe;
|
||||
cq_attr.cq_context = nullptr;
|
||||
cq_attr.channel = nullptr;
|
||||
cq_attr.comp_vector = 0;
|
||||
cq_attr.flags = 0; // see ibv_exp_cq_create_flags
|
||||
cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_PD;
|
||||
cq_attr.parent_domain = pd;
|
||||
|
||||
coherent_cq = 1;
|
||||
ibv_cq_ex* cq = ibv_create_cq_ex(context, &cq_attr);
|
||||
coherent_cq = 0;
|
||||
if (!cq) {
|
||||
printf("error in ibv_create_cq_ex: %d %s\n", errno, strerror(errno));
|
||||
return nullptr;
|
||||
}
|
||||
return ibv_cq_ex_to_cq(cq);
|
||||
}
|
||||
|
||||
void Connection::init_gpu_qp_from_connection(QueuePair* gpu_qp,
|
||||
int conn_num) {
|
||||
int hip_dev_id = 0;
|
||||
CHECK_HIP(hipGetDevice(&hip_dev_id));
|
||||
use_gpu_mem = cq_use_gpu_mem;
|
||||
|
||||
mlx5dv_cq cq_out;
|
||||
mlx5dv_obj mlx_obj;
|
||||
mlx_obj.cq.in = cqs[conn_num];
|
||||
mlx_obj.cq.out = &cq_out;
|
||||
|
||||
mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
|
||||
gpu_qp->cq_log_size = log2(cq_out.cqe_cnt);
|
||||
gpu_qp->cq_size = cq_out.cqe_cnt;
|
||||
|
||||
void* gpu_ptr = nullptr;
|
||||
if (use_gpu_mem) {
|
||||
gpu_qp->current_cq_q = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
|
||||
} else {
|
||||
rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(cq_out.buf),
|
||||
cq_out.cqe_cnt * 64, &gpu_ptr, hip_dev_id);
|
||||
gpu_qp->current_cq_q = reinterpret_cast<mlx5_cqe64*>(gpu_ptr);
|
||||
}
|
||||
gpu_qp->current_cq_q_H = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
|
||||
|
||||
rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(cq_out.dbrec), 64,
|
||||
&gpu_ptr, hip_dev_id);
|
||||
|
||||
gpu_qp->dbrec_cq = reinterpret_cast<volatile uint32_t*>(gpu_ptr);
|
||||
|
||||
use_gpu_mem = sq_use_gpu_mem;
|
||||
|
||||
mlx5dv_qp qp_out;
|
||||
mlx_obj.qp.in = qps[conn_num];
|
||||
mlx_obj.qp.out = &qp_out;
|
||||
|
||||
mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP);
|
||||
|
||||
gpu_qp->max_nwqe = (qp_out.sq.wqe_cnt);
|
||||
|
||||
volatile uint32_t* dbrec_send = qp_out.dbrec + 1;
|
||||
|
||||
if (use_gpu_mem) {
|
||||
gpu_qp->current_sq = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
|
||||
gpu_qp->dbrec_send = reinterpret_cast<volatile uint32_t*>(dbrec_send);
|
||||
} else {
|
||||
gpu_ptr = nullptr;
|
||||
rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(qp_out.sq.buf),
|
||||
qp_out.sq.wqe_cnt * 64, &gpu_ptr,
|
||||
hip_dev_id);
|
||||
|
||||
gpu_qp->current_sq = reinterpret_cast<uint64_t*>(gpu_ptr);
|
||||
|
||||
rocm_memory_lock_to_fine_grain(
|
||||
reinterpret_cast<void*>(const_cast<uint32_t*>(dbrec_send)), 32,
|
||||
&gpu_ptr, hip_dev_id);
|
||||
|
||||
gpu_qp->dbrec_send = reinterpret_cast<volatile uint32_t*>(gpu_ptr);
|
||||
}
|
||||
|
||||
gpu_qp->current_sq_H = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
|
||||
|
||||
gpu_qp->setDBval(*(reinterpret_cast<uint64_t*>(qp_out.sq.buf)));
|
||||
|
||||
rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr,
|
||||
hip_dev_id);
|
||||
|
||||
gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
|
||||
|
||||
uint32_t* sq = reinterpret_cast<uint32_t*>(qp_out.sq.buf);
|
||||
uint32_t ctrl_qp_sq = (reinterpret_cast<uint32_t*>(sq))[1];
|
||||
gpu_qp->ctrl_qp_sq = ctrl_qp_sq & 0xFFFFFF;
|
||||
gpu_qp->ctrl_sig = (reinterpret_cast<uint64_t*>(sq))[1];
|
||||
gpu_qp->rkey = (reinterpret_cast<uint32_t*>(sq))[6 + key_offset];
|
||||
gpu_qp->lkey = (reinterpret_cast<uint32_t*>(sq))[9 + key_offset];
|
||||
}
|
||||
|
||||
ibv_qp* Connection::create_qp(ibv_pd* pd, ibv_context* context,
|
||||
ibv_qp_init_attr_ex* qp_attr, ibv_cq* cq) {
|
||||
use_gpu_mem = sq_use_gpu_mem;
|
||||
|
||||
ibv_qp* qp = nullptr;
|
||||
|
||||
assert(pd);
|
||||
assert(context);
|
||||
assert(qp_attr);
|
||||
|
||||
qp_attr->send_cq = cq;
|
||||
qp_attr->recv_cq = cq;
|
||||
qp_attr->pd = pd;
|
||||
|
||||
qp_attr->comp_mask = IBV_QP_INIT_ATTR_PD;
|
||||
|
||||
qp = create_qp_0(context, qp_attr);
|
||||
|
||||
if (!qp) {
|
||||
printf("***** error ibv_create_qp failed %d m %m \n", errno, errno);
|
||||
ibv_destroy_cq(cq);
|
||||
}
|
||||
|
||||
return qp;
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,259 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
|
||||
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
extern "C" {
|
||||
#include <infiniband/mlx5dv.h>
|
||||
}
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "connection_policy.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class GPUIBBackend;
|
||||
class QueuePair;
|
||||
|
||||
class Connection {
|
||||
protected:
|
||||
typedef struct ib_state {
|
||||
struct ibv_context* context;
|
||||
struct ibv_pd* pd;
|
||||
struct ibv_mr* mr;
|
||||
struct ibv_port_attr portinfo;
|
||||
} ib_state_t;
|
||||
|
||||
typedef struct dest_info {
|
||||
int lid;
|
||||
int qpn;
|
||||
int psn;
|
||||
union ibv_gid gid;
|
||||
} dest_info_t;
|
||||
|
||||
typedef struct heap_info {
|
||||
void* base_heap;
|
||||
uint32_t rkey;
|
||||
} heap_info_t;
|
||||
|
||||
struct sq_post_dv_t {
|
||||
uint64_t segments[16];
|
||||
uint32_t current_sq;
|
||||
uint16_t wqe_idx;
|
||||
};
|
||||
|
||||
class State {
|
||||
public:
|
||||
ibv_qp_attr exp_qp_attr{};
|
||||
uint64_t exp_attr_mask{};
|
||||
};
|
||||
|
||||
class InitQPState : public State {
|
||||
public:
|
||||
InitQPState() {
|
||||
exp_qp_attr.qp_state = IBV_QPS_INIT;
|
||||
exp_qp_attr.qp_access_flags =
|
||||
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
|
||||
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
|
||||
|
||||
exp_attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
|
||||
}
|
||||
};
|
||||
|
||||
class RtrState : public State {
|
||||
public:
|
||||
RtrState() {
|
||||
exp_qp_attr.qp_state = IBV_QPS_RTR;
|
||||
exp_qp_attr.path_mtu = IBV_MTU_4096;
|
||||
exp_qp_attr.ah_attr.sl = 1;
|
||||
exp_qp_attr.max_dest_rd_atomic = 1;
|
||||
exp_qp_attr.min_rnr_timer = 12;
|
||||
|
||||
exp_attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU;
|
||||
}
|
||||
};
|
||||
|
||||
class RtsState : public State {
|
||||
public:
|
||||
RtsState() {
|
||||
exp_qp_attr.qp_state = IBV_QPS_RTS;
|
||||
exp_qp_attr.timeout = 14;
|
||||
exp_qp_attr.retry_cnt = 7;
|
||||
exp_qp_attr.rnr_retry = 7;
|
||||
exp_qp_attr.max_rd_atomic = 1;
|
||||
|
||||
exp_attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
|
||||
IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC;
|
||||
}
|
||||
};
|
||||
|
||||
class QPInitAttr {
|
||||
public:
|
||||
explicit QPInitAttr(ibv_qp_cap cap) {
|
||||
attr.cap = cap;
|
||||
attr.sq_sig_all = 0;
|
||||
}
|
||||
ibv_qp_init_attr_ex attr{};
|
||||
};
|
||||
|
||||
public:
|
||||
Connection(GPUIBBackend* backend, int key_offset);
|
||||
|
||||
virtual ~Connection();
|
||||
|
||||
void initialize(int num_block);
|
||||
|
||||
void finalize();
|
||||
|
||||
virtual void post_wqes() = 0;
|
||||
|
||||
void reg_mr(void* ptr, size_t size, ibv_mr** mr, bool is_managed);
|
||||
|
||||
virtual void get_remote_conn(int* remote_conn) = 0;
|
||||
|
||||
unsigned total_number_connections();
|
||||
|
||||
virtual void initialize_rkey_handle(uint32_t** heap_rkey_handle,
|
||||
ibv_mr* mr) = 0;
|
||||
|
||||
virtual void free_rkey_handle(uint32_t* heap_rkey_handle) = 0;
|
||||
|
||||
void initialize_gpu_policy(ConnectionImpl** conn, uint32_t* heap_rkey);
|
||||
|
||||
/*
|
||||
* Populate a QueuePair for use on the GPU from the internal IB state.
|
||||
*/
|
||||
void init_gpu_qp_from_connection(QueuePair* qp, int conn_num);
|
||||
|
||||
protected:
|
||||
Connection() = default;
|
||||
|
||||
virtual InitQPState initqp(uint8_t port) = 0;
|
||||
|
||||
virtual RtrState rtr(dest_info_t* dest, uint8_t port) = 0;
|
||||
|
||||
virtual RtsState rts(dest_info_t* dest) = 0;
|
||||
|
||||
virtual QPInitAttr qpattr(ibv_qp_cap cap) = 0;
|
||||
|
||||
void init_qp_status(ibv_qp* qp, uint8_t port);
|
||||
|
||||
void change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port);
|
||||
|
||||
void change_status_rts(ibv_qp* qp, dest_info_t* dest);
|
||||
|
||||
void create_qps(uint8_t port, int my_rank, ibv_port_attr* ib_port_att);
|
||||
|
||||
template <typename T>
|
||||
void try_to_modify_qp(ibv_qp* qp, T state);
|
||||
|
||||
virtual void create_qps_1() = 0;
|
||||
|
||||
virtual void create_qps_2(int port, int my_rank,
|
||||
ibv_port_attr* ib_port_att) = 0;
|
||||
|
||||
virtual void create_qps_3(int port, ibv_qp* qp, int offset,
|
||||
ibv_port_attr* ib_port_att) = 0;
|
||||
|
||||
virtual ibv_qp* create_qp_0(ibv_context* context,
|
||||
ibv_qp_init_attr_ex* qp_attr) = 0;
|
||||
|
||||
virtual void allocate_dynamic_members(int num_block) = 0;
|
||||
|
||||
virtual void free_dynamic_members() = 0;
|
||||
|
||||
virtual void initialize_1(int port, int num_block) = 0;
|
||||
|
||||
virtual void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah,
|
||||
int dc_key) = 0;
|
||||
|
||||
virtual int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) = 0;
|
||||
|
||||
void set_sq_dv(int num_block, int wg_idx, int pe_idx);
|
||||
|
||||
/*
|
||||
* ibv interface functions must be static.
|
||||
*/
|
||||
static void* buf_alloc(ibv_pd* pd, void* pd_context, size_t size,
|
||||
size_t alignment, uint64_t resource_type);
|
||||
|
||||
static void buf_release(ibv_pd* pd, void* pd_context, void* ptr,
|
||||
uint64_t resource_type);
|
||||
|
||||
void init_parent_domain_attr(ibv_parent_domain_init_attr* attr);
|
||||
|
||||
void set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address, uint32_t rkey);
|
||||
|
||||
uint64_t* get_address_sq(int i);
|
||||
|
||||
ibv_cq* create_cq(ibv_context* context, ibv_pd* pd, int cqe);
|
||||
|
||||
ibv_qp* create_qp(ibv_pd* pd, ibv_context* context,
|
||||
ibv_qp_init_attr_ex* qp_attr, ibv_cq* rcq);
|
||||
|
||||
/*
|
||||
* TODO: Remove this eventually. Goal is to have backend delegate
|
||||
* connection stuff to this class, while this class knows nothing about
|
||||
* GPUs or backends.
|
||||
*/
|
||||
GPUIBBackend* backend{nullptr};
|
||||
|
||||
uint32_t sq_size{1024};
|
||||
|
||||
ib_state_t* ib_state{nullptr};
|
||||
|
||||
const int key_offset{0};
|
||||
|
||||
sq_post_dv_t* sq_post_dv{nullptr};
|
||||
|
||||
std::vector<ibv_cq*> cqs;
|
||||
|
||||
std::vector<ibv_qp*> qps;
|
||||
|
||||
uint64_t counter_wqe{0};
|
||||
|
||||
static int use_gpu_mem;
|
||||
|
||||
static int coherent_cq;
|
||||
|
||||
int cq_use_gpu_mem{1};
|
||||
|
||||
int sq_use_gpu_mem{1};
|
||||
|
||||
private:
|
||||
void init_shmem_handle();
|
||||
|
||||
void ib_init(ibv_device* ib_dev, uint8_t port);
|
||||
|
||||
char* requested_dev{nullptr};
|
||||
|
||||
ibv_device** dev_list{nullptr};
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
|
||||
@@ -1,81 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "connection_policy.hpp"
|
||||
|
||||
#include <infiniband/mlx5dv.h>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "dynamic_connection.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
#ifdef DEBUG
|
||||
#define HIP_ENABLE_PRINTF 1
|
||||
#endif
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
RCConnectionImpl::RCConnectionImpl([[maybe_unused]] Connection* conn,
|
||||
[[maybe_unused]] uint32_t* _vec_rkey) {}
|
||||
|
||||
DCConnectionImpl::DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey)
|
||||
: vec_dct_num(static_cast<DynamicConnection*>(conn)->get_vec_dct_num()),
|
||||
vec_rkey(_vec_rkey),
|
||||
vec_lids(static_cast<DynamicConnection*>(conn)->get_vec_lids()) {}
|
||||
|
||||
__device__ uint32_t RCConnectionImpl::getNumWqesImpl([
|
||||
[maybe_unused]] uint8_t opcode) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
__device__ uint32_t DCConnectionImpl::getNumWqesImpl(uint8_t opcode) {
|
||||
// FIXME: We assume all threads in wave are performing ATOMIC ops.
|
||||
// While this might be common, we do not have such restriction
|
||||
// so need to be fixed.
|
||||
// Since OFED 5.2, a DC segments uses 48bytes - so with or without
|
||||
// atomic we need 2 wqes.
|
||||
// return 2;
|
||||
return (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS)
|
||||
? 2
|
||||
: 1;
|
||||
}
|
||||
|
||||
__device__ bool RCConnectionImpl::updateConnectionSegmentImpl(
|
||||
[[maybe_unused]] ib_mlx5_base_av_t* wqe, [[maybe_unused]] int pe) {
|
||||
return false;
|
||||
}
|
||||
|
||||
__device__ bool DCConnectionImpl::updateConnectionSegmentImpl(
|
||||
ib_mlx5_base_av_t* wqe, int pe) {
|
||||
wqe->dqp_dct = vec_dct_num[pe];
|
||||
wqe->rlid = vec_lids[pe];
|
||||
return true;
|
||||
}
|
||||
|
||||
__device__ void RCConnectionImpl::setRkeyImpl([[maybe_unused]] uint32_t* rkey,
|
||||
[[maybe_unused]] int pe) {}
|
||||
|
||||
__device__ void DCConnectionImpl::setRkeyImpl(uint32_t* rkey, int pe) {
|
||||
*rkey = vec_rkey[pe];
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,130 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "infiniband_structs.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
/*
|
||||
* CRTP base class for connection type
|
||||
*/
|
||||
template <typename Derived>
|
||||
class ConnectionBase {
|
||||
public:
|
||||
/*
|
||||
* Control segment WQE offset imposed by this connection type.
|
||||
*/
|
||||
__device__ int wqeCntrlOffset() {
|
||||
return static_cast<Derived*>(this)->wqeCntrlOffsetImpl();
|
||||
}
|
||||
|
||||
/*
|
||||
* Whether or not we need to force PE-level divergence when posting for
|
||||
* this connection type.
|
||||
*/
|
||||
__device__ bool forcePostDivergence() {
|
||||
return static_cast<Derived*>(this)->forcePostDivergenceImpl();
|
||||
}
|
||||
|
||||
/*
|
||||
* Number of WQEs produced by this connection type for the given opcode.
|
||||
*/
|
||||
__device__ uint32_t getNumWqes(uint8_t opcode) {
|
||||
return static_cast<Derived*>(this)->getNumWqesImpl(opcode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Updates the connection-specific segment in the SQ.
|
||||
*/
|
||||
__device__ bool updateConnectionSegment(ib_mlx5_base_av_t* wqe, int pe) {
|
||||
return static_cast<Derived*>(this)->updateConnectionSegmentImpl(wqe, pe);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the rkey based on this connection type.
|
||||
*/
|
||||
__device__ void setRkey(uint32_t* rkey, int pe) {
|
||||
static_cast<Derived*>(this)->setRkeyImpl(rkey, pe);
|
||||
}
|
||||
};
|
||||
|
||||
class Connection;
|
||||
|
||||
/*
|
||||
* Connection policy corresponding to an RC connection type.
|
||||
*/
|
||||
class RCConnectionImpl : public ConnectionBase<RCConnectionImpl> {
|
||||
public:
|
||||
RCConnectionImpl(Connection* conn, uint32_t* _vec_rkey);
|
||||
|
||||
__device__ int wqeCntrlOffsetImpl() { return 0; }
|
||||
|
||||
__device__ bool forcePostDivergenceImpl() { return true; }
|
||||
|
||||
__device__ uint32_t getNumWqesImpl(uint8_t opcode);
|
||||
|
||||
__device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe);
|
||||
|
||||
__device__ void setRkeyImpl(uint32_t* rkey, int pe);
|
||||
};
|
||||
|
||||
/*
|
||||
* Connection policy corresponding to a DC connection type.
|
||||
*/
|
||||
class DCConnectionImpl : public ConnectionBase<DCConnectionImpl> {
|
||||
public:
|
||||
DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey);
|
||||
|
||||
__device__ int wqeCntrlOffsetImpl() { return 1; }
|
||||
|
||||
__device__ bool forcePostDivergenceImpl() { return false; }
|
||||
|
||||
__device__ uint32_t getNumWqesImpl(uint8_t opcode);
|
||||
|
||||
__device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe);
|
||||
|
||||
__device__ void setRkeyImpl(uint32_t* rkey, int pe);
|
||||
|
||||
private:
|
||||
uint32_t* vec_dct_num{nullptr};
|
||||
|
||||
uint32_t* vec_rkey{nullptr};
|
||||
|
||||
uint16_t* vec_lids{nullptr};
|
||||
};
|
||||
|
||||
/*
|
||||
* Select which one of our connection policies to use at compile time.
|
||||
*/
|
||||
#ifdef USE_DC
|
||||
typedef DCConnectionImpl ConnectionImpl;
|
||||
#else
|
||||
typedef RCConnectionImpl ConnectionImpl;
|
||||
#endif
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
|
||||
@@ -1,371 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "context_ib_device.hpp"
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "backend_ib.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
__host__ GPUIBContext::GPUIBContext(Backend *backend, bool option, int idx)
|
||||
: Context(backend, option) {
|
||||
GPUIBBackend *b{static_cast<GPUIBBackend *>(backend)};
|
||||
ctx_idx = idx;
|
||||
networkImpl = b->networkImpl;
|
||||
base_heap = b->heap.get_heap_bases().data();
|
||||
networkImpl.networkHostInit(this, idx);
|
||||
|
||||
barrier_sync = b->barrier_sync;
|
||||
ipcImpl_.ipc_bases = b->ipcImpl.ipc_bases;
|
||||
ipcImpl_.shm_size = b->ipcImpl.shm_size;
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::ctx_create() {
|
||||
/* Nothing to do in the GPU_IB backend */
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO(bpotter): these will go in a policy class based on DC/RC.
|
||||
* I am not completely sure at this point what else is needed in said class,
|
||||
* so just leave them up here for now.
|
||||
*/
|
||||
__device__ __host__ QueuePair *GPUIBContext::getQueuePair(int pe) {
|
||||
return networkImpl.getQueuePair(device_qp_proxy, pe);
|
||||
}
|
||||
|
||||
__device__ __host__ int GPUIBContext::getNumQueuePairs() {
|
||||
return networkImpl.getNumQueuePairs();
|
||||
}
|
||||
|
||||
__device__ __host__ int GPUIBContext::getNumDest() {
|
||||
return networkImpl.getNumDest();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::fence() {
|
||||
#ifdef USE_SINGLE_NODE
|
||||
threadfence_system();
|
||||
#else
|
||||
|
||||
for (int k = 0; k < getNumDest(); k++) {
|
||||
getQueuePair(k)->fence(k);
|
||||
}
|
||||
|
||||
fence_.flush();
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::fence(int pe) {
|
||||
#ifdef USE_SINGLE_NODE
|
||||
threadfence_system();
|
||||
#else
|
||||
getQueuePair(pe)->fence(pe);
|
||||
fence_.flush();
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::putmem_nbi(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
|
||||
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
const_cast<void *>(source), nelems);
|
||||
} else {
|
||||
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
|
||||
if (!must_send_message) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->put_nbi<THREAD>(base_heap[pe] + L_offset, source, nelems, pe, true);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::getmem_nbi(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
const char *src_typed = reinterpret_cast<const char *>(source);
|
||||
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
|
||||
} else {
|
||||
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
|
||||
if (!must_send_message) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->get_nbi<THREAD>(base_heap[pe] + L_offset, dest, nelems, pe, true);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::quiet() {
|
||||
#ifdef USE_SINGLE_NODE
|
||||
threadfence_system();
|
||||
for (int pe = 0; pe < ipcImpl_.shm_size; pe++) {
|
||||
if (pe != my_pe) {
|
||||
ipcImpl_.zero_byte_read(pe);
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
for (int k = 0; k < getNumDest(); k++) {
|
||||
getQueuePair(k)->quiet_single_heavy<THREAD>(k);
|
||||
}
|
||||
fence_.flush();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void *GPUIBContext::shmem_ptr(const void *dest, int pe) {
|
||||
void *ret = nullptr;
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
void *dst = const_cast<void *>(dest);
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ret = ipcImpl_.ipc_bases[local_pe] + L_offset;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::threadfence_system() {
|
||||
int thread_id = get_flat_block_id();
|
||||
|
||||
if (thread_id % WF_SIZE == lowerID()) {
|
||||
#ifdef USE_SINGLE_NODE
|
||||
// Flush current PE HDP
|
||||
HdpPolicy::hdp_flush(
|
||||
reinterpret_cast<unsigned int *>(networkImpl.hdp_address));
|
||||
|
||||
// Flush the rest of the HDPs
|
||||
for (int pe = 0; pe < ipcImpl_.shm_size; pe++) {
|
||||
auto target_address = networkImpl.hdp_address;
|
||||
const int value = HdpPolicy::HDP_FLUSH_VAL;
|
||||
if (pe != my_pe) {
|
||||
const int value = HdpPolicy::HDP_FLUSH_VAL;
|
||||
auto mapped_address =
|
||||
shmem_ptr(reinterpret_cast<void *>(target_address), pe);
|
||||
__hip_atomic_store(static_cast<int *>(mapped_address), value,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
}
|
||||
#else
|
||||
getQueuePair(my_pe)->hdp_policy->flushCoherency();
|
||||
#endif
|
||||
}
|
||||
|
||||
__threadfence_system();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::getmem(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
const char *src_typed = reinterpret_cast<const char *>(source);
|
||||
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
|
||||
} else {
|
||||
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
|
||||
if (!must_send_message) {
|
||||
return;
|
||||
}
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->get_nbi_cqe<THREAD>(base_heap[pe] + L_offset, dest, nelems, pe, true);
|
||||
qp->quiet_single<THREAD>();
|
||||
}
|
||||
fence_.flush();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::putmem(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
const_cast<void *>(source), nelems);
|
||||
|
||||
threadfence_system();
|
||||
ipcImpl_.zero_byte_read(pe);
|
||||
} else {
|
||||
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
|
||||
if (!must_send_message) {
|
||||
return;
|
||||
}
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->put_nbi_cqe<THREAD>(base_heap[pe] + L_offset, source, nelems, pe, true);
|
||||
qp->quiet_single<THREAD>();
|
||||
}
|
||||
fence_.flush();
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
************************ WORKGROUP/WAVE-LEVEL RMA API ************************
|
||||
*****************************************************************************/
|
||||
__device__ void GPUIBContext::putmem_nbi_wg(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
const_cast<void *>(source), nelems);
|
||||
} else {
|
||||
if (is_thread_zero_in_block()) {
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->put_nbi<WG>(base_heap[pe] + L_offset, source, nelems, pe, true);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::putmem_nbi_wave(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
const_cast<void *>(source), nelems);
|
||||
} else {
|
||||
if (is_thread_zero_in_wave()) {
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->put_nbi<WAVE>(base_heap[pe] + L_offset, source, nelems, pe, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::putmem_wg(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
const_cast<void *>(source), nelems);
|
||||
__syncthreads();
|
||||
threadfence_system();
|
||||
ipcImpl_.zero_byte_read(pe);
|
||||
} else {
|
||||
auto *qp = getQueuePair(pe);
|
||||
if (is_thread_zero_in_block()) {
|
||||
qp->put_nbi_cqe<WG>(base_heap[pe] + L_offset, source, nelems, pe, true);
|
||||
}
|
||||
qp->quiet_single<WG>();
|
||||
}
|
||||
__syncthreads();
|
||||
fence_.flush();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::putmem_wave(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
|
||||
auto *qp = getQueuePair(pe);
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
const_cast<void *>(source), nelems);
|
||||
threadfence_system();
|
||||
ipcImpl_.zero_byte_read(pe);
|
||||
} else {
|
||||
if (is_thread_zero_in_wave()) {
|
||||
qp->put_nbi_cqe<WAVE>(base_heap[pe] + L_offset, source, nelems, pe, true);
|
||||
}
|
||||
qp->quiet_single<WAVE>();
|
||||
}
|
||||
fence_.flush();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::getmem_wg(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
const char *src_typed = reinterpret_cast<const char *>(source);
|
||||
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
|
||||
auto *qp = getQueuePair(pe);
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
|
||||
} else {
|
||||
if (is_thread_zero_in_block()) {
|
||||
qp->get_nbi_cqe<WG>(base_heap[pe] + L_offset, dest, nelems, pe, true);
|
||||
}
|
||||
qp->quiet_single<WG>();
|
||||
}
|
||||
__syncthreads();
|
||||
fence_.flush();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::getmem_wave(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
const char *src_typed = reinterpret_cast<const char *>(source);
|
||||
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
|
||||
auto *qp = getQueuePair(pe);
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
nelems);
|
||||
} else {
|
||||
if (is_thread_zero_in_wave()) {
|
||||
qp->get_nbi_cqe<WAVE>(base_heap[pe] + L_offset, dest, nelems, pe, true);
|
||||
}
|
||||
qp->quiet_single<WAVE>();
|
||||
}
|
||||
fence_.flush();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::getmem_nbi_wg(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
const char *src_typed = reinterpret_cast<const char *>(source);
|
||||
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
|
||||
} else {
|
||||
if (is_thread_zero_in_block()) {
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->get_nbi<WG>(base_heap[pe] + L_offset, dest, nelems, pe, true);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::getmem_nbi_wave(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
const char *src_typed = reinterpret_cast<const char *>(source);
|
||||
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
|
||||
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
|
||||
int local_pe = pe % ipcImpl_.shm_size;
|
||||
ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset,
|
||||
nelems);
|
||||
} else {
|
||||
if (is_thread_zero_in_wave()) {
|
||||
auto *qp = getQueuePair(pe);
|
||||
qp->get_nbi<WAVE>(base_heap[pe] + L_offset, dest, nelems, pe, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,304 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
|
||||
|
||||
#include "../context.hpp"
|
||||
#include "memory_builder_policy.hpp"
|
||||
#include "network_policy.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class QueuePair;
|
||||
|
||||
class GPUIBContext : public Context {
|
||||
public:
|
||||
__host__ GPUIBContext(Backend *b, bool option, int idx);
|
||||
|
||||
__device__ __host__ QueuePair *getQueuePair(int pe);
|
||||
|
||||
__device__ __host__ int getNumQueuePairs();
|
||||
|
||||
__device__ __host__ int getNumDest();
|
||||
|
||||
__device__ __attribute__((noinline)) void threadfence_system();
|
||||
|
||||
__device__ void ctx_create();
|
||||
|
||||
__device__ void ctx_destroy();
|
||||
|
||||
__device__ void putmem(void *dest, const void *source, size_t nelems, int pe);
|
||||
|
||||
__device__ void getmem(void *dest, const void *source, size_t nelems, int pe);
|
||||
|
||||
__device__ void putmem_nbi(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__device__ void getmem_nbi(void *dest, const void *source, size_t size,
|
||||
int pe);
|
||||
|
||||
__device__ void fence();
|
||||
|
||||
__device__ void fence(int pe);
|
||||
|
||||
__device__ void quiet();
|
||||
|
||||
__device__ void *shmem_ptr(const void *dest, int pe);
|
||||
|
||||
__device__ void barrier_all();
|
||||
|
||||
__device__ void sync_all();
|
||||
|
||||
__device__ void sync(rocshmem_team_t team);
|
||||
|
||||
template <typename T>
|
||||
__device__ void amo_add(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void amo_set(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T amo_swap(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T amo_fetch_and(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void amo_and(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T amo_fetch_or(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void amo_or(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T amo_fetch_xor(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void amo_xor(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void amo_cas(void *dst, T value, T cond, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T amo_fetch_add(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void p(T *dest, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T g(const T *source, int pe);
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void to_all(T *dest, const T *source, int nreduce, int PE_start,
|
||||
int logPE_stride, int PE_size, T *pWrk,
|
||||
long *pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void to_all(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce);
|
||||
|
||||
template <typename T>
|
||||
__device__ void put(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void get(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T>
|
||||
__device__ void broadcast(T *dest, const T *source, int nelems, int pe_root,
|
||||
int pe_start, int log_pe_stride, int pe_size,
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_brucks(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void alltoall_gcen2(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_brucks(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems);
|
||||
|
||||
template <typename T>
|
||||
__device__ void fcollect_gcen2(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems);
|
||||
|
||||
__device__ void putmem_wg(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__device__ void getmem_wg(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__device__ void putmem_nbi_wg(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__device__ void getmem_nbi_wg(void *dest, const void *source, size_t size,
|
||||
int pe);
|
||||
|
||||
__device__ void putmem_wave(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__device__ void getmem_wave(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__device__ void putmem_nbi_wave(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__device__ void getmem_nbi_wave(void *dest, const void *source, size_t size,
|
||||
int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void put_wg(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void put_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void get_wg(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void get_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void put_wave(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void put_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void get_wave(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
private:
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void internal_direct_allreduce(
|
||||
T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
|
||||
int PE_size, T *pWrk,
|
||||
long *pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void internal_ring_allreduce(T *dst, const T *src, int nelems,
|
||||
int PE_start, int logPE_stride,
|
||||
int PE_size, T *pWrk,
|
||||
long *pSync, // NOLINT(runtime/int)
|
||||
int n_seg, int seg_size,
|
||||
int chunk_size);
|
||||
|
||||
template <typename T>
|
||||
__device__ void internal_put_broadcast(T *dst, const T *src, int nelems,
|
||||
int pe_root, int PE_start,
|
||||
int logPE_stride, int PE_size,
|
||||
long *pSync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__device__ void internal_get_broadcast(T *dst, const T *src, int nelems,
|
||||
int pe_root,
|
||||
long *pSync); // NOLINT(runtime/int)
|
||||
|
||||
__device__ void internal_direct_barrier(int pe, int PE_start, int stride,
|
||||
int n_pes, int64_t *pSync);
|
||||
|
||||
__device__ void internal_atomic_barrier(int pe, int PE_start, int stride,
|
||||
int n_pes, int64_t *pSync);
|
||||
|
||||
__device__ void internal_sync(int pe, int PE_start, int stride, int PE_size,
|
||||
int64_t *pSync);
|
||||
|
||||
__device__ void quiet_single(int cq_num);
|
||||
|
||||
public:
|
||||
/*
|
||||
* Collection of queue pairs that are currently checked out by this
|
||||
* context from GPUIBBackend.
|
||||
*/
|
||||
// FIXME: keep it private and destroy in destructor for better
|
||||
// encapsulation.
|
||||
QueuePair *device_qp_proxy{nullptr};
|
||||
|
||||
/*
|
||||
* Array of char * pointers corresponding to the heap base pointers VA for
|
||||
* each PE that we can communicate with.
|
||||
*/
|
||||
char *const *base_heap{nullptr};
|
||||
|
||||
/*
|
||||
* Buffer used to store the results of a *_g operation. These ops do not
|
||||
* provide a destination buffer, so the runtime must manage one.
|
||||
*/
|
||||
char *g_ret{nullptr};
|
||||
|
||||
NetworkImpl networkImpl{};
|
||||
|
||||
/*
|
||||
* Temporary scratchpad memory used by internal barrier algorithms.
|
||||
*/
|
||||
int64_t *barrier_sync{nullptr};
|
||||
|
||||
int ctx_idx;
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
|
||||
@@ -1,122 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "context_ib_tmpl_device.hpp"
|
||||
#include "../util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
__device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start,
|
||||
int stride, int n_pes,
|
||||
int64_t *pSync) {
|
||||
int64_t flag_val = 1;
|
||||
if (pe == PE_start) {
|
||||
// Go through all PE offsets (except current offset = 0)
|
||||
// and wait until they all reach
|
||||
for (size_t i = 1; i < n_pes; i++) {
|
||||
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
threadfence_system();
|
||||
// Announce to other PEs that all have reached
|
||||
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
put_nbi(&pSync[0], &flag_val, 1, j);
|
||||
}
|
||||
|
||||
} else {
|
||||
// Mark current PE offset as reached
|
||||
size_t pe_offset = (pe - PE_start) / stride;
|
||||
put_nbi(&pSync[pe_offset], &flag_val, 1, PE_start);
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::internal_atomic_barrier(int pe, int PE_start,
|
||||
int stride, int n_pes,
|
||||
int64_t *pSync) {
|
||||
int64_t flag_val = 1;
|
||||
if (pe == PE_start) {
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
put_nbi(&pSync[0], &flag_val, 1, j);
|
||||
}
|
||||
} else {
|
||||
amo_add<int64_t>(&pSync[0], flag_val, PE_start);
|
||||
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
|
||||
// Uses PE values that are relative to world
|
||||
__device__ void GPUIBContext::internal_sync(int pe, int PE_start, int stride,
|
||||
int PE_size, int64_t *pSync) {
|
||||
__syncthreads();
|
||||
if (is_thread_zero_in_block()) {
|
||||
if (PE_size < 64) {
|
||||
internal_direct_barrier(pe, PE_start, stride, PE_size, pSync);
|
||||
} else {
|
||||
internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync);
|
||||
}
|
||||
}
|
||||
__threadfence();
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::sync(rocshmem_team_t team) {
|
||||
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
|
||||
|
||||
double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
|
||||
int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
|
||||
/**
|
||||
* Ensure that the stride is a multiple of 2 for GPU_IB.
|
||||
* TODO: enable GPU_IB to work with non-powers-of-2 strides
|
||||
* and remove this assert.
|
||||
*/
|
||||
assert((dbl_log_pe_stride - log_pe_stride) == 0);
|
||||
|
||||
int pe = team_obj->my_pe_in_world;
|
||||
int pe_start = team_obj->tinfo_wrt_world->pe_start;
|
||||
int pe_stride = (1 << log_pe_stride);
|
||||
int pe_size = team_obj->num_pes;
|
||||
internal_sync(pe, pe_start, pe_stride, pe_size, barrier_sync);
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::sync_all() {
|
||||
internal_sync(my_pe, 0, 1, num_pes, barrier_sync);
|
||||
}
|
||||
|
||||
__device__ void GPUIBContext::barrier_all() {
|
||||
if (is_thread_zero_in_block()) {
|
||||
quiet();
|
||||
}
|
||||
sync_all();
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,85 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "context_ib_host.hpp"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../backend_type.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "backend_ib.hpp"
|
||||
#include "../host/host.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
__host__ GPUIBHostContext::GPUIBHostContext(Backend *backend,
|
||||
[[maybe_unused]] int64_t options)
|
||||
: Context(backend, true) {
|
||||
GPUIBBackend *b{static_cast<GPUIBBackend *>(backend)};
|
||||
|
||||
host_interface = b->host_interface;
|
||||
|
||||
context_window_info = host_interface->acquire_window_context();
|
||||
}
|
||||
|
||||
__host__ GPUIBHostContext::~GPUIBHostContext() {
|
||||
host_interface->release_window_context(context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::putmem_nbi(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::getmem_nbi(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::putmem(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
host_interface->putmem(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::getmem(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
host_interface->getmem(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::fence() {
|
||||
host_interface->fence(context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::quiet() {
|
||||
host_interface->quiet(context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::sync_all() {
|
||||
host_interface->sync_all(context_window_info);
|
||||
}
|
||||
|
||||
__host__ void GPUIBHostContext::barrier_all() {
|
||||
host_interface->barrier_all(context_window_info);
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,149 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
|
||||
|
||||
#include "../context.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class GPUIBHostContext : public Context {
|
||||
public:
|
||||
__host__ GPUIBHostContext(Backend *b, int64_t options);
|
||||
|
||||
__host__ ~GPUIBHostContext();
|
||||
|
||||
template <typename T>
|
||||
__host__ void p(T *dest, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ T g(const T *source, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ void put(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ void get(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
|
||||
|
||||
__host__ void putmem(void *dest, const void *source, size_t nelems, int pe);
|
||||
|
||||
__host__ void getmem(void *dest, const void *source, size_t nelems, int pe);
|
||||
|
||||
__host__ void putmem_nbi(void *dest, const void *source, size_t nelems,
|
||||
int pe);
|
||||
|
||||
__host__ void getmem_nbi(void *dest, const void *source, size_t size, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ void amo_add(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ void amo_cas(void *dst, T value, T cond, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ T amo_fetch_add(void *dst, T value, int pe);
|
||||
|
||||
template <typename T>
|
||||
__host__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
|
||||
|
||||
__host__ void fence();
|
||||
|
||||
__host__ void quiet();
|
||||
|
||||
__host__ void barrier_all();
|
||||
|
||||
__host__ void sync_all();
|
||||
|
||||
template <typename T>
|
||||
__host__ void broadcast(T *dest, const T *source, int nelems, int pe_root,
|
||||
int pe_start, int log_pe_stride, int pe_size,
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T>
|
||||
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nelems, int pe_root);
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
|
||||
int log_pe_stride, int pe_size, T *p_wrk,
|
||||
long *p_sync); // NOLINT(runtime/int)
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void to_all(rocshmem_team_t team, T *dest, const T *source,
|
||||
int nreduce);
|
||||
|
||||
template <typename T>
|
||||
__host__ void wait_until(T *ivars, int cmp, T val);
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t wait_until_any(T *ivars, size_t nelems,
|
||||
const int *status,
|
||||
int cmp, T val);
|
||||
|
||||
template <typename T>
|
||||
__host__ void wait_until_all(T *ivars, size_t nelems,
|
||||
const int *status,
|
||||
int cmp, T val);
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t wait_until_some(T *ivars, size_t nelems,
|
||||
size_t* indices,
|
||||
const int *status,
|
||||
int cmp, T val);
|
||||
|
||||
template <typename T>
|
||||
__host__ void wait_until_all_vector(T *ivars, size_t nelems,
|
||||
const int *status,
|
||||
int cmp, T* vals);
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t wait_until_any_vector(T *ivars, size_t nelems,
|
||||
const int *status,
|
||||
int cmp, T* vals);
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t wait_until_some_vector(T *ivars, size_t nelems,
|
||||
size_t* indices,
|
||||
const int *status,
|
||||
int cmp, T* vals);
|
||||
|
||||
template <typename T>
|
||||
__host__ int test(T *ivars, int cmp, T val);
|
||||
|
||||
public:
|
||||
/* Pointer to the backend's host interface */
|
||||
HostInterface *host_interface{nullptr};
|
||||
|
||||
/* An MPI Window implements a context */
|
||||
WindowInfo *context_window_info{nullptr};
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,173 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../host/host_templates.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::p(T *dest, T value, int pe) {
|
||||
host_interface->p<T>(dest, value, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ T GPUIBHostContext::g(const T *source, int pe) {
|
||||
return host_interface->g<T>(source, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::put(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
host_interface->put<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::get(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
host_interface->get<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::put_nbi(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::get_nbi(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::amo_add(void *dst, T value, int pe) {
|
||||
host_interface->amo_add(dst, value, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::amo_cas(void *dst, T value, T cond, int pe) {
|
||||
host_interface->amo_cas(dst, value, cond, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ T GPUIBHostContext::amo_fetch_add(void *dst, T value, int pe) {
|
||||
return host_interface->amo_fetch_add(dst, value, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ T GPUIBHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
|
||||
return host_interface->amo_fetch_cas(dst, value, cond, pe,
|
||||
context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::broadcast(
|
||||
T *dest, const T *source, int nelems, int pe_root, int pe_start,
|
||||
int log_pe_stride, int pe_size,
|
||||
long *p_sync) { // NOLINT(runtime/int)
|
||||
host_interface->broadcast<T>(dest, source, nelems, pe_root, pe_start,
|
||||
log_pe_stride, pe_size, p_sync);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::broadcast(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelems,
|
||||
int pe_root) {
|
||||
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
|
||||
}
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce,
|
||||
int pe_start, int log_pe_stride,
|
||||
int pe_size, T *p_wrk,
|
||||
long *p_sync) { // NOLINT(runtime/int)
|
||||
host_interface->to_all<T, Op>(dest, source, nreduce, pe_start, log_pe_stride,
|
||||
pe_size, p_wrk, p_sync);
|
||||
}
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__host__ void GPUIBHostContext::to_all(rocshmem_team_t team, T *dest,
|
||||
const T *source, int nreduce) {
|
||||
host_interface->to_all<T, Op>(team, dest, source, nreduce);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::wait_until(T *ivars, int cmp, T val) {
|
||||
host_interface->wait_until<T>(ivars, cmp, val, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::wait_until_all(T *ivars, size_t nelems,
|
||||
const int* status,
|
||||
int cmp, T val) {
|
||||
host_interface->wait_until_all<T>(ivars, nelems, status, cmp, val, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t GPUIBHostContext::wait_until_any(T *ivars, size_t nelems,
|
||||
const int* status,
|
||||
int cmp, T val) {
|
||||
return host_interface->wait_until_any<T>(ivars, nelems, status, cmp, val, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t GPUIBHostContext::wait_until_some(T *ivars, size_t nelems,
|
||||
size_t* indices,
|
||||
const int* status,
|
||||
int cmp, T val) {
|
||||
return host_interface->wait_until_some<T>(ivars, nelems, indices, status, cmp, val, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void GPUIBHostContext::wait_until_all_vector(T *ivars, size_t nelems,
|
||||
const int* status,
|
||||
int cmp, T* vals) {
|
||||
host_interface->wait_until_all_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t GPUIBHostContext::wait_until_any_vector(T *ivars, size_t nelems,
|
||||
const int* status,
|
||||
int cmp, T* vals) {
|
||||
return host_interface->wait_until_any_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ size_t GPUIBHostContext::wait_until_some_vector(T *ivars, size_t nelems,
|
||||
size_t* indices,
|
||||
const int* status,
|
||||
int cmp, T* vals) {
|
||||
return host_interface->wait_until_some_vector<T>(ivars, nelems, indices, status, cmp, vals, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ int GPUIBHostContext::test(T *ivars, int cmp, T val) {
|
||||
return host_interface->test<T>(ivars, cmp, val, context_window_info);
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
|
||||
@@ -1,39 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "rocshmem/rocshmem_debug.hpp"
|
||||
|
||||
#include "qe_dumper.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
void debug_print_cq(int dest_pe, int src_wg, int cqe_index) {
|
||||
QeDumper dumper(dest_pe, src_wg, cqe_index);
|
||||
dumper.dump_cq();
|
||||
}
|
||||
|
||||
void debug_print_sq(int dest_pe, int src_wg, int wqe_index) {
|
||||
QeDumper dumper(dest_pe, src_wg, wqe_index);
|
||||
dumper.dump_sq();
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,381 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "dynamic_connection.hpp"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "backend_ib.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
DynamicConnection::DynamicConnection(GPUIBBackend* b) : Connection(b, 4) {
|
||||
char* value = nullptr;
|
||||
|
||||
if ((value = getenv("ROCSHMEM_NUM_DCIs"))) {
|
||||
num_dcis = atoi(value);
|
||||
}
|
||||
|
||||
if ((value = getenv("ROCSHMEM_NUM_DCT"))) {
|
||||
num_dct = atoi(value);
|
||||
}
|
||||
}
|
||||
|
||||
DynamicConnection::~DynamicConnection() {
|
||||
CHECK_HIP(hipFree(vec_lids));
|
||||
CHECK_HIP(hipFree(vec_dct_num));
|
||||
}
|
||||
|
||||
ibv_qp_init_attr_ex DynamicConnection::dct_qp_init_attr(
|
||||
ibv_cq* cq, ibv_srq* srq, [[maybe_unused]] uint8_t port) const {
|
||||
ibv_qp_init_attr_ex attr{};
|
||||
|
||||
attr.comp_mask = IBV_QP_INIT_ATTR_PD;
|
||||
attr.pd = ib_state->pd;
|
||||
attr.recv_cq = cq;
|
||||
attr.send_cq = cq;
|
||||
attr.srq = srq;
|
||||
attr.qp_type = IBV_QPT_DRIVER;
|
||||
|
||||
return attr;
|
||||
}
|
||||
|
||||
mlx5dv_qp_init_attr DynamicConnection::dct_dv_init_attr() {
|
||||
mlx5dv_qp_init_attr dv_attr{};
|
||||
dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC;
|
||||
dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCT;
|
||||
dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY;
|
||||
|
||||
return dv_attr;
|
||||
}
|
||||
|
||||
Connection::InitQPState DynamicConnection::initqp(uint8_t port) {
|
||||
InitQPState initqp{};
|
||||
|
||||
initqp.exp_qp_attr.port_num = port;
|
||||
initqp.exp_qp_attr.pkey_index = 0;
|
||||
initqp.exp_qp_attr.qp_access_flags = 0;
|
||||
|
||||
return initqp;
|
||||
}
|
||||
|
||||
Connection::RtrState DynamicConnection::rtr([[maybe_unused]] dest_info_t* dest,
|
||||
uint8_t port) {
|
||||
RtrState rtr{};
|
||||
|
||||
rtr.exp_qp_attr.ah_attr.is_global = 1;
|
||||
rtr.exp_qp_attr.ah_attr.port_num = port;
|
||||
|
||||
rtr.exp_qp_attr.max_dest_rd_atomic = 0;
|
||||
rtr.exp_qp_attr.min_rnr_timer = 0;
|
||||
|
||||
return rtr;
|
||||
}
|
||||
|
||||
Connection::RtsState DynamicConnection::rts([
|
||||
[maybe_unused]] dest_info_t* dest) {
|
||||
RtsState rts{};
|
||||
rts.exp_attr_mask |= IBV_QP_SQ_PSN;
|
||||
return rts;
|
||||
}
|
||||
|
||||
void DynamicConnection::connect_dci(ibv_qp* qp, uint8_t port) {
|
||||
init_qp_status(qp, port);
|
||||
change_status_rtr(qp, nullptr, port);
|
||||
change_status_rts(qp, nullptr);
|
||||
}
|
||||
|
||||
/*
|
||||
* create a DCT and get is to ready state
|
||||
*/
|
||||
void DynamicConnection::create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq,
|
||||
uint8_t port) {
|
||||
auto init_attr = dct_qp_init_attr(cq, srq, port);
|
||||
auto dv_attr = dct_dv_init_attr();
|
||||
auto dct = mlx5dv_create_qp(ib_state->context, &init_attr, &dv_attr);
|
||||
|
||||
if (dct == nullptr) {
|
||||
printf("Failed to create dct \n");
|
||||
abort();
|
||||
}
|
||||
|
||||
ibv_qp_attr qp_attr{};
|
||||
qp_attr.qp_state = IBV_QPS_INIT;
|
||||
qp_attr.port_num = port;
|
||||
qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
|
||||
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
|
||||
|
||||
int attr_mask =
|
||||
IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
|
||||
|
||||
int ret = ibv_modify_qp(dct, &qp_attr, attr_mask);
|
||||
if (ret) {
|
||||
abort();
|
||||
}
|
||||
|
||||
qp_attr.qp_state = IBV_QPS_RTR;
|
||||
qp_attr.path_mtu = IBV_MTU_4096;
|
||||
qp_attr.min_rnr_timer = 7;
|
||||
qp_attr.ah_attr.is_global = 1;
|
||||
qp_attr.ah_attr.grh.hop_limit = 1;
|
||||
qp_attr.ah_attr.grh.traffic_class = 0;
|
||||
qp_attr.ah_attr.grh.sgid_index = 0;
|
||||
qp_attr.ah_attr.port_num = port;
|
||||
|
||||
attr_mask = IBV_QP_STATE | IBV_QP_MIN_RNR_TIMER | IBV_QP_AV | IBV_QP_PATH_MTU;
|
||||
|
||||
ret = ibv_modify_qp(dct, &qp_attr, attr_mask);
|
||||
if (ret) {
|
||||
abort();
|
||||
}
|
||||
|
||||
*dct_num = dct->qp_num;
|
||||
}
|
||||
|
||||
/*
|
||||
* @brief create a qp (DCI qp) using DEVX
|
||||
*/
|
||||
ibv_qp* DynamicConnection::create_qp_0(ibv_context* context,
|
||||
ibv_qp_init_attr_ex* qp_attr) {
|
||||
ibv_qp* qp;
|
||||
qp_attr->qp_type = IBV_QPT_DRIVER;
|
||||
|
||||
mlx5dv_qp_init_attr dv_attr{};
|
||||
dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC;
|
||||
dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI;
|
||||
dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY;
|
||||
|
||||
qp = mlx5dv_create_qp(context, qp_attr, &dv_attr);
|
||||
|
||||
return qp;
|
||||
}
|
||||
|
||||
void DynamicConnection::create_qps_1() {
|
||||
ibv_srq_init_attr srq_init_attr{};
|
||||
srq_init_attr.attr.max_wr = 1;
|
||||
srq_init_attr.attr.max_sge = 1;
|
||||
|
||||
srq = ibv_create_srq(ib_state->pd, &srq_init_attr);
|
||||
if (!srq) {
|
||||
abort();
|
||||
}
|
||||
|
||||
dct_cq = ibv_create_cq(ib_state->context, 100, nullptr, nullptr, 0);
|
||||
if (!dct_cq) {
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicConnection::create_qps_2(int port, int my_rank,
|
||||
ibv_port_attr* ib_port_att) {
|
||||
for (int i = 0; i < num_dct; i++) {
|
||||
int32_t dct_num;
|
||||
create_dct(&dct_num, dct_cq, srq, port);
|
||||
dct_num = htobe32(dct_num);
|
||||
dcts_num[my_rank * num_dct + i] = dct_num;
|
||||
}
|
||||
lids[my_rank] = htobe16(ib_port_att->lid);
|
||||
}
|
||||
|
||||
void DynamicConnection::create_qps_3(
|
||||
int port, ibv_qp* qp, [[maybe_unused]] int offset,
|
||||
[[maybe_unused]] ibv_port_attr* ib_port_att) {
|
||||
return connect_dci(qp, port);
|
||||
}
|
||||
|
||||
void DynamicConnection::get_remote_conn(int* remote_conn) {
|
||||
*remote_conn = num_dcis;
|
||||
}
|
||||
|
||||
void DynamicConnection::allocate_dynamic_members([
|
||||
[maybe_unused]] int num_wg) {
|
||||
size_t num_pes_size_bytes = sizeof(uint16_t) * backend->num_pes;
|
||||
lids = reinterpret_cast<uint16_t*>(malloc(num_pes_size_bytes));
|
||||
if (lids == nullptr) {
|
||||
abort();
|
||||
}
|
||||
|
||||
size_t num_dcts = num_dct * backend->num_pes;
|
||||
size_t num_dcts_size_bytes = sizeof(uint32_t) * num_dcts;
|
||||
dcts_num = reinterpret_cast<uint32_t*>(malloc(num_dcts_size_bytes));
|
||||
if (dcts_num == nullptr) {
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get the wqe_av information from tyhe ibv_ah
|
||||
* rely on DEVX to extract the AV. We use the AV to create
|
||||
* the DC segment
|
||||
*/
|
||||
void DynamicConnection::dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av) {
|
||||
mlx5dv_obj dv;
|
||||
mlx5dv_ah dah;
|
||||
|
||||
dv.ah.in = ah;
|
||||
dv.ah.out = &dah;
|
||||
mlx5dv_init_obj(&dv, MLX5DV_OBJ_AH);
|
||||
|
||||
memcpy(mlx5_av, dah.av, sizeof(mlx5_wqe_av));
|
||||
}
|
||||
|
||||
void DynamicConnection::free_dynamic_members() {
|
||||
free(lids);
|
||||
free(dcts_num);
|
||||
}
|
||||
|
||||
void DynamicConnection::initialize_1(int port, [[maybe_unused]] int num_wg) {
|
||||
MPI_Allgather(MPI_IN_PLACE, sizeof(int32_t) * num_dct, MPI_CHAR, dcts_num,
|
||||
sizeof(int32_t) * num_dct, MPI_CHAR, backend->thread_comm);
|
||||
|
||||
MPI_Allgather(MPI_IN_PLACE, sizeof(int16_t), MPI_CHAR, lids, sizeof(int16_t),
|
||||
MPI_CHAR, backend->thread_comm);
|
||||
|
||||
hipStream_t stream;
|
||||
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&vec_dct_num),
|
||||
sizeof(int32_t) * num_dct * backend->num_pes));
|
||||
|
||||
CHECK_HIP(hipMemcpyAsync(vec_dct_num, dcts_num,
|
||||
sizeof(int32_t) * num_dct * backend->num_pes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&vec_lids),
|
||||
sizeof(int16_t) * backend->num_pes));
|
||||
|
||||
CHECK_HIP(hipMemcpyAsync(vec_lids, lids, sizeof(int16_t) * backend->num_pes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
struct ibv_ah_attr ah_attr;
|
||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||
|
||||
ah_attr.is_global = 1;
|
||||
ah_attr.dlid = ib_state->portinfo.lid;
|
||||
ah_attr.sl = 1;
|
||||
ah_attr.src_path_bits = 0;
|
||||
ah_attr.port_num = port;
|
||||
|
||||
ah = ibv_create_ah(ib_state->pd, &ah_attr);
|
||||
if (ah == nullptr) {
|
||||
abort();
|
||||
}
|
||||
|
||||
dc_get_av(ah, &mlx5_av);
|
||||
|
||||
CHECK_HIP(hipStreamSynchronize(stream));
|
||||
CHECK_HIP(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void DynamicConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle,
|
||||
ibv_mr* mr) {
|
||||
CHECK_HIP(hipMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes));
|
||||
(*heap_rkey_handle)[backend->my_pe] = htobe32(mr->rkey);
|
||||
}
|
||||
|
||||
void DynamicConnection::free_rkey_handle(uint32_t* heap_rkey_handle) {
|
||||
CHECK_HIP(hipFree(heap_rkey_handle));
|
||||
}
|
||||
|
||||
Connection::QPInitAttr DynamicConnection::qpattr(ibv_qp_cap cap) {
|
||||
QPInitAttr qpattr(cap);
|
||||
return qpattr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create and write the DC segment to SQ.
|
||||
* We get all the info needed from the mlx5_wqe_av that we extract from ibv_ah.
|
||||
*/
|
||||
void DynamicConnection::set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg,
|
||||
uint64_t dc_key, uint32_t dct_num,
|
||||
uint8_t ext, mlx5_wqe_av* mlx5_av) {
|
||||
dc_seg->av.key.dc_key = htobe64(dc_key);
|
||||
dc_seg->av.dqp_dct = htobe32(((uint32_t)ext << 31 | dct_num));
|
||||
dc_seg->av.stat_rate_sl = mlx5_av->stat_rate_sl;
|
||||
dc_seg->av.fl_mlid = mlx5_av->fl_mlid;
|
||||
dc_seg->av.rlid = mlx5_av->rlid;
|
||||
}
|
||||
|
||||
/*
|
||||
* create a DC wqe and post it to the SQ
|
||||
* we rely on mlx5dv functions to ceate the ctrl and data
|
||||
* segments but we use our own function to write teh DC and rdma segments
|
||||
*/
|
||||
void DynamicConnection::post_dv_dc_wqe(int remote_conn) {
|
||||
mlx5_wqe_ctrl_seg* ctrl;
|
||||
mlx5_wqe_datagram_seg* dc_seg;
|
||||
mlx5_wqe_raddr_seg* rdma;
|
||||
mlx5_wqe_data_seg* data;
|
||||
|
||||
for (int i = 0; i < remote_conn; i++) {
|
||||
uint64_t* ptr = get_address_sq(i);
|
||||
|
||||
const uint32_t nb_post = 4 * sq_size;
|
||||
for (uint16_t index = 0; index < nb_post; index++) {
|
||||
uint8_t op_mod = 0;
|
||||
uint8_t op_code = 8;
|
||||
uint32_t qp_num = qps[i]->qp_num;
|
||||
uint8_t fm_ce_se = 0;
|
||||
uint8_t ds = 4;
|
||||
ctrl = reinterpret_cast<mlx5_wqe_ctrl_seg*>(ptr);
|
||||
mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds, 0,
|
||||
0);
|
||||
ptr = ptr + 2;
|
||||
|
||||
uint32_t dct_num = dcts_num[i];
|
||||
uint8_t ext = 1;
|
||||
dc_seg = reinterpret_cast<mlx5_wqe_datagram_seg*>(ptr);
|
||||
set_dgram_seg(dc_seg, (uint64_t)DC_IB_KEY, dct_num, ext, &mlx5_av);
|
||||
ptr = ptr + 2;
|
||||
|
||||
uint64_t address = 0;
|
||||
uint32_t rkey = 0;
|
||||
rdma = reinterpret_cast<mlx5_wqe_raddr_seg*>(ptr);
|
||||
set_rdma_seg(rdma, address, rkey);
|
||||
ptr = ptr + 2;
|
||||
|
||||
uint32_t lkey = backend->networkImpl.heap_mr->lkey;
|
||||
data = reinterpret_cast<mlx5_wqe_data_seg*>(ptr);
|
||||
mlx5dv_set_data_seg(data, 1, lkey, 0);
|
||||
ptr = ptr + 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(bpotter): remove redundancies with the other derived class
|
||||
void DynamicConnection::post_wqes() {
|
||||
int remote_conn;
|
||||
get_remote_conn(&remote_conn);
|
||||
remote_conn *= backend->num_blocks_;
|
||||
post_dv_dc_wqe(remote_conn);
|
||||
}
|
||||
|
||||
void DynamicConnection::initialize_wr_fields([[maybe_unused]] ibv_send_wr* wr,
|
||||
[[maybe_unused]] ibv_ah* ah,
|
||||
[[maybe_unused]] int dc_key) {}
|
||||
|
||||
int DynamicConnection::get_sq_dv_offset([[maybe_unused]] int pe_idx,
|
||||
[[maybe_unused]] int num_qps,
|
||||
int wg_idx) {
|
||||
return wg_idx;
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,122 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
|
||||
|
||||
#include "connection.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class DynamicConnection : public Connection {
|
||||
public:
|
||||
explicit DynamicConnection(GPUIBBackend* backend);
|
||||
|
||||
~DynamicConnection() override;
|
||||
|
||||
void get_remote_conn(int* remote_conn) override;
|
||||
|
||||
void post_wqes() override;
|
||||
|
||||
void initialize_rkey_handle(uint32_t** heap_rkey_handle,
|
||||
ibv_mr* mr) override;
|
||||
|
||||
void free_rkey_handle(uint32_t* heap_rkey_handle) override;
|
||||
|
||||
uint32_t* get_vec_dct_num() const { return vec_dct_num; }
|
||||
|
||||
uint16_t* get_vec_lids() const { return vec_lids; }
|
||||
|
||||
private:
|
||||
InitQPState initqp(uint8_t port) override;
|
||||
|
||||
RtrState rtr(dest_info_t* dest, uint8_t port) override;
|
||||
|
||||
RtsState rts(dest_info_t* dest) override;
|
||||
|
||||
QPInitAttr qpattr(ibv_qp_cap cap) override;
|
||||
|
||||
void connect_dci(ibv_qp* qp, uint8_t port);
|
||||
|
||||
void create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq, uint8_t port);
|
||||
|
||||
ibv_qp_init_attr_ex dct_qp_init_attr(ibv_cq* cq, ibv_srq* srq,
|
||||
uint8_t port) const;
|
||||
|
||||
mlx5dv_qp_init_attr dct_dv_init_attr();
|
||||
|
||||
void dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av);
|
||||
|
||||
void set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg, uint64_t dc_key,
|
||||
uint32_t dct_num, uint8_t ext, mlx5_wqe_av* av);
|
||||
|
||||
void set_data_seg(mlx5_wqe_data_seg* data_seg, uint32_t lkey);
|
||||
|
||||
void post_dv_dc_wqe(int remote_conn);
|
||||
|
||||
void create_qps_1() override;
|
||||
|
||||
void create_qps_2(int port, int my_rank,
|
||||
ibv_port_attr* ib_port_att) override;
|
||||
|
||||
void create_qps_3(int port, ibv_qp* qp, int offset,
|
||||
ibv_port_attr* ib_port_att) override;
|
||||
|
||||
ibv_qp* create_qp_0(ibv_context* context,
|
||||
ibv_qp_init_attr_ex* qp_attr) override;
|
||||
|
||||
void allocate_dynamic_members(int num_wg) override;
|
||||
|
||||
void free_dynamic_members() override;
|
||||
|
||||
void initialize_1(int port, int num_wg) override;
|
||||
|
||||
void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override;
|
||||
|
||||
int get_sq_dv_offset(int pe_idx, int32_t num_qps, int wg_idx) override;
|
||||
|
||||
int num_dcis{1};
|
||||
|
||||
int num_dct{1};
|
||||
|
||||
static constexpr int DC_IB_KEY{0x1ee7a330};
|
||||
|
||||
uint32_t* dcts_num{nullptr};
|
||||
|
||||
uint16_t* lids{nullptr};
|
||||
|
||||
mlx5_wqe_av mlx5_av{};
|
||||
|
||||
ibv_ah* ah{nullptr};
|
||||
|
||||
ibv_srq* srq{nullptr};
|
||||
|
||||
ibv_cq* dct_cq{nullptr};
|
||||
|
||||
uint32_t* vec_dct_num{nullptr};
|
||||
|
||||
uint16_t* vec_lids{nullptr};
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
|
||||
@@ -1,79 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "endian.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
template <typename T>
|
||||
__device__ void swap_endian_store(T *dst, const T val) {
|
||||
typedef union U {
|
||||
T val;
|
||||
uint8_t bytes[sizeof(T)];
|
||||
} union_type;
|
||||
union_type src;
|
||||
union_type dst_tmp;
|
||||
|
||||
src.val = val;
|
||||
std::reverse_copy(src.bytes, src.bytes + sizeof(T), dst_tmp.bytes);
|
||||
*dst = dst_tmp.val;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) {
|
||||
uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) |
|
||||
((val >> 8) & 0x00FF00FF00FF00FFULL);
|
||||
|
||||
new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) |
|
||||
((new_val >> 16) & 0x0000FFFF0000FFFFULL);
|
||||
|
||||
*dst = (new_val << 32) | (new_val >> 32);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(int64_t *dst, const int64_t val) {
|
||||
swap_endian_store(reinterpret_cast<uint64_t *>(dst), (const uint64_t)val);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) {
|
||||
uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
|
||||
|
||||
*dst = (new_val << 16) | (new_val >> 16);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(int32_t *dst, const int32_t val) {
|
||||
swap_endian_store(reinterpret_cast<uint32_t *>(dst), (const uint32_t)val);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) {
|
||||
*dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(int16_t *dst, const int16_t val) {
|
||||
swap_endian_store(reinterpret_cast<uint16_t *>(dst), (const uint16_t)val);
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,53 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
template <typename T>
|
||||
__device__ void swap_endian_store(T *dst, const T val);
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(uint64_t *dst, const uint64_t val);
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(int64_t *dst, const int64_t val);
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(uint32_t *dst, const uint32_t val);
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(int32_t *dst, const int32_t val);
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(uint16_t *dst, const uint16_t val);
|
||||
|
||||
template <>
|
||||
__device__ void swap_endian_store(int16_t *dst, const int16_t val);
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
|
||||
@@ -1,56 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "gpu_ib_team.hpp"
|
||||
|
||||
#include "../backend_type.hpp"
|
||||
#include "backend_ib.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
GPUIBTeam::GPUIBTeam(Backend *backend, TeamInfo *team_info_parent,
|
||||
TeamInfo *team_info_world, int num_pes, int my_pe,
|
||||
MPI_Comm mpi_comm, int pool_index)
|
||||
: Team(backend, team_info_parent, team_info_world, num_pes, my_pe,
|
||||
mpi_comm) {
|
||||
type = BackendType::GPU_IB_BACKEND;
|
||||
const GPUIBBackend *b = static_cast<const GPUIBBackend *>(backend);
|
||||
|
||||
pool_index_ = pool_index;
|
||||
|
||||
barrier_pSync =
|
||||
&(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
|
||||
reduce_pSync =
|
||||
&(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
|
||||
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
|
||||
alltoall_pSync =
|
||||
&(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
|
||||
pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
|
||||
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
pAta = reinterpret_cast<char *>(b->pAta_pool) +
|
||||
ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
|
||||
}
|
||||
|
||||
GPUIBTeam::~GPUIBTeam() {}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,50 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
|
||||
|
||||
#include "../team.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class GPUIBTeam : public Team {
|
||||
public:
|
||||
GPUIBTeam(Backend* handle, TeamInfo* team_info_wrt_parent,
|
||||
TeamInfo* team_info_wrt_world, int num_pes, int my_pe,
|
||||
MPI_Comm team_comm, int pool_index);
|
||||
|
||||
virtual ~GPUIBTeam();
|
||||
|
||||
long* barrier_pSync{nullptr};
|
||||
long* reduce_pSync{nullptr};
|
||||
long* bcast_pSync{nullptr};
|
||||
long* alltoall_pSync{nullptr};
|
||||
void* pWrk{nullptr};
|
||||
void* pAta{nullptr};
|
||||
|
||||
int pool_index_{-1};
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
|
||||
@@ -1,49 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
|
||||
|
||||
#include <infiniband/mlx5dv.h>
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
typedef struct ib_mlx5_base_av {
|
||||
uint64_t dc_key;
|
||||
uint32_t dqp_dct;
|
||||
uint8_t stat_rate_sl;
|
||||
uint8_t fl_mlid;
|
||||
uint16_t rlid;
|
||||
} ib_mlx5_base_av_t;
|
||||
|
||||
union mlx5_segment {
|
||||
mlx5_wqe_ctrl_seg ctrl_seg;
|
||||
mlx5_wqe_raddr_seg raddr_seg;
|
||||
mlx5_wqe_atomic_seg atomic_seg;
|
||||
mlx5_wqe_data_seg data_seg;
|
||||
mlx5_wqe_inl_data_seg inl_data_seg;
|
||||
ib_mlx5_base_av_t base_av;
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
|
||||
@@ -1,78 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class GPUIBContext;
|
||||
|
||||
class MemoryBuilderPolicyWrapper {
|
||||
public:
|
||||
__device__ MemoryBuilderPolicyWrapper() = default;
|
||||
|
||||
__device__ ~MemoryBuilderPolicyWrapper() {
|
||||
if (wrapped_policy_) {
|
||||
delete wrapped_policy_;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ MemoryBuilderPolicyWrapper(T&& policy)
|
||||
: wrapped_policy_(new Wrapper<T>(std::forward<T>(policy))) {}
|
||||
|
||||
__device__ void operator()(GPUIBContext* context) {
|
||||
return (*wrapped_policy_)(context);
|
||||
}
|
||||
|
||||
private:
|
||||
class PolicyBase {
|
||||
public:
|
||||
__device__ virtual void operator()(GPUIBContext* context) = 0;
|
||||
|
||||
__device__ virtual ~PolicyBase() {}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class Wrapper : public PolicyBase {
|
||||
public:
|
||||
__device__ Wrapper(const T& t) : wrapped_policy_(t) {}
|
||||
|
||||
__device__ void operator()(GPUIBContext* context) override {
|
||||
return wrapped_policy_(context);
|
||||
}
|
||||
|
||||
private:
|
||||
T wrapped_policy_;
|
||||
};
|
||||
|
||||
PolicyBase* wrapped_policy_;
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
|
||||
@@ -1,500 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "network_policy.hpp"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../atomic_return.hpp"
|
||||
#include "../context_incl.hpp"
|
||||
#include "backend_ib.hpp"
|
||||
#include "connection.hpp"
|
||||
#include "dynamic_connection.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
#include "reliable_connection.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
void NetworkOnImpl::dump_backend_stats(ROCStats *globalStats) {
|
||||
/*
|
||||
* TODO(bpotter): Refactor this into the Stats class to remove the ifdef
|
||||
*/
|
||||
#ifdef PROFILE
|
||||
int statblocks = connection->total_number_connections();
|
||||
|
||||
uint64_t cycles_ring_sq_db = 0;
|
||||
uint64_t cycles_update_wqe = 0;
|
||||
uint64_t cycles_poll_cq = 0;
|
||||
uint64_t cycles_next_cq = 0;
|
||||
uint64_t cycles_init = gpu_qps[statblocks - 1].profiler.getStat(INIT);
|
||||
uint64_t cycles_finalize = gpu_qps[statblocks - 1].profiler.getStat(FINALIZE);
|
||||
|
||||
uint64_t total_quiet_count = 0;
|
||||
uint64_t total_db_count = 0;
|
||||
uint64_t total_wqe_count = 0;
|
||||
|
||||
for (int i = 0; i < statblocks; i++) {
|
||||
cycles_ring_sq_db += gpu_qps[i].profiler.getStat(RING_SQ_DB);
|
||||
cycles_update_wqe += gpu_qps[i].profiler.getStat(UPDATE_WQE);
|
||||
cycles_poll_cq += gpu_qps[i].profiler.getStat(POLL_CQ);
|
||||
cycles_next_cq += gpu_qps[i].profiler.getStat(NEXT_CQ);
|
||||
total_quiet_count += gpu_qps[i].profiler.getStat(QUIET_COUNT);
|
||||
total_db_count += gpu_qps[i].profiler.getStat(DB_COUNT);
|
||||
total_wqe_count += gpu_qps[i].profiler.getStat(WQE_COUNT);
|
||||
}
|
||||
|
||||
double us_ring_sq_db = cycles_ring_sq_db / gpu_clock_freq_mhz;
|
||||
double us_update_wqe = cycles_update_wqe / gpu_clock_freq_mhz;
|
||||
double us_poll_cq = cycles_poll_cq / gpu_clock_freq_mhz;
|
||||
double us_next_cq = cycles_next_cq / gpu_clock_freq_mhz;
|
||||
double us_init = cycles_init / gpu_clock_freq_mhz;
|
||||
double us_finalize = cycles_finalize / gpu_clock_freq_mhz;
|
||||
|
||||
const int FIELD_WIDTH = 20;
|
||||
const int FLOAT_PRECISION = 2;
|
||||
|
||||
printf("Counts: Internal Quiets %lu DB Rings %lu WQE Posts %lu\n",
|
||||
total_quiet_count, total_db_count, total_wqe_count);
|
||||
|
||||
printf("\n%*s%*s%*s%*s%*s%*s\n", FIELD_WIDTH + 1, "Init (us)",
|
||||
FIELD_WIDTH + 1, "Finalize (us)", FIELD_WIDTH + 1, "Ring SQ DB (us)",
|
||||
FIELD_WIDTH + 1, "Update WQE (us)", FIELD_WIDTH + 1, "Poll CQ (us)",
|
||||
FIELD_WIDTH + 1, "Next CQ (us)");
|
||||
|
||||
uint64_t totalFinalize = globalStats->getStat(NUM_FINALIZE);
|
||||
printf("%*.*f %*.*f %*.*f %*.*f %*.*f %*.*f\n", FIELD_WIDTH, FLOAT_PRECISION,
|
||||
us_init / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION,
|
||||
us_finalize / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION,
|
||||
us_ring_sq_db / total_db_count, FIELD_WIDTH, FLOAT_PRECISION,
|
||||
us_update_wqe / total_wqe_count, FIELD_WIDTH, FLOAT_PRECISION,
|
||||
us_poll_cq / total_quiet_count, FIELD_WIDTH, FLOAT_PRECISION,
|
||||
us_next_cq / total_quiet_count);
|
||||
#endif
|
||||
}
|
||||
|
||||
void NetworkOnImpl::reset_backend_stats() {
|
||||
int statblocks = connection->total_number_connections();
|
||||
|
||||
for (size_t i = 0; i < statblocks; i++) {
|
||||
gpu_qps[i].profiler.resetStats();
|
||||
}
|
||||
}
|
||||
|
||||
void NetworkOnImpl::exchange_hdp_info(HdpPolicy *hdp_policy,
|
||||
MPI_Comm thread_comm) {
|
||||
/*
|
||||
* Using Connection class, register the host-side hdp flush address
|
||||
* with the InfiniBand network.
|
||||
*/
|
||||
connection->reg_mr(hdp_policy->get_hdp_flush_ptr(), 32, &hdp_mr, false);
|
||||
|
||||
/*
|
||||
* Allocate device-side memory for the remote HDP keys.
|
||||
*/
|
||||
CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_rkey),
|
||||
num_pes * sizeof(uint32_t)));
|
||||
|
||||
/*
|
||||
* Allocate device-side memory for the remote HDP addresses.
|
||||
*/
|
||||
CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_address),
|
||||
num_pes * sizeof(uintptr_t)));
|
||||
|
||||
/*
|
||||
* Allocate host-side memory to exchange hdp keys using MPI_Allgather.
|
||||
*/
|
||||
uint32_t *host_hdp_cpy =
|
||||
reinterpret_cast<uint32_t *>(malloc(num_pes * sizeof(uint32_t)));
|
||||
if (host_hdp_cpy == nullptr) {
|
||||
abort();
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate host-side memory to exchange hdp addresses using
|
||||
* MPI_Allgather.
|
||||
*/
|
||||
uint32_t **host_hdp_address_cpy =
|
||||
reinterpret_cast<uint32_t **>(malloc(num_pes * sizeof(uint32_t *)));
|
||||
if (host_hdp_address_cpy == nullptr) {
|
||||
free(host_hdp_cpy);
|
||||
abort();
|
||||
}
|
||||
|
||||
/*
|
||||
* This processing element writes its personal HDP key and HDP address
|
||||
* into the host-side arrays which were just allocated.
|
||||
*/
|
||||
int my_rank = my_pe;
|
||||
host_hdp_cpy[my_rank] = htobe32(hdp_mr->rkey);
|
||||
host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr();
|
||||
|
||||
/*
|
||||
* Do all-to-all exchange of our HDP key with other processing elements.
|
||||
*/
|
||||
MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_hdp_cpy,
|
||||
sizeof(uint32_t), MPI_CHAR, thread_comm);
|
||||
|
||||
/*
|
||||
* Do all-to-all exchange of our HDP address with other processing
|
||||
* elements.
|
||||
*/
|
||||
MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy,
|
||||
sizeof(uint32_t *), MPI_CHAR, thread_comm);
|
||||
|
||||
/*
|
||||
* Copy the recently exchanged HDP keys to device memory.
|
||||
*/
|
||||
hipStream_t stream;
|
||||
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
CHECK_HIP(hipMemcpyAsync(hdp_rkey, host_hdp_cpy, num_pes * sizeof(uint32_t),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
/*
|
||||
* Copy the recently exchanged HDP addresses to device memory.
|
||||
*/
|
||||
CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy,
|
||||
num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice,
|
||||
stream));
|
||||
CHECK_HIP(hipStreamSynchronize(stream));
|
||||
CHECK_HIP(hipStreamDestroy(stream));
|
||||
|
||||
/*
|
||||
* Free the host-side resources used to exchange HDP resources
|
||||
* between processing elements.
|
||||
*/
|
||||
free(host_hdp_cpy);
|
||||
free(host_hdp_address_cpy);
|
||||
}
|
||||
|
||||
void NetworkOnImpl::setup_atomic_region() {
|
||||
/*
|
||||
* Allocate fine-grained device-side memory for the atomic return
|
||||
* region.
|
||||
*/
|
||||
allocate_atomic_region(&atomic_ret, num_blocks);
|
||||
|
||||
/*
|
||||
* Register the atomic return region on the InfiniBand network.
|
||||
*/
|
||||
connection->reg_mr(atomic_ret->atomic_base_ptr,
|
||||
sizeof(uint64_t) * max_nb_atomic * num_blocks, &mr, false);
|
||||
|
||||
/*
|
||||
* Set member variable from class.
|
||||
*/
|
||||
atomic_ret->atomic_lkey = htobe32(mr->lkey);
|
||||
}
|
||||
|
||||
void NetworkOnImpl::heap_memory_rkey(char *local_heap_base, size_t heap_size,
|
||||
MPI_Comm thread_comm, bool is_managed) {
|
||||
/*
|
||||
* Allocate host-side memory to hold remote keys for all processing
|
||||
* elements.
|
||||
*/
|
||||
const size_t rkeys_size = sizeof(uint32_t) * num_pes;
|
||||
uint32_t *host_rkey_cpy = reinterpret_cast<uint32_t *>(malloc(rkeys_size));
|
||||
if (host_rkey_cpy == nullptr) {
|
||||
abort();
|
||||
}
|
||||
|
||||
/*
|
||||
* Using the Connection class, register the symmetric heap with the
|
||||
* InfiniBand network.
|
||||
*/
|
||||
void *base_heap = local_heap_base;
|
||||
connection->reg_mr(base_heap, heap_size, &heap_mr, is_managed);
|
||||
|
||||
/*
|
||||
* Using the memory region from the prior heap memory registration,
|
||||
* allocate and initialize some device-side memory to hold the remote
|
||||
* keys for the symmetric heap base.
|
||||
*
|
||||
* Only the device-side memory entry for this processing element will be
|
||||
* updated with the key for the heap memory region.
|
||||
*/
|
||||
connection->initialize_rkey_handle(&heap_rkey, heap_mr);
|
||||
|
||||
/*
|
||||
* Copy the device-side heap base remote key array to the host-side
|
||||
* heap base remote key array.
|
||||
*/
|
||||
hipStream_t stream;
|
||||
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
CHECK_HIP(hipMemcpyAsync(host_rkey_cpy, heap_rkey, rkeys_size,
|
||||
hipMemcpyDeviceToHost, stream));
|
||||
CHECK_HIP(hipStreamSynchronize(stream));
|
||||
|
||||
/*
|
||||
* Do all-to-all exchange of symmetric heap base remote key between the
|
||||
* processing elements.
|
||||
*/
|
||||
MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_rkey_cpy,
|
||||
sizeof(uint32_t), MPI_CHAR, thread_comm);
|
||||
|
||||
/*
|
||||
* Copy the recently updated host-side heap base remote key array back
|
||||
* to the device-side memory.
|
||||
*/
|
||||
CHECK_HIP(hipMemcpyAsync(heap_rkey, host_rkey_cpy, rkeys_size,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
CHECK_HIP(hipStreamSynchronize(stream));
|
||||
CHECK_HIP(hipStreamDestroy(stream));
|
||||
|
||||
/*
|
||||
* Free the host-side resources used to do the processing element
|
||||
* exchange of keys and addresses for the symmetric heap base.
|
||||
*/
|
||||
free(host_rkey_cpy);
|
||||
|
||||
/*
|
||||
* Initialize this member variable to hold the InfiniBand memory
|
||||
* region's local key.
|
||||
*/
|
||||
lkey = heap_mr->lkey;
|
||||
}
|
||||
|
||||
void NetworkOnImpl::setup_gpu_qps(GPUIBBackend *B) {
|
||||
/*
|
||||
* Determine how many connections are needed.
|
||||
* The number of connections depends on the connection type and the
|
||||
* number of workgroups.
|
||||
*/
|
||||
int connections;
|
||||
connection->get_remote_conn(&connections);
|
||||
connections *= num_blocks;
|
||||
|
||||
/*
|
||||
* Allocate device-side memory for the queue pairs.
|
||||
*/
|
||||
CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * connections));
|
||||
|
||||
/*
|
||||
* For every connection, initialize the QueuePair.
|
||||
*/
|
||||
for (int i = 0; i < connections; i++) {
|
||||
new (&gpu_qps[i]) QueuePair(B);
|
||||
connection->init_gpu_qp_from_connection(&gpu_qps[i], i);
|
||||
}
|
||||
}
|
||||
|
||||
void NetworkOnImpl::rocshmem_g_init(SymmetricHeap *heap_handle,
|
||||
MPI_Comm thread_comm) {
|
||||
init_g_ret(heap_handle, thread_comm, num_blocks, &g_ret);
|
||||
}
|
||||
|
||||
__host__ void NetworkOnImpl::networkHostSetup(GPUIBBackend *B) {
|
||||
num_pes = B->num_pes;
|
||||
my_pe = B->my_pe;
|
||||
num_blocks = B->num_blocks_;
|
||||
|
||||
#ifdef USE_DC
|
||||
connection = new DynamicConnection(B);
|
||||
#else
|
||||
connection = new ReliableConnection(B);
|
||||
#endif
|
||||
|
||||
connection->initialize(B->num_blocks_);
|
||||
exchange_hdp_info(B->hdp_policy, B->thread_comm);
|
||||
|
||||
const auto &heap_bases{B->heap.get_heap_bases()};
|
||||
heap_memory_rkey(heap_bases[my_pe], B->heap.get_size(), B->thread_comm,
|
||||
B->heap.is_managed());
|
||||
// The earliest we can allow the main thread to launch a kernel to
|
||||
// avoid potential deadlock
|
||||
network_init_done = true;
|
||||
|
||||
setup_atomic_region();
|
||||
|
||||
connection->initialize_gpu_policy(&connection_policy, heap_rkey);
|
||||
|
||||
rocshmem_g_init(&B->heap, B->thread_comm);
|
||||
|
||||
connection->post_wqes();
|
||||
|
||||
setup_gpu_qps(B);
|
||||
}
|
||||
|
||||
__host__ void NetworkOnImpl::networkHostFinalize() {
|
||||
CHECK_HIP(hipFree(hdp_rkey));
|
||||
hdp_rkey = nullptr;
|
||||
|
||||
CHECK_HIP(hipFree(hdp_address));
|
||||
hdp_address = nullptr;
|
||||
|
||||
CHECK_HIP(hipFree(atomic_ret));
|
||||
atomic_ret = nullptr;
|
||||
|
||||
CHECK_HIP(hipFree(gpu_qps));
|
||||
gpu_qps = nullptr;
|
||||
|
||||
CHECK_HIP(hipFree(connection_policy));
|
||||
connection_policy = nullptr;
|
||||
|
||||
connection->free_rkey_handle(heap_rkey);
|
||||
|
||||
connection->finalize();
|
||||
delete connection;
|
||||
connection = nullptr;
|
||||
}
|
||||
|
||||
__host__ void NetworkOnImpl::networkHostInit(GPUIBContext *ctx, int buffer_id) {
|
||||
int remote_conn = getNumQueuePairs();
|
||||
|
||||
CHECK_HIP(hipMalloc(&ctx->device_qp_proxy, remote_conn * sizeof(QueuePair)));
|
||||
|
||||
for (int i = 0; i < getNumQueuePairs(); i++) {
|
||||
/*
|
||||
* RC gpu_qp is actually [NUM_PE][NUM_BLOCK] qps but is flattened.
|
||||
* Each num_pe entry contains num_block QPs connected to that PE.
|
||||
* For RC, we need to iterate gpu_qp[i][buffer_id] to collect a
|
||||
* single QP for each connected PE in order to build context.
|
||||
* For DC, NUM_PE = 1 so can just use buffer_id directly.
|
||||
*/
|
||||
int offset = num_blocks * i + buffer_id;
|
||||
new (ctx->getQueuePair(i)) QueuePair(gpu_qps[offset]);
|
||||
|
||||
auto *qp = ctx->getQueuePair(i);
|
||||
qp->global_qp = &gpu_qps[offset];
|
||||
qp->num_cqs = getNumQueuePairs();
|
||||
qp->atomic_ret.atomic_base_ptr =
|
||||
&atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id];
|
||||
qp->base_heap = ctx->base_heap;
|
||||
}
|
||||
ctx->g_ret = g_ret;
|
||||
}
|
||||
|
||||
__device__ void NetworkOnImpl::networkGpuInit(GPUIBContext *ctx,
|
||||
int buffer_id) {
|
||||
for (int i = 0; i < getNumQueuePairs(); i++) {
|
||||
int offset = num_blocks * i + buffer_id;
|
||||
|
||||
auto *qp = ctx->getQueuePair(i);
|
||||
new (qp) QueuePair(gpu_qps[offset]);
|
||||
|
||||
qp->global_qp = &gpu_qps[offset];
|
||||
qp->num_cqs = getNumQueuePairs();
|
||||
qp->atomic_ret.atomic_base_ptr =
|
||||
&atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id];
|
||||
qp->base_heap = ctx->base_heap;
|
||||
}
|
||||
ctx->g_ret = g_ret;
|
||||
}
|
||||
|
||||
__device__ __host__ QueuePair *NetworkOnImpl::getQueuePair(QueuePair *qp_handle,
|
||||
int pe) {
|
||||
#ifdef USE_DC
|
||||
return qp_handle;
|
||||
#else
|
||||
return &qp_handle[pe];
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __host__ int NetworkOnImpl::getNumQueuePairs() {
|
||||
#ifdef USE_DC
|
||||
return 1;
|
||||
#else
|
||||
return num_pes;
|
||||
#endif
|
||||
}
|
||||
|
||||
void NetworkOffImpl::networkHostSetup(GPUIBBackend *B) {
|
||||
num_pes = B->num_pes;
|
||||
my_pe = B->my_pe;
|
||||
num_blocks = B->num_blocks_;
|
||||
|
||||
exchange_hdp_info(B->hdp_policy, B->thread_comm);
|
||||
}
|
||||
void NetworkOffImpl::exchange_hdp_info(HdpPolicy *hdp_policy,
|
||||
MPI_Comm thread_comm) {
|
||||
#ifdef USE_SINGLE_NODE
|
||||
// We are using the symmetric heap for the HDP flush ptr
|
||||
hdp_address = reinterpret_cast<uintptr_t *>(hdp_policy->get_hdp_flush_ptr());
|
||||
#else
|
||||
/*
|
||||
* Allocate device-side memory for the remote HDP addresses.
|
||||
*/
|
||||
CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_address),
|
||||
num_pes * sizeof(uintptr_t)));
|
||||
|
||||
/*
|
||||
* Allocate host-side memory to exchange hdp keys using MPI_Allgather.
|
||||
*/
|
||||
uint32_t *host_hdp_cpy =
|
||||
reinterpret_cast<uint32_t *>(malloc(num_pes * sizeof(uint32_t)));
|
||||
if (host_hdp_cpy == nullptr) {
|
||||
abort();
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate host-side memory to exchange hdp addresses using
|
||||
* MPI_Allgather.
|
||||
*/
|
||||
uint32_t **host_hdp_address_cpy =
|
||||
reinterpret_cast<uint32_t **>(malloc(num_pes * sizeof(uint32_t *)));
|
||||
if (host_hdp_address_cpy == nullptr) {
|
||||
free(host_hdp_cpy);
|
||||
abort();
|
||||
}
|
||||
|
||||
/*
|
||||
* This processing element writes its personal HDP address
|
||||
* into the host-side array which were just allocated.
|
||||
*/
|
||||
int my_rank = my_pe;
|
||||
host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr();
|
||||
|
||||
/*
|
||||
* Do all-to-all exchange of our HDP address with other processing
|
||||
* elements.
|
||||
*/
|
||||
MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy,
|
||||
sizeof(uint32_t *), MPI_CHAR, thread_comm);
|
||||
|
||||
/*
|
||||
* Copy the recently exchanged HDP addresses to device memory.
|
||||
*/
|
||||
hipStream_t stream;
|
||||
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy,
|
||||
num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice,
|
||||
stream));
|
||||
CHECK_HIP(hipStreamSynchronize(stream));
|
||||
CHECK_HIP(hipStreamDestroy(stream));
|
||||
|
||||
/*
|
||||
* Free the host-side resources used to exchange HDP resources
|
||||
* between processing elements.
|
||||
*/
|
||||
free(host_hdp_cpy);
|
||||
free(host_hdp_address_cpy);
|
||||
#endif
|
||||
}
|
||||
|
||||
void NetworkOffImpl::networkHostFinalize() {
|
||||
#ifndef USE_SINGLE_NODE
|
||||
CHECK_HIP(hipFree(hdp_address));
|
||||
#endif
|
||||
hdp_address = nullptr;
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,357 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "connection_policy.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
#include "../hdp_policy.hpp"
|
||||
#include "../memory/symmetric_heap.hpp"
|
||||
#include "../stats.hpp"
|
||||
#include "../util.hpp"
|
||||
|
||||
struct ibv_mr;
|
||||
struct hdp_reg_t;
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
struct atomic_ret_t;
|
||||
class GPUIBBackend;
|
||||
class GPUIBContext;
|
||||
class GPUIBHostContext;
|
||||
class Connection;
|
||||
|
||||
class NetworkOnImpl {
|
||||
public:
|
||||
void dump_backend_stats(ROCStats *globalStats);
|
||||
|
||||
void reset_backend_stats();
|
||||
|
||||
/**
|
||||
* @brief setup the network resources and initialization for the
|
||||
* GPUIBBackend
|
||||
*/
|
||||
__host__ void networkHostSetup(GPUIBBackend *B);
|
||||
|
||||
/**
|
||||
* @brief deallocate and close the network resources
|
||||
*/
|
||||
__host__ void networkHostFinalize();
|
||||
|
||||
/**
|
||||
* @brief initialize the network resources for each context
|
||||
*/
|
||||
__host__ void networkHostInit(GPUIBContext *ctx, int buffer_id);
|
||||
|
||||
/**
|
||||
* @brief initialize the network resources for each context on GPU side
|
||||
*/
|
||||
__device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id);
|
||||
|
||||
/**
|
||||
* @brief returns the QP for the targeted pe
|
||||
*/
|
||||
__device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe);
|
||||
|
||||
/**
|
||||
* @brief returns the numbers of QPs used per the calling PE
|
||||
*/
|
||||
__device__ __host__ int getNumQueuePairs();
|
||||
|
||||
/**
|
||||
* @brief returns the number of PEs accessible via network
|
||||
*/
|
||||
__device__ __host__ int getNumDest() { return num_pes; }
|
||||
|
||||
static uint32_t externSharedBytes(int num_pes) {
|
||||
int remote_conn{1};
|
||||
#ifndef USE_DC
|
||||
remote_conn = num_pes;
|
||||
#endif
|
||||
return remote_conn * sizeof(QueuePair);
|
||||
}
|
||||
|
||||
protected:
|
||||
/**
|
||||
* @brief flag to indicated that the helper thread reach this milestone
|
||||
*/
|
||||
volatile bool network_init_done{false};
|
||||
|
||||
void heap_memory_rkey(char *local_heap_base, size_t heap_size,
|
||||
MPI_Comm thread_comm, bool is_managed);
|
||||
|
||||
/**
|
||||
* @brief Exchange HDP information between all processing elements.
|
||||
*
|
||||
* Each device has a Host Data Path (HDP) associated with it must be
|
||||
* manually controlled when using fine-grained memory accesses. (The
|
||||
* symmetric heap is allocated with fine-grained memory to support both
|
||||
* host memory accesses and device memory accesses.) The HDP can be
|
||||
* cleared by accessing an address on the device. These addresses must be
|
||||
* shared across the network (to support updates on remote accesses).
|
||||
*
|
||||
* These HDPs are visible to the network by registering them as
|
||||
* InfiniBand memory regions. Every memory region has a remote key
|
||||
* which needs to be shared across the network (to access the memory
|
||||
* region).
|
||||
*
|
||||
* This method is responsible to allocating and initializing the
|
||||
* library's HDP device-side memory and running the all-to-all exchange
|
||||
* to share both the keys and addresses.
|
||||
*
|
||||
* @todo Implement HDP policy class methods to hide most of this
|
||||
* method. The guts should be encapsulated in the policy class and
|
||||
* not exposed here in the backend. Within the policy class methods,
|
||||
* create helper function to improve code reuse regarding the many
|
||||
* data transfers.
|
||||
*/
|
||||
void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm);
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize the atomic region.
|
||||
*
|
||||
* The atomic region is used by the atomic operations which have return
|
||||
* values. The library user does not need to provide an address for the
|
||||
* return value so we are forced to do it on their behalf.
|
||||
*
|
||||
* The atomic_ret member is initialized upon completion of this method.
|
||||
*/
|
||||
void setup_atomic_region();
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize device-side queue pair objects.
|
||||
*
|
||||
* Upon completion, the gpu_qps member will be initialized.
|
||||
*/
|
||||
void setup_gpu_qps(GPUIBBackend *B);
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize device-side memory that will be used for
|
||||
* the return of g shmem ops (eg: shmem_int_g)
|
||||
*/
|
||||
void rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm);
|
||||
|
||||
/**
|
||||
* @brief The backend delegates some InfiniBand connection setup to
|
||||
* the Connection class.
|
||||
*/
|
||||
Connection *connection{nullptr};
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Number of PEs. Get directly from the GPUIBBackend.
|
||||
*/
|
||||
int num_pes{0};
|
||||
|
||||
/**
|
||||
* @brief This PE's rank.
|
||||
*/
|
||||
int my_pe{-1};
|
||||
|
||||
/**
|
||||
* @brief Number of WG that will be performing communication
|
||||
*/
|
||||
int num_blocks{0};
|
||||
|
||||
/**
|
||||
* @brief Holds InfiniBand remote keys for HDP memory regions.
|
||||
*
|
||||
* The member holds a C-array allocation for remote keys (from
|
||||
* InfiniBand memory registrations) for remote HDP registers. The C-array
|
||||
* has one entry for each processing element (indexed by processing
|
||||
* element ID).
|
||||
*
|
||||
* @todo Remove duplication between the backend class and the QueuePair
|
||||
* class. QueuePair stores a copy of this member too. The backend
|
||||
* class does not do much besides initialize this data structure and
|
||||
* hold it until the QueuePair can consume it.
|
||||
*/
|
||||
uint32_t *hdp_rkey{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Holds HDP register addresses for each processing element.
|
||||
*
|
||||
* The Host Data Path (HDP) addresses are used to clear a buffer
|
||||
* which interferes with memory visibility of accesses to fine-grained
|
||||
* allocations.
|
||||
*
|
||||
* The member holds a C-array allocation for the register addresses.
|
||||
* The C-array has one entry for each processing element (indexed by
|
||||
* processing element ID).
|
||||
*
|
||||
* @todo Remove duplication between the backend class and the QueuePair
|
||||
* class. QueuePair stores a copy of this member too. The backend
|
||||
* class does not do much besides initialize this data structure and
|
||||
* hold it until the QueuePair can consume it.
|
||||
*/
|
||||
uintptr_t *hdp_address{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for the HDP memory region.
|
||||
*/
|
||||
ibv_mr *hdp_mr{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Set of QueuePairs used by device to do networking.
|
||||
*
|
||||
* The member is used during Context creation.
|
||||
*
|
||||
* @todo What we really need here is a collection of Contexts that can
|
||||
* either be copied into LDS or used directly by the GPU depending on
|
||||
* what type of context it is (shareable, serialized, or private).
|
||||
* No need to pool up QueuePairs, they can just be managed by their
|
||||
* owning Context. Should then consider pushing into base class since
|
||||
* it's not gpu-ib specific.
|
||||
*/
|
||||
QueuePair *gpu_qps{nullptr};
|
||||
|
||||
/**
|
||||
* @brief C-array of symmetric heap base pointers.
|
||||
*
|
||||
* A C-array of char* pointers corresponding to the heap base pointers
|
||||
* virtual address for each processing element that we can communicate
|
||||
* with.
|
||||
*/
|
||||
uint32_t *heap_rkey{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for the symmetric heap memory region.
|
||||
*/
|
||||
ibv_mr *heap_mr{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Local key for the symmetric heap memory region.
|
||||
*/
|
||||
uint32_t lkey{0};
|
||||
|
||||
/**
|
||||
* @brief Control struct for atomic memory region.
|
||||
*
|
||||
* The atomic region is used by the atomic operations which have return
|
||||
* values. The library user does not need to provide an address for the
|
||||
* return value so we are forced to do it on their behalf.
|
||||
*/
|
||||
atomic_ret_t *atomic_ret{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Handle for the atomic memory region.
|
||||
*
|
||||
* @todo Provide more descriptive variable name.
|
||||
*/
|
||||
ibv_mr *mr{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Buffer used to store the results of a *_g operation.
|
||||
*
|
||||
* These operations do not provide a destination buffer so the runtime
|
||||
* must manage one.
|
||||
*/
|
||||
char *g_ret{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Compile-time configuration policy for InfiniBand connections.
|
||||
*
|
||||
* The configuration option "USE_DC" can be enabled to create
|
||||
* Dynamic connection types. By default, Reliable connections are
|
||||
* created.
|
||||
*/
|
||||
ConnectionImpl *connection_policy{nullptr};
|
||||
};
|
||||
|
||||
// clang-format off
|
||||
NOWARN(-Wunused-parameter,
|
||||
class NetworkOffImpl {
|
||||
public:
|
||||
void dump_backend_stats(ROCStats *globalStats) { }
|
||||
|
||||
void reset_backend_stats() { }
|
||||
|
||||
__host__ void networkHostSetup(GPUIBBackend *B);
|
||||
|
||||
__host__ void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm);
|
||||
|
||||
__host__ void networkHostFinalize();
|
||||
|
||||
__host__ void networkHostInit(GPUIBContext *ctx, int buffer_id) {}
|
||||
|
||||
__device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id) {}
|
||||
|
||||
__device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
__device__ __host__ int getNumQueuePairs() { return 0; }
|
||||
|
||||
__device__ __host__ int getNumDest() { return 0; }
|
||||
|
||||
static uint32_t externSharedBytes(int num_pes) { return 0; }
|
||||
|
||||
public:
|
||||
int num_pes{0};
|
||||
|
||||
int my_pe{-1};
|
||||
|
||||
int num_blocks{0};
|
||||
|
||||
uint32_t *hdp_rkey{nullptr};
|
||||
|
||||
uintptr_t *hdp_address{nullptr};
|
||||
|
||||
ibv_mr *hdp_mr{nullptr};
|
||||
|
||||
QueuePair *gpu_qps{nullptr};
|
||||
|
||||
uint32_t *heap_rkey{nullptr};
|
||||
|
||||
ibv_mr *heap_mr{nullptr};
|
||||
|
||||
uint32_t lkey{0};
|
||||
|
||||
atomic_ret_t *atomic_ret{nullptr};
|
||||
|
||||
ibv_mr *mr{nullptr};
|
||||
|
||||
char *g_ret{nullptr};
|
||||
|
||||
ConnectionImpl *connection_policy{nullptr};
|
||||
};
|
||||
)
|
||||
// clang-format on
|
||||
|
||||
/*
|
||||
* Select which one of our IPC policies to use at compile time.
|
||||
*/
|
||||
#ifdef USE_SINGLE_NODE
|
||||
typedef NetworkOffImpl NetworkImpl;
|
||||
#else
|
||||
typedef NetworkOnImpl NetworkImpl;
|
||||
#endif
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
|
||||
@@ -1,79 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "qe_dumper.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
QeDumper::QeDumper(int dest_pe, int src_wg, int index)
|
||||
: dest_pe_(dest_pe), src_wg_(src_wg), index_(index) {
|
||||
void* temp = malloc(sizeof(GPUIBBackend*));
|
||||
gpu_backend_ = static_cast<GPUIBBackend*>(temp);
|
||||
|
||||
GPUIBBackend* device_backend_proxy_address;
|
||||
CHECK_HIP(hipGetSymbolAddress(
|
||||
reinterpret_cast<void**>(&device_backend_proxy_address),
|
||||
HIP_SYMBOL(device_backend_proxy)));
|
||||
|
||||
CHECK_HIP(hipMemcpy(&gpu_backend_, device_backend_proxy_address,
|
||||
sizeof(GPUIBBackend*), hipMemcpyDeviceToHost));
|
||||
|
||||
int qp_offset = gpu_backend_->num_blocks_ * dest_pe_ + src_wg_;
|
||||
|
||||
qp_ = &(gpu_backend_->networkImpl.gpu_qps[qp_offset]);
|
||||
}
|
||||
|
||||
QeDumper::~QeDumper() {
|
||||
/*if (gpu_backend_) {
|
||||
free(gpu_backend_);
|
||||
}*/
|
||||
}
|
||||
|
||||
void QeDumper::dump_cq() {
|
||||
type_ = "CQ";
|
||||
|
||||
auto* raw_cqe = &(qp_->current_cq_q_H[index_]);
|
||||
raw_u64_ = reinterpret_cast<uint64_t*>(raw_cqe);
|
||||
|
||||
dump_uint64_(8);
|
||||
}
|
||||
|
||||
void QeDumper::dump_sq() {
|
||||
type_ = "SQ";
|
||||
|
||||
auto* raw_sqe = &(qp_->current_sq_H[index_ * 8]);
|
||||
raw_u64_ = reinterpret_cast<uint64_t*>(raw_sqe);
|
||||
|
||||
dump_uint64_(8);
|
||||
}
|
||||
|
||||
void QeDumper::dump_uint64_(size_t num_elems) const {
|
||||
printf("%s(%d, %d, %d) *** = ", type_.c_str(), dest_pe_, src_wg_, index_);
|
||||
|
||||
for (size_t i = 0; i < num_elems; i++) {
|
||||
printf(" %lx ", raw_u64_[i]);
|
||||
}
|
||||
|
||||
printf("done %s\n", type_.c_str());
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,66 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <infiniband/mlx5dv.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "backend_ib.hpp"
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class QeDumper {
|
||||
public:
|
||||
QeDumper(int dest_pe, int src_wg, int index);
|
||||
|
||||
~QeDumper();
|
||||
|
||||
void dump_cq();
|
||||
|
||||
void dump_sq();
|
||||
|
||||
private:
|
||||
void dump_uint64_(size_t num_elems) const;
|
||||
|
||||
int dest_pe_{-1};
|
||||
|
||||
int src_wg_{-1};
|
||||
|
||||
int index_{-1};
|
||||
|
||||
GPUIBBackend* gpu_backend_{nullptr};
|
||||
|
||||
std::string type_{};
|
||||
|
||||
QueuePair* qp_{nullptr};
|
||||
|
||||
uint64_t* raw_u64_{nullptr};
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
|
||||
@@ -1,437 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "backend_ib.hpp"
|
||||
#include "endian.hpp"
|
||||
#include "segment_builder.hpp"
|
||||
#include "../util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
QueuePair::QueuePair(GPUIBBackend *backend)
|
||||
: hdp_policy(backend->hdp_policy),
|
||||
connection_policy(*backend->networkImpl.connection_policy) {
|
||||
hdp_rkey = backend->networkImpl.hdp_rkey;
|
||||
hdp_address = backend->networkImpl.hdp_address;
|
||||
|
||||
atomic_ret.atomic_lkey = backend->networkImpl.atomic_ret->atomic_lkey;
|
||||
atomic_ret.atomic_counter = 0;
|
||||
}
|
||||
|
||||
__device__ QueuePair::~QueuePair() {
|
||||
uint64_t start = profiler.startTimer();
|
||||
|
||||
global_qp->sq_counter = sq_counter;
|
||||
global_qp->local_sq_cnt = local_sq_cnt;
|
||||
global_qp->cq_consumer_counter = cq_consumer_counter;
|
||||
global_qp->current_sq = current_sq;
|
||||
global_qp->current_cq_q = current_cq_q;
|
||||
global_qp->sq_overflow = sq_overflow;
|
||||
global_qp->quiet_counter = quiet_counter;
|
||||
profiler.endTimer(start, FINALIZE);
|
||||
|
||||
global_qp->profiler.accumulateStats(profiler);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ uint8_t QueuePair::get_cq_error_syndrome(mlx5_cqe64 *cqe_entry) {
|
||||
mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
|
||||
return cqe_err->syndrome;
|
||||
}
|
||||
|
||||
__device__ void QueuePair::ring_doorbell(uint64_t db_val) {
|
||||
swap_endian_store(const_cast<uint32_t *>(dbrec_send),
|
||||
reinterpret_cast<uint32_t>(sq_counter));
|
||||
STORE(db.ptr, db_val);
|
||||
db.uint ^= 256;
|
||||
}
|
||||
|
||||
__device__ void QueuePair::set_completion_flag_on_wqe(int num_wqes) {
|
||||
uint64_t *wqe = ¤t_sq[8 * ((sq_counter - num_wqes) % max_nwqe)];
|
||||
uint8_t *wqe_ce = reinterpret_cast<uint8_t *>(wqe) + 11;
|
||||
*wqe_ce = 8;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void QueuePair::update_wqe_ce_single<false>(int num_wqes) {
|
||||
if (sq_counter % max_nwqe == (max_nwqe - 2)) {
|
||||
set_completion_flag_on_wqe(num_wqes);
|
||||
quiet_counter++;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void QueuePair::update_wqe_ce_single<true>(int num_wqes) {
|
||||
set_completion_flag_on_wqe(num_wqes);
|
||||
quiet_counter++;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void QueuePair::update_wqe_ce_thread<false>(int num_wqes) {}
|
||||
|
||||
template <>
|
||||
__device__ void QueuePair::update_wqe_ce_thread<true>(int num_wqes) {
|
||||
set_completion_flag_on_wqe(num_wqes);
|
||||
atomicAdd(&quiet_counter, 1);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::compute_db_val_opcode(uint64_t *db_val,
|
||||
uint16_t dbrec_val,
|
||||
uint8_t opcode) {
|
||||
uint64_t opcode64 = opcode;
|
||||
opcode64 = opcode64 << 24 & 0x000000FFFF000000;
|
||||
|
||||
uint64_t dbrec = dbrec_val << 8;
|
||||
dbrec = dbrec & 0x0000000000FFFF00;
|
||||
|
||||
uint64_t val = *db_val;
|
||||
val = val & 0xFFFFFFFFFF0000FF;
|
||||
|
||||
*db_val = val | dbrec | opcode64;
|
||||
}
|
||||
|
||||
template <class level>
|
||||
__device__ void QueuePair::quiet_internal() {
|
||||
/*
|
||||
* If there are nothing to quiet, just return early.
|
||||
*/
|
||||
uint32_t quiet_val = quiet_counter;
|
||||
if (!quiet_val) {
|
||||
return;
|
||||
}
|
||||
|
||||
profiler.incStat(QUIET_COUNT);
|
||||
uint64_t start = profiler.startTimer();
|
||||
|
||||
/*
|
||||
* Generate a pointer to the completion queue entry.
|
||||
*/
|
||||
cq_consumer_counter = cq_consumer_counter + quiet_val - 1;
|
||||
uint32_t indx = (cq_consumer_counter % cq_size);
|
||||
mlx5_cqe64 *cqe_entry = ¤t_cq_q[indx];
|
||||
|
||||
/*
|
||||
* Access the op_own value in the completion queue entry.
|
||||
*/
|
||||
int val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
|
||||
uint8_t val_op_own = val_ld;
|
||||
|
||||
/*
|
||||
* If the completion queue entry is not valid, wait for it to become so.
|
||||
*/
|
||||
while (!((val_op_own & 0x1) == ((cq_consumer_counter >> cq_log_size) & 1)) ||
|
||||
((val_op_own) >> 4) == 0xF) {
|
||||
val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
|
||||
val_op_own = val_ld;
|
||||
}
|
||||
|
||||
/*
|
||||
* Grab the opcode from the op_own field and report if it is an error.
|
||||
*/
|
||||
uint8_t opcode = val_op_own >> 4;
|
||||
if (opcode != 0) {
|
||||
uint8_t syndrome = get_cq_error_syndrome(cqe_entry);
|
||||
mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
|
||||
GPU_DPRINTF("QUIET ERROR: signature %d opcode_qpn %llx wqe_cnt %llx \n",
|
||||
syndrome, cqe_err->s_wqe_opcode_qpn, cqe_err->wqe_counter);
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrement the quiet count by the amount determined at the beginning
|
||||
* of this method.
|
||||
*
|
||||
* bpotter - There are two areas of concern in this method for me.
|
||||
* 1) In multithreaded builds, we may need to make this method a critical
|
||||
* section to prevent data races on these variables.
|
||||
*
|
||||
* 2) Is there a data race in the API if a one remote process calls quiet
|
||||
* while another process continues adding events? Is it ever possible for
|
||||
* a quiet to complete, but the quiet_counter decrement here is not set
|
||||
* to zero?
|
||||
*/
|
||||
level L;
|
||||
L.decQuietCounter(&quiet_counter, quiet_val);
|
||||
|
||||
profiler.endTimer(start, POLL_CQ);
|
||||
start = profiler.startTimer();
|
||||
|
||||
/*
|
||||
* Increment the trailing index counter which tracks our spot in the
|
||||
* completion queue.
|
||||
*/
|
||||
cq_consumer_counter++;
|
||||
swap_endian_store(const_cast<uint32_t *>(dbrec_cq), cq_consumer_counter);
|
||||
|
||||
profiler.endTimer(start, NEXT_CQ);
|
||||
}
|
||||
|
||||
template <class level>
|
||||
__device__ void QueuePair::quiet_single() {
|
||||
level L;
|
||||
L.quiet(this);
|
||||
}
|
||||
|
||||
template <class level>
|
||||
__device__ void QueuePair::quiet_single_heavy(int pe) {
|
||||
level L;
|
||||
L.quiet_heavy(this, pe);
|
||||
}
|
||||
|
||||
template <class level, bool cqe>
|
||||
__device__ void QueuePair::update_posted_wqe_generic(
|
||||
int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp, bool ring_db,
|
||||
uint64_t atomic_ret_pos, bool zero_byte_rd) {
|
||||
uint64_t start = profiler.startTimer();
|
||||
|
||||
level L;
|
||||
L.postLock(this, pe);
|
||||
uint32_t num_wqes = connection_policy.getNumWqes(opcode);
|
||||
|
||||
// Get the index for my thread's put in the SQ.
|
||||
uint64_t my_sq_counter = L.threadAtomicAdd(&sq_counter, num_wqes);
|
||||
uint64_t my_sq_index = my_sq_counter % max_nwqe;
|
||||
|
||||
// 16-bit little endian version of the SQ index needed to build the cntrl
|
||||
// segment in the WQE.
|
||||
uint16_t le_sq_counter;
|
||||
uint16_t sq_counter_u16 = my_sq_counter;
|
||||
swap_endian_store(&le_sq_counter, sq_counter_u16);
|
||||
|
||||
bool flag = sq_overflow;
|
||||
uint32_t lkey_in_stack_frame = lkey;
|
||||
uint32_t rkey_in_stack_frame = rkey;
|
||||
uint32_t ctrl_qp_sq_in_stack_frame = ctrl_qp_sq;
|
||||
uint64_t ctrl_sig_in_stack_frame = ctrl_sig;
|
||||
|
||||
connection_policy.setRkey(&rkey_in_stack_frame, pe);
|
||||
|
||||
if (opcode == MLX5_OPCODE_RDMA_WRITE && !size) {
|
||||
rkey_in_stack_frame = hdp_rkey[pe];
|
||||
size = 4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build out all the segments required for my WQE(s) based on the
|
||||
* operation, starting at my_sq_index into the SQ. SegmentBuilder will
|
||||
* keep track of placing the segments in the correct location.
|
||||
*/
|
||||
SegmentBuilder seg_build(my_sq_index, current_sq);
|
||||
seg_build.update_cntrl_seg(opcode, le_sq_counter, ctrl_qp_sq_in_stack_frame,
|
||||
ctrl_sig_in_stack_frame, &connection_policy,
|
||||
zero_byte_rd);
|
||||
seg_build.update_connection_seg(pe, &connection_policy);
|
||||
seg_build.update_rdma_seg(raddr, rkey_in_stack_frame);
|
||||
|
||||
if (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS) {
|
||||
seg_build.update_atomic_data_seg(atomic_data, atomic_cmp);
|
||||
size = 8;
|
||||
lkey_in_stack_frame = atomic_ret.atomic_lkey;
|
||||
laddr = &atomic_ret.atomic_base_ptr[atomic_ret_pos];
|
||||
}
|
||||
|
||||
if (size <= inline_threshold && opcode == MLX5_OPCODE_RDMA_WRITE) {
|
||||
seg_build.update_inl_data_seg(laddr, size);
|
||||
} else {
|
||||
seg_build.update_data_seg(laddr, size, lkey_in_stack_frame);
|
||||
}
|
||||
|
||||
profiler.incStat(WQE_COUNT);
|
||||
profiler.endTimer(start, UPDATE_WQE);
|
||||
start = profiler.startTimer();
|
||||
|
||||
L.template finishPost<cqe>(this, ring_db, num_wqes, pe, le_sq_counter,
|
||||
opcode);
|
||||
|
||||
profiler.incStat(DB_COUNT);
|
||||
profiler.endTimer(start, RING_SQ_DB);
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
****************************** SHMEM INTERFACE *******************************
|
||||
*****************************************************************************/
|
||||
template <class level>
|
||||
__device__ void QueuePair::put_nbi(void *dest, const void *source,
|
||||
size_t nelems, int pe, bool db_ring) {
|
||||
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
|
||||
|
||||
update_posted_wqe_generic<level, false>(
|
||||
pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
|
||||
}
|
||||
|
||||
template <class level>
|
||||
__device__ void QueuePair::put_nbi_cqe(void *dest, const void *source,
|
||||
size_t nelems, int pe, bool db_ring) {
|
||||
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
|
||||
|
||||
update_posted_wqe_generic<level, true>(
|
||||
pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
|
||||
}
|
||||
|
||||
template <class level>
|
||||
__device__ void QueuePair::get_nbi(void *dest, const void *source,
|
||||
size_t nelems, int pe, bool db_ring) {
|
||||
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
|
||||
|
||||
update_posted_wqe_generic<level, false>(
|
||||
pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
|
||||
}
|
||||
|
||||
template <class level>
|
||||
__device__ void QueuePair::get_nbi_cqe(void *dest, const void *source,
|
||||
size_t nelems, int pe, bool db_ring) {
|
||||
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
|
||||
|
||||
update_posted_wqe_generic<level, true>(
|
||||
pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
|
||||
}
|
||||
|
||||
template <class level>
|
||||
__device__ void QueuePair::zero_b_rd(int pe) {
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t *>(base_heap[pe]);
|
||||
|
||||
update_posted_wqe_generic<level, true>(pe, 0, nullptr, dst,
|
||||
MLX5_OPCODE_RDMA_READ, 0, 0, true, 0,
|
||||
true); // enable 0_byte read op
|
||||
}
|
||||
|
||||
__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t value,
|
||||
int64_t cond, int pe, bool db_ring,
|
||||
uint8_t atomic_op) {
|
||||
THREAD TH;
|
||||
uint64_t pos = TH.threadAtomicAdd(
|
||||
reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
|
||||
&atomic_ret.atomic_counter));
|
||||
|
||||
pos = pos % max_nb_atomic;
|
||||
|
||||
int64_t *atomic_base_ptr =
|
||||
reinterpret_cast<int64_t *>(atomic_ret.atomic_base_ptr);
|
||||
|
||||
int64_t *load_address = &atomic_base_ptr[pos];
|
||||
|
||||
*load_address = -100;
|
||||
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
|
||||
|
||||
update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
|
||||
atomic_op, value, cond, db_ring, pos);
|
||||
quiet_single<THREAD>();
|
||||
|
||||
while (uncached_load(load_address) == -100) {
|
||||
}
|
||||
|
||||
int64_t ret = *load_address;
|
||||
|
||||
__threadfence();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ void QueuePair::atomic_nofetch(void *dest, int64_t value,
|
||||
int64_t cond, int pe, bool db_ring,
|
||||
uint8_t atomic_op) {
|
||||
THREAD TH;
|
||||
uint64_t pos = TH.threadAtomicAdd(
|
||||
reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
|
||||
&atomic_ret.atomic_counter));
|
||||
pos = pos % max_nb_atomic;
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
|
||||
|
||||
update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
|
||||
atomic_op, value, cond, db_ring, pos);
|
||||
|
||||
quiet_single<THREAD>();
|
||||
}
|
||||
|
||||
__device__ void QueuePair::fence(int pe) {
|
||||
// TODO(khamidou): should this be replaced by a zero_byte_rd?
|
||||
// FIXME: the relaxed ordering requires an intervening read to order
|
||||
// prior operations.
|
||||
auto remote_hdp_uncast = hdp_address[pe];
|
||||
uintptr_t *remote_hdp = reinterpret_cast<uintptr_t *>(remote_hdp_uncast);
|
||||
update_posted_wqe_generic<THREAD, true>(
|
||||
pe, 0, nullptr, remote_hdp, MLX5_OPCODE_RDMA_WRITE, 0, 0, true, 0);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::waitCQSpace(int num_msgs) {
|
||||
// We cannot post more outstanding requests than the completion queue
|
||||
// size. Force a quiet if we are out of space.
|
||||
if ((quiet_counter + num_msgs) >= cq_size) {
|
||||
GPU_DPRINTF(
|
||||
"*** inside post_cq forcing flush: outstanding %d "
|
||||
"adding %d cq_size %d\n",
|
||||
quiet_counter, num_msgs, cq_size);
|
||||
|
||||
// TODO(khamidou): More targeted flush would be better here.
|
||||
quiet_single<THREAD>();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void QueuePair::waitSQSpace(int num_msgs) {
|
||||
// We cannot post more outstanding requests than the Send queue
|
||||
// size. Force a quiet if we are out of space.
|
||||
local_sq_cnt += num_msgs;
|
||||
int div = local_sq_cnt / max_nwqe;
|
||||
|
||||
if (div > 0) {
|
||||
GPU_DPRINTF(
|
||||
"*** inside waitSQSpace forcing flush to overrun the SQ"
|
||||
" sq_counter %d adding %d quiet_conter %d \n",
|
||||
sq_counter, num_msgs, max_nwqe, quiet_counter);
|
||||
|
||||
quiet_single<THREAD>();
|
||||
local_sq_cnt = local_sq_cnt % max_nwqe;
|
||||
}
|
||||
}
|
||||
|
||||
void QueuePair::setDBval(uint64_t val) { db_val = val; }
|
||||
|
||||
#define THREAD_LEVEL_GEN(T) \
|
||||
template __device__ void QueuePair::put_nbi<T>( \
|
||||
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
|
||||
template __device__ void QueuePair::put_nbi_cqe<T>( \
|
||||
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
|
||||
template __device__ void QueuePair::get_nbi<T>( \
|
||||
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
|
||||
template __device__ void QueuePair::get_nbi_cqe<T>( \
|
||||
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
|
||||
template __device__ void QueuePair::zero_b_rd<T>(int pe); \
|
||||
template __device__ void QueuePair::quiet_single<T>(); \
|
||||
template __device__ void QueuePair::quiet_single_heavy<T>(int pe); \
|
||||
template __device__ void QueuePair::quiet_internal<T>();
|
||||
|
||||
THREAD_LEVEL_GEN(THREAD)
|
||||
THREAD_LEVEL_GEN(WG)
|
||||
THREAD_LEVEL_GEN(WAVE)
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,431 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
|
||||
|
||||
/**
|
||||
* @file queue_pair.hpp
|
||||
*
|
||||
* @section DESCRIPTION
|
||||
* An IB QueuePair (SQ and CQ) that the device can use to perform network
|
||||
* operations. Most important rocSHMEM operations are performed by this
|
||||
* class.
|
||||
*/
|
||||
|
||||
#include <infiniband/mlx5dv.h>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../atomic_return.hpp"
|
||||
#include "connection_policy.hpp"
|
||||
#include "thread_policy.hpp"
|
||||
#include "../hdp_policy.hpp"
|
||||
#include "../stats.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class GPUIBBackend;
|
||||
|
||||
enum gpu_ib_stats {
|
||||
RING_SQ_DB = 0,
|
||||
UPDATE_WQE,
|
||||
POLL_CQ,
|
||||
NEXT_CQ,
|
||||
QUIET_COUNT,
|
||||
DB_COUNT,
|
||||
WQE_COUNT,
|
||||
MEM_WAIT,
|
||||
INIT,
|
||||
FINALIZE,
|
||||
GPU_IB_NUM_STATS
|
||||
};
|
||||
|
||||
typedef union db_reg {
|
||||
uint64_t *ptr;
|
||||
uintptr_t uint;
|
||||
} db_reg_t;
|
||||
|
||||
class QueuePair {
|
||||
public:
|
||||
/**
|
||||
* @brief Constructor.
|
||||
*
|
||||
* @param[in] backend Backend needed for member access.
|
||||
*/
|
||||
explicit QueuePair(GPUIBBackend *backend);
|
||||
|
||||
/**
|
||||
* @brief Destructor.
|
||||
*/
|
||||
__device__ ~QueuePair();
|
||||
|
||||
/**
|
||||
* @brief Inspect completion queue and possibly wait for free space.
|
||||
*
|
||||
* @param[in] num_msgs Number of entries needing space in completion queue.
|
||||
*/
|
||||
__device__ void waitCQSpace(int num_msgs);
|
||||
|
||||
/**
|
||||
* @brief Inspect send queue and possibly wait for free space.
|
||||
*
|
||||
* @param[in] num_msgs Number of entries needing space in send queue.
|
||||
*/
|
||||
__device__ void waitSQSpace(int num_msgs);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue a non-blocking put work queue entry (wqe).
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*
|
||||
* @param[in] dest Destination address for data transmission.
|
||||
* @param[in] source Source address for data transmission.
|
||||
* @param[in] nelems Size in bytes of data transmission.
|
||||
* @param[in] pe Destination processing element of data transmission.
|
||||
* @param[in] db_ring Denotes whether send queue door bell should be rung.
|
||||
*/
|
||||
template <class level>
|
||||
__device__ void put_nbi(void *dest, const void *source, size_t nelems, int pe,
|
||||
bool db_ring);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue a non-blocking put work queue entry (wqe).
|
||||
*
|
||||
* @note This variant differs from put_nbi by requesting that a completion
|
||||
* queue entry is generated in the completion queue.
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*
|
||||
* @param[in] dest Destination address for data transmission.
|
||||
* @param[in] source Source address for data transmission.
|
||||
* @param[in] nelems Size in bytes of data transmission.
|
||||
* @param[in] pe Destination processing element of data transmission.
|
||||
* @param[in] db_ring Denotes whether send queue door bell should be rung.
|
||||
*/
|
||||
template <class level>
|
||||
__device__ void put_nbi_cqe(void *dest, const void *source, size_t nelems,
|
||||
int pe, bool db_ring);
|
||||
|
||||
/**
|
||||
* @brief Consume a completion queue entry from this queue pair's
|
||||
* completion queue.
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*/
|
||||
template <class level>
|
||||
__device__ void quiet_single();
|
||||
|
||||
/**
|
||||
* @brief Send a zero-byte read to enforce ordering and then consume
|
||||
* a completion queue entry from this queue pair's completion queue.
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*
|
||||
* @param[in] pe Processing element id to send the zero_b_rd.
|
||||
*/
|
||||
template <class level>
|
||||
__device__ void quiet_single_heavy(int pe);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue a HDP flush work queue entry on the remote PE.
|
||||
*
|
||||
* @param[in] pe Processing element id to send the HDP flush operation.
|
||||
*
|
||||
* TODO(@khamidou): does this require a zero_b_rd to enforce write ordering
|
||||
* The HDP flush is itself a write. Could this write be reordered with
|
||||
* respect to other write on the network and arrive out-of-order?
|
||||
*/
|
||||
__device__ void fence(int pe);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue a non-blocking get work queue entry (wqe).
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*
|
||||
* @param[in] dest Destination address for data transmission.
|
||||
* @param[in] source Source address for data transmission.
|
||||
* @param[in] nelems Size in bytes of data transmission.
|
||||
* @param[in] pe Destination processing element of data transmission.
|
||||
* @param[in] db_ring Denotes whether send queue door bell should be rung.
|
||||
*/
|
||||
template <class level>
|
||||
__device__ void get_nbi(void *dest, const void *source, size_t nelems, int pe,
|
||||
bool db_ring);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue a non-blocking get work queue entry (wqe).
|
||||
*
|
||||
* @note This variant differs from get_nbi by requesting that a completion
|
||||
* queue entry is generated in the completion queue.
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*
|
||||
* @param[in] dest Destination address for data transmission.
|
||||
* @param[in] source Source address for data transmission.
|
||||
* @param[in] nelems Size in bytes of data transmission.
|
||||
* @param[in] pe Destination processing element of data transmission.
|
||||
* @param[in] db_ring Denotes whether send queue door bell should be rung.
|
||||
*/
|
||||
template <class level>
|
||||
__device__ void get_nbi_cqe(void *dest, const void *source, size_t nelems,
|
||||
int pe, bool db_ring);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue a zero-byte read to enforce write ordering.
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*
|
||||
* @param[in] pe Processing element id to send the zero_b_rd.
|
||||
*/
|
||||
template <class level>
|
||||
__device__ void zero_b_rd(int pe);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue an atomic fetch work queue entry (wqe).
|
||||
*
|
||||
* @param[in] dest Destination address for data transmission.
|
||||
* @param[in] value Data value for the atomic operation.
|
||||
* @param[in] cond Used in atomic comparisons.
|
||||
* @param[in] pe Destination processing element of data transmission.
|
||||
* @param[in] db_ring Denotes whether send queue door bell should be rung.
|
||||
* @param[in] atomic_op The atomic operation to perform.
|
||||
*
|
||||
* @return An atomic value
|
||||
*/
|
||||
__device__ int64_t atomic_fetch(void *dest, int64_t value, int64_t cond,
|
||||
int pe, bool db_ring, uint8_t atomic_op);
|
||||
|
||||
/**
|
||||
* @brief Create and enqueue an atomic fetch work queue entry (wqe).
|
||||
*
|
||||
* @param[in] dest Destination address for data transmission.
|
||||
* @param[in] value Data value for the atomic operation.
|
||||
* @param[in] cond Used in atomic comparisons.
|
||||
* @param[in] pe Destination processing element of data transmission.
|
||||
* @param[in] db_ring Denotes whether send queue door bell should be rung.
|
||||
* @param[in] atomic_op The atomic operation to perform.
|
||||
*/
|
||||
__device__ void atomic_nofetch(void *dest, int64_t value, int64_t cond,
|
||||
int pe, bool db_ring, uint8_t atomic_op);
|
||||
|
||||
/**
|
||||
* @brief Helper method to set the doorbell's value.
|
||||
*
|
||||
* @param[in] val Desired value for the doorbell.
|
||||
*/
|
||||
void setDBval(uint64_t val);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* @brief Helper method to build work requests for the send queue.
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
* @tparam cqe Flag to optionally generate cqes.
|
||||
*
|
||||
* @param[in] pe Destination processing element of data transmission.
|
||||
* @param[in] size Size in bytes of data transmission.
|
||||
* @param[in] laddr Local address.
|
||||
* @param[in] raddr Remote address.
|
||||
* @param[in] opcode Operation to be performed.
|
||||
* @param[in] atomic_data An atomic data value to be used.
|
||||
* @param[in] atomic_cmp An atomic comparison operation to be performed.
|
||||
* @param[in] ring_db Boolean denoting if doorbell should be rung.
|
||||
* @param[in] atomic_ret_pos Index into atomic return structure.
|
||||
* @param[in] zero_byte_rd Boolean if zero byte read should be used.
|
||||
*/
|
||||
template <class level, bool cqe>
|
||||
__device__ __attribute__((noinline)) void update_posted_wqe_generic(
|
||||
int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp, bool ring_db,
|
||||
uint64_t atomic_ret_pos, bool zero_byte_rd = false);
|
||||
|
||||
/**
|
||||
* @brief Helper method to drain completion queue entries.
|
||||
*
|
||||
* @tparam level Implements specific behaviors for thread, warp, block access.
|
||||
*
|
||||
*/
|
||||
template <class level>
|
||||
__device__ __attribute__((noinline)) void quiet_internal();
|
||||
|
||||
/**
|
||||
* @brief Helper method to compute doorbell value opcode which is used to
|
||||
* ring the doorbell.
|
||||
*
|
||||
* @param[in,out] db_val
|
||||
* @param[in] dbrec_val
|
||||
* @param[in] opcode
|
||||
*/
|
||||
__device__ void compute_db_val_opcode(uint64_t *db_val, uint16_t dbrec_val,
|
||||
uint8_t opcode);
|
||||
|
||||
/**
|
||||
* @brief Helper method that sets the field in a work queue entry to
|
||||
* generate a completion entry in the completion queue.
|
||||
*
|
||||
* @param num_wqes Number of work entries this completion entry represents.
|
||||
*/
|
||||
__device__ void set_completion_flag_on_wqe(int num_wqes);
|
||||
|
||||
/**
|
||||
* @brief Helper method to update fields for the work queue entry.
|
||||
*
|
||||
* @tparam cqe Flag to optionally generate cqes.
|
||||
*
|
||||
* @note Single variant is meant to be callable by a block leader.
|
||||
*/
|
||||
template <bool cqe>
|
||||
__device__ void update_wqe_ce_single(int num_wqes);
|
||||
|
||||
/**
|
||||
* @brief Helper method to update fields for the work queue entry.
|
||||
*
|
||||
* @tparam cqe Flag to optionally generate cqes.
|
||||
*
|
||||
* @note Thread variant is meant to be callable by multiple threads.
|
||||
*/
|
||||
template <bool cqe>
|
||||
__device__ void update_wqe_ce_thread(int num_wqes);
|
||||
|
||||
/**
|
||||
* @brief Helper method to ring the doorbell
|
||||
*
|
||||
* @param[in] db_val Doorbell value is written by method.
|
||||
*/
|
||||
__device__ void ring_doorbell(uint64_t db_val);
|
||||
|
||||
/**
|
||||
* @brief Helper method to extract syndrome field from cqe.
|
||||
*
|
||||
* @param[in] cq_entry Completion queue entry.
|
||||
*/
|
||||
__device__ uint8_t get_cq_error_syndrome(mlx5_cqe64 *cq_entry);
|
||||
|
||||
private:
|
||||
const int inline_threshold{8};
|
||||
|
||||
/* TODO(bpotter): Most of these should be private/protected */
|
||||
public:
|
||||
#ifdef PROFILE
|
||||
typedef Stats<GPU_IB_NUM_STATS> GPUIBStats;
|
||||
#else
|
||||
typedef NullStats<GPU_IB_NUM_STATS> GPUIBStats;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Pointer to the hardware doorbell register for the QP.
|
||||
*/
|
||||
db_reg_t db{};
|
||||
|
||||
/*
|
||||
* Base pointer of this QP's SQ
|
||||
* TODO(bpotter): Use the correct struct type for this.
|
||||
*/
|
||||
uint64_t *current_sq{nullptr};
|
||||
uint64_t *current_sq_H{nullptr};
|
||||
|
||||
/*
|
||||
* Base pointer of this QP's CQ
|
||||
*/
|
||||
mlx5_cqe64 *current_cq_q{nullptr};
|
||||
mlx5_cqe64 *current_cq_q_H{nullptr};
|
||||
|
||||
/*
|
||||
* Pointer to the doorbell record for this SQ.
|
||||
*/
|
||||
volatile uint32_t *dbrec_send{nullptr};
|
||||
|
||||
/*
|
||||
* Pointer to the doorbell record for the CQ.
|
||||
*/
|
||||
volatile uint32_t *dbrec_cq{nullptr};
|
||||
|
||||
uint32_t *hdp_rkey{nullptr};
|
||||
|
||||
uintptr_t *hdp_address{nullptr};
|
||||
|
||||
HdpPolicy *hdp_policy{};
|
||||
|
||||
atomic_ret_t atomic_ret{};
|
||||
|
||||
ThreadImpl threadImpl{};
|
||||
|
||||
ConnectionImpl connection_policy;
|
||||
|
||||
char *const *base_heap{nullptr};
|
||||
/*
|
||||
* Current index into the SQ (non-modulo size).
|
||||
*/
|
||||
uint32_t sq_counter{0};
|
||||
uint32_t local_sq_cnt{0};
|
||||
|
||||
/*
|
||||
* Number of outstanding messages on this QP that need to be completed
|
||||
* during a quiet operation.
|
||||
*/
|
||||
uint32_t quiet_counter{0};
|
||||
|
||||
int num_cqs{0};
|
||||
|
||||
/*
|
||||
* Current index into the SQ (non-module size).
|
||||
*/
|
||||
uint32_t cq_consumer_counter{0};
|
||||
|
||||
uint16_t cq_log_size{0};
|
||||
|
||||
uint16_t cq_size{0};
|
||||
|
||||
uint32_t ctrl_qp_sq{0};
|
||||
|
||||
uint64_t ctrl_sig{0};
|
||||
|
||||
uint32_t rkey{0};
|
||||
|
||||
uint32_t lkey{0};
|
||||
|
||||
GPUIBStats profiler{};
|
||||
|
||||
uint16_t max_nwqe{0};
|
||||
|
||||
bool sq_overflow{0};
|
||||
|
||||
uint64_t db_val{};
|
||||
/*
|
||||
* Pointer to the QP in global memory that this QP is copied from. When
|
||||
* this QP is destroyed, the dynamic (indicies, stats, etc) in the
|
||||
* global_qp are updated.
|
||||
*/
|
||||
QueuePair *global_qp{nullptr};
|
||||
|
||||
friend SingleThreadImpl;
|
||||
friend MultiThreadImpl;
|
||||
friend THREAD;
|
||||
friend WG;
|
||||
friend WAVE;
|
||||
friend RCConnectionImpl;
|
||||
friend DCConnectionImpl;
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
|
||||
@@ -1,201 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "reliable_connection.hpp"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "backend_ib.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
ReliableConnection::ReliableConnection(GPUIBBackend* b) : Connection(b, 0) {}
|
||||
|
||||
ReliableConnection::~ReliableConnection() {}
|
||||
|
||||
Connection::InitQPState ReliableConnection::initqp(uint8_t port) {
|
||||
InitQPState init{};
|
||||
|
||||
init.exp_qp_attr.qp_access_flags =
|
||||
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
|
||||
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
|
||||
init.exp_qp_attr.port_num = port;
|
||||
|
||||
init.exp_attr_mask |= IBV_QP_ACCESS_FLAGS;
|
||||
|
||||
return init;
|
||||
}
|
||||
|
||||
Connection::RtrState ReliableConnection::rtr(dest_info_t* dest, uint8_t port) {
|
||||
RtrState rtr{};
|
||||
|
||||
rtr.exp_qp_attr.dest_qp_num = dest->qpn;
|
||||
rtr.exp_qp_attr.rq_psn = dest->psn;
|
||||
rtr.exp_qp_attr.ah_attr.port_num = port;
|
||||
if (ib_state->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND) {
|
||||
rtr.exp_qp_attr.ah_attr.dlid = dest->lid;
|
||||
} else {
|
||||
rtr.exp_qp_attr.ah_attr.is_global = 1;
|
||||
rtr.exp_qp_attr.ah_attr.grh.dgid = dest->gid;
|
||||
rtr.exp_qp_attr.ah_attr.grh.sgid_index = 0;
|
||||
rtr.exp_qp_attr.ah_attr.grh.hop_limit = 1;
|
||||
}
|
||||
|
||||
rtr.exp_attr_mask |= IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
|
||||
IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
|
||||
|
||||
return rtr;
|
||||
}
|
||||
|
||||
Connection::RtsState ReliableConnection::rts(dest_info_t* dest) {
|
||||
RtsState rts{};
|
||||
|
||||
rts.exp_qp_attr.sq_psn = dest->psn;
|
||||
|
||||
rts.exp_attr_mask |= IBV_QP_SQ_PSN;
|
||||
|
||||
return rts;
|
||||
}
|
||||
|
||||
ibv_qp* ReliableConnection::create_qp_0(ibv_context* context,
|
||||
ibv_qp_init_attr_ex* qp_attr) {
|
||||
return ibv_create_qp_ex(context, qp_attr);
|
||||
}
|
||||
|
||||
void ReliableConnection::create_qps_1() { }
|
||||
|
||||
void ReliableConnection::create_qps_2(int port, int my_rank,
|
||||
ibv_port_attr* ib_port_att) { }
|
||||
|
||||
void ReliableConnection::create_qps_3(int port, ibv_qp* qp, int offset,
|
||||
ibv_port_attr* ib_port_att) {
|
||||
init_qp_status(qp, port);
|
||||
|
||||
all_qp[offset].lid = ib_port_att->lid;
|
||||
all_qp[offset].qpn = qp->qp_num;
|
||||
all_qp[offset].psn = 0;
|
||||
union ibv_gid gid;
|
||||
ibv_query_gid(ib_state->context, port, 0, &gid);
|
||||
all_qp[offset].gid = gid;
|
||||
}
|
||||
|
||||
void ReliableConnection::get_remote_conn(int* remote_conn) {
|
||||
*remote_conn = backend->num_pes;
|
||||
}
|
||||
|
||||
void ReliableConnection::allocate_dynamic_members(int num_blocks) {
|
||||
all_qp.resize(backend->num_pes * num_blocks);
|
||||
}
|
||||
|
||||
void ReliableConnection::free_dynamic_members() {
|
||||
}
|
||||
|
||||
void ReliableConnection::initialize_1(int port, int num_blocks) {
|
||||
MPI_Alltoall(MPI_IN_PLACE, sizeof(dest_info_t) * num_blocks, MPI_CHAR,
|
||||
all_qp.data(), sizeof(dest_info_t) * num_blocks, MPI_CHAR,
|
||||
backend->thread_comm);
|
||||
|
||||
for (int i = 0; i < qps.size(); i++) {
|
||||
change_status_rtr(qps[i], &all_qp[i], port);
|
||||
}
|
||||
|
||||
MPI_Barrier(backend->thread_comm);
|
||||
|
||||
for (int i = 0; i < qps.size(); i++) {
|
||||
change_status_rts(qps[i], &all_qp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ReliableConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle,
|
||||
ibv_mr* mr) {
|
||||
CHECK_HIP(
|
||||
hipHostMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes));
|
||||
(*heap_rkey_handle)[backend->my_pe] = mr->rkey;
|
||||
}
|
||||
|
||||
void ReliableConnection::free_rkey_handle(uint32_t* heap_rkey_handle) {
|
||||
CHECK_HIP(hipHostFree(heap_rkey_handle));
|
||||
}
|
||||
|
||||
Connection::QPInitAttr ReliableConnection::qpattr(ibv_qp_cap cap) {
|
||||
QPInitAttr qpattr(cap);
|
||||
qpattr.attr.qp_type = IBV_QPT_RC;
|
||||
return qpattr;
|
||||
}
|
||||
|
||||
void ReliableConnection::post_dv_rc_wqe(int remote_conn) {
|
||||
mlx5_wqe_ctrl_seg* ctrl;
|
||||
mlx5_wqe_raddr_seg* rdma;
|
||||
mlx5_wqe_data_seg* data;
|
||||
|
||||
for (int i = 0; i < remote_conn; i++) {
|
||||
int num_blocks = backend->num_blocks_;
|
||||
for (int j = 0; j < num_blocks; j++) {
|
||||
int qp_index = i * num_blocks + j;
|
||||
uint64_t* ptr = get_address_sq(qp_index);
|
||||
|
||||
const uint16_t nb_post = 1; // 4 * sq_size;
|
||||
for (uint16_t index = 0; index < nb_post; index++) {
|
||||
uint8_t op_mod = 0;
|
||||
uint8_t op_code = 8;
|
||||
uint32_t qp_num = qps[qp_index]->qp_num;
|
||||
uint8_t fm_ce_se = 0;
|
||||
uint8_t ds = 3;
|
||||
ctrl = reinterpret_cast<mlx5_wqe_ctrl_seg*>(ptr);
|
||||
mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds,
|
||||
0, 0);
|
||||
ptr = ptr + 2;
|
||||
|
||||
rdma = reinterpret_cast<mlx5_wqe_raddr_seg*>(ptr);
|
||||
const auto& heap_bases = backend->heap.get_heap_bases();
|
||||
auto temp = heap_bases[(backend->my_pe + 1) % 2];
|
||||
uint64_t r_address = reinterpret_cast<uint64_t>(temp);
|
||||
uint32_t rkey = backend->networkImpl.heap_rkey[i];
|
||||
set_rdma_seg(rdma, r_address, rkey);
|
||||
ptr = ptr + 2;
|
||||
|
||||
data = reinterpret_cast<mlx5_wqe_data_seg*>(ptr);
|
||||
uint32_t lkey = backend->networkImpl.heap_mr->lkey;
|
||||
temp = heap_bases[backend->my_pe];
|
||||
uint64_t address = reinterpret_cast<uint64_t>(temp);
|
||||
mlx5dv_set_data_seg(data, 1, lkey, address);
|
||||
ptr = ptr + 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(bpotter): remove redundancies with the other derived class
|
||||
void ReliableConnection::post_wqes() {
|
||||
int remote_conn;
|
||||
get_remote_conn(&remote_conn);
|
||||
post_dv_rc_wqe(remote_conn);
|
||||
}
|
||||
|
||||
void ReliableConnection::initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah,
|
||||
int dc_key) {}
|
||||
|
||||
int ReliableConnection::get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) {
|
||||
return pe_idx * num_qps + wg_idx;
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,84 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "connection.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class ReliableConnection : public Connection {
|
||||
public:
|
||||
explicit ReliableConnection(GPUIBBackend* backend);
|
||||
|
||||
~ReliableConnection() override;
|
||||
|
||||
void get_remote_conn(int* remote_conn) override;
|
||||
|
||||
void post_wqes() override;
|
||||
|
||||
void initialize_rkey_handle(uint32_t** heap_rkey_handle,
|
||||
ibv_mr* mr) override;
|
||||
|
||||
void free_rkey_handle(uint32_t* heap_rkey_handle) override;
|
||||
|
||||
private:
|
||||
InitQPState initqp(uint8_t port) override;
|
||||
|
||||
RtrState rtr(dest_info_t* dest, uint8_t port) override;
|
||||
|
||||
RtsState rts(dest_info_t* dest) override;
|
||||
|
||||
QPInitAttr qpattr(ibv_qp_cap cap) override;
|
||||
|
||||
void create_qps_1() override;
|
||||
|
||||
void create_qps_2(int port, int my_rank,
|
||||
ibv_port_attr* ib_port_att) override;
|
||||
|
||||
void create_qps_3(int port, ibv_qp* qp, int offset,
|
||||
ibv_port_attr* ib_port_att) override;
|
||||
|
||||
ibv_qp* create_qp_0(ibv_context* context,
|
||||
ibv_qp_init_attr_ex* qp_attr) override;
|
||||
|
||||
void allocate_dynamic_members(int num_wg) override;
|
||||
|
||||
void free_dynamic_members() override;
|
||||
|
||||
void initialize_1(int port, int num_wg) override;
|
||||
|
||||
void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override;
|
||||
|
||||
int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) override;
|
||||
|
||||
std::vector<dest_info_t> all_qp;
|
||||
|
||||
void post_dv_rc_wqe(int remote_conn);
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
|
||||
@@ -1,138 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "segment_builder.hpp"
|
||||
|
||||
#include "../util.hpp"
|
||||
#include "endian.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
__device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) {
|
||||
mlx5_segment *base_ptr = static_cast<mlx5_segment *>(base);
|
||||
size_t segment_offset = SEGMENTS_PER_WQE * wqe_idx;
|
||||
seg_ptr = &base_ptr[segment_offset];
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_cntrl_seg(
|
||||
uint8_t opcode, uint16_t wqe_idx, uint32_t ctrl_qp_sq, uint64_t ctrl_sig,
|
||||
ConnectionImpl *connection_policy, bool zero_byte_rd) {
|
||||
mlx5_wqe_ctrl_seg ctrl_seg;
|
||||
|
||||
ctrl_seg.opmod_idx_opcode = (opcode << 24) | (wqe_idx << 8);
|
||||
|
||||
uint32_t DS = 2;
|
||||
if (zero_byte_rd == false) {
|
||||
DS = (opcode == MLX5_OPCODE_RDMA_WRITE || opcode == MLX5_OPCODE_RDMA_READ)
|
||||
? 3
|
||||
: 4;
|
||||
}
|
||||
|
||||
DS += connection_policy->wqeCntrlOffset();
|
||||
|
||||
ctrl_seg.qpn_ds = (DS << 24) | ctrl_qp_sq;
|
||||
|
||||
ctrl_seg.signature = ctrl_sig;
|
||||
|
||||
ctrl_seg.fm_ce_se = ctrl_sig >> 24;
|
||||
|
||||
ctrl_seg.imm = ctrl_sig >> 32;
|
||||
|
||||
memcpy(&seg_ptr->ctrl_seg, &ctrl_seg, sizeof(mlx5_wqe_ctrl_seg));
|
||||
|
||||
seg_ptr++;
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_atomic_data_seg(uint64_t atomic_data,
|
||||
uint64_t atomic_cmp) {
|
||||
mlx5_wqe_atomic_seg atomic_seg;
|
||||
|
||||
swap_endian_store(reinterpret_cast<uint64_t *>(&atomic_seg.swap_add),
|
||||
atomic_data);
|
||||
|
||||
swap_endian_store(reinterpret_cast<uint64_t *>(&atomic_seg.compare),
|
||||
atomic_cmp);
|
||||
|
||||
memcpy(&seg_ptr->atomic_seg, &atomic_seg, sizeof(mlx5_wqe_atomic_seg));
|
||||
seg_ptr++;
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_rdma_seg(uintptr_t *raddr,
|
||||
uint32_t rkey) {
|
||||
mlx5_wqe_raddr_seg raddr_seg;
|
||||
|
||||
raddr_seg.rkey = rkey;
|
||||
|
||||
swap_endian_store(reinterpret_cast<uint64_t *>(&raddr_seg.raddr),
|
||||
reinterpret_cast<uint64_t>(raddr));
|
||||
|
||||
memcpy(&seg_ptr->raddr_seg, &raddr_seg, sizeof(mlx5_wqe_raddr_seg));
|
||||
seg_ptr++;
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_data_seg(uintptr_t *laddr, int32_t size,
|
||||
uint32_t lkey) {
|
||||
if (laddr == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
mlx5_wqe_data_seg data_seg;
|
||||
data_seg.lkey = lkey;
|
||||
|
||||
swap_endian_store(&data_seg.byte_count, size & 0x7FFFFFFFU);
|
||||
swap_endian_store(reinterpret_cast<uint64_t *>(&data_seg.addr),
|
||||
reinterpret_cast<uint64_t>(laddr));
|
||||
|
||||
memcpy(&seg_ptr->data_seg, &data_seg, sizeof(mlx5_wqe_data_seg));
|
||||
seg_ptr++;
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr,
|
||||
int32_t size) {
|
||||
mlx5_wqe_inl_data_seg inl_data_seg;
|
||||
|
||||
swap_endian_store(&inl_data_seg.byte_count, (size & 0x3FF) | 0x80000000);
|
||||
|
||||
// Assume fence HDP flush
|
||||
// TODO(khamidou): Rework fence interface to avoid this
|
||||
size_t field_size{sizeof(mlx5_wqe_inl_data_seg)};
|
||||
if (!laddr) {
|
||||
uint8_t flush_val = 1;
|
||||
memcpy(&inl_data_seg + 1, &flush_val, sizeof(flush_val));
|
||||
field_size += sizeof(flush_val);
|
||||
} else {
|
||||
memcpy(&inl_data_seg + 1, laddr, size);
|
||||
field_size += size;
|
||||
}
|
||||
|
||||
memcpy(&seg_ptr->inl_data_seg, &inl_data_seg, field_size);
|
||||
seg_ptr++;
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_connection_seg(
|
||||
int pe, ConnectionImpl *conn_policy) {
|
||||
if (conn_policy->updateConnectionSegmentImpl(&seg_ptr->base_av, pe)) {
|
||||
seg_ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,64 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
|
||||
|
||||
#include <infiniband/mlx5dv.h>
|
||||
|
||||
#include "connection_policy.hpp"
|
||||
#include "infiniband_structs.hpp"
|
||||
#include "../util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class SegmentBuilder {
|
||||
public:
|
||||
__device__ SegmentBuilder(uint64_t wqe_idx, void *base);
|
||||
|
||||
__device__ void update_cntrl_seg(uint8_t opcode, uint16_t wqe_idx,
|
||||
uint32_t ctrl_qp_sq, uint64_t ctrl_sig,
|
||||
ConnectionImpl *connection_policy,
|
||||
bool zero_byte_rd);
|
||||
|
||||
__device__ void update_connection_seg(int pe,
|
||||
ConnectionImpl *connection_policy);
|
||||
|
||||
__device__ void update_atomic_data_seg(uint64_t atomic_data,
|
||||
uint64_t atomic_cmp);
|
||||
|
||||
__device__ void update_rdma_seg(uintptr_t *raddr, uint32_t rkey);
|
||||
|
||||
__device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size);
|
||||
|
||||
__device__ void update_data_seg(uintptr_t *laddr, int32_t size,
|
||||
uint32_t lkey);
|
||||
|
||||
private:
|
||||
const int SEGMENTS_PER_WQE = 4;
|
||||
|
||||
mlx5_segment *seg_ptr;
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
|
||||
@@ -1,358 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "thread_policy.hpp"
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "queue_pair.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
__device__ void SingleThreadImpl::quiet(QueuePair *handle) {
|
||||
handle->quiet_internal<THREAD>();
|
||||
}
|
||||
|
||||
__device__ void SingleThreadImpl::quiet_heavy(QueuePair *handle, int pe) {
|
||||
handle->zero_b_rd<THREAD>(pe);
|
||||
handle->quiet_internal<THREAD>();
|
||||
}
|
||||
|
||||
__device__ void MultiThreadImpl::quiet(QueuePair *handle) {
|
||||
int thread_id = get_flat_block_id();
|
||||
/*
|
||||
* Each WF selects one thread to perform the quiet. Only one thread
|
||||
* per WG is allowed to do a quiet at once to avoid races with the CQ.
|
||||
*/
|
||||
if (thread_id % WF_SIZE == lowerID()) {
|
||||
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
|
||||
}
|
||||
handle->quiet_internal<THREAD>();
|
||||
__threadfence();
|
||||
handle->threadImpl.cq_lock = 0;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void MultiThreadImpl::quiet_heavy(QueuePair *handle, int pe) {
|
||||
int thread_id = get_flat_block_id();
|
||||
/*
|
||||
* Each WF selects one thread to perform the quiet. Only one thread
|
||||
* per WG is allowed to do a quiet at once to avoid races with the CQ.
|
||||
*/
|
||||
if (thread_id % WF_SIZE == lowerID()) {
|
||||
// zero_byte read
|
||||
handle->zero_b_rd<THREAD>(pe);
|
||||
|
||||
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
|
||||
}
|
||||
handle->quiet_internal<THREAD>();
|
||||
__threadfence();
|
||||
handle->threadImpl.cq_lock = 0;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void WG::quiet(QueuePair *handle) { handle->quiet_internal<WG>(); }
|
||||
|
||||
__device__ void WG::quiet_heavy(QueuePair *handle, int pe) {
|
||||
handle->zero_b_rd<THREAD>(pe);
|
||||
handle->quiet_internal<WG>();
|
||||
}
|
||||
|
||||
__device__ void WAVE::quiet(QueuePair *handle) {
|
||||
int thread_id = get_flat_block_id();
|
||||
/*
|
||||
* Each WF selects one thread to perform the quiet. Only one thread
|
||||
* per WG is allowed to do a quiet at once to avoid races with the CQ.
|
||||
*/
|
||||
|
||||
if (thread_id % WF_SIZE == 0) {
|
||||
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
|
||||
}
|
||||
handle->quiet_internal<WAVE>();
|
||||
__threadfence();
|
||||
handle->threadImpl.cq_lock = 0;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void WAVE::quiet_heavy(QueuePair *handle, int pe) {
|
||||
int thread_id = get_flat_block_id();
|
||||
/*
|
||||
* Each WF selects one thread to perform the quiet. Only one thread
|
||||
* per WG is allowed to do a quiet at once to avoid races with the CQ.
|
||||
*/
|
||||
if (thread_id % WF_SIZE == 0) {
|
||||
// post a zero-byte read
|
||||
handle->zero_b_rd<THREAD>(pe);
|
||||
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
|
||||
}
|
||||
handle->quiet_internal<WAVE>();
|
||||
__threadfence();
|
||||
handle->threadImpl.cq_lock = 0;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void SingleThreadImpl::decQuietCounter(uint32_t *quiet_counter,
|
||||
int num) {
|
||||
*quiet_counter -= num;
|
||||
}
|
||||
|
||||
__device__ void MultiThreadImpl::decQuietCounter(uint32_t *quiet_counter,
|
||||
int num) {
|
||||
atomicSub(quiet_counter, num);
|
||||
}
|
||||
|
||||
__device__ void WG::decQuietCounter(uint32_t *quiet_counter, int num) {
|
||||
*quiet_counter -= num;
|
||||
}
|
||||
|
||||
__device__ void WAVE::decQuietCounter(uint32_t *quiet_counter, int num) {
|
||||
*quiet_counter -= num;
|
||||
}
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void SingleThreadImpl::finishPost(QueuePair *handle, bool ring_db,
|
||||
int num_wqes, int pe,
|
||||
uint16_t le_sq_counter,
|
||||
uint8_t opcode) {
|
||||
if (ring_db) {
|
||||
uint64_t db_val = handle->db_val;
|
||||
handle->compute_db_val_opcode(&db_val, le_sq_counter, opcode);
|
||||
handle->update_wqe_ce_single<cqe>(num_wqes);
|
||||
handle->ring_doorbell(db_val);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void MultiThreadImpl::finishPost(QueuePair *handle, bool ring_db,
|
||||
int num_wqes, int pe,
|
||||
uint16_t le_sq_counter,
|
||||
uint8_t opcode) {
|
||||
/*
|
||||
* For RC, we can't allow a wave to have different PEs in it, else the
|
||||
* doorbell ringing logic will not work. This little for loop forces
|
||||
* control flow divergence based on the PE. It works well for small
|
||||
* numbers of PEs, but we might want a different solution for large
|
||||
* numbers.
|
||||
*/
|
||||
if (handle->connection_policy.forcePostDivergence()) {
|
||||
for (int i = 0; i < handle->num_cqs; i++) {
|
||||
if (i != pe) {
|
||||
continue;
|
||||
}
|
||||
|
||||
finishPost_internal<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
|
||||
opcode);
|
||||
}
|
||||
} else {
|
||||
finishPost_internal<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
|
||||
opcode);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void MultiThreadImpl::finishPost_internal(QueuePair *handle,
|
||||
bool ring_db, int num_wqes,
|
||||
int pe,
|
||||
uint16_t le_sq_counter,
|
||||
uint8_t opcode) {
|
||||
/*
|
||||
* Assuming here that postLock locks out all wavefronts in this WG but
|
||||
* one, and that this will select a single thread in the wavefront.
|
||||
*/
|
||||
__threadfence();
|
||||
if (get_flat_block_id() % WF_SIZE == lowerID()) {
|
||||
if (ring_db) {
|
||||
uint64_t db_val =
|
||||
handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
|
||||
handle->max_nwqe)];
|
||||
handle->update_wqe_ce_thread<true>(num_wqes);
|
||||
handle->ring_doorbell(db_val);
|
||||
}
|
||||
|
||||
handle->threadImpl.sq_lock = 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void WG::finishPost(QueuePair *handle, bool ring_db, int num_wqes,
|
||||
int pe, uint16_t le_sq_counter, uint8_t opcode) {
|
||||
if (ring_db) {
|
||||
uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
|
||||
handle->max_nwqe)];
|
||||
handle->update_wqe_ce_single<cqe>(num_wqes);
|
||||
handle->ring_doorbell(db_val);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void WAVE::finishPost(QueuePair *handle, bool ring_db, int num_wqes,
|
||||
int pe, uint16_t le_sq_counter,
|
||||
uint8_t opcode) {
|
||||
if (ring_db) {
|
||||
uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
|
||||
handle->max_nwqe)];
|
||||
handle->update_wqe_ce_thread<cqe>(num_wqes);
|
||||
handle->ring_doorbell(db_val);
|
||||
}
|
||||
handle->threadImpl.sq_lock = 0;
|
||||
}
|
||||
|
||||
__device__ void SingleThreadImpl::postLock(QueuePair *handle, int pe) {
|
||||
handle->hdp_policy->hdp_flush();
|
||||
// handle->waitCQSpace(1);
|
||||
handle->waitSQSpace(1);
|
||||
}
|
||||
|
||||
__device__ void MultiThreadImpl::postLock_internal(QueuePair *handle) {
|
||||
int thread_id = get_flat_block_id();
|
||||
int active_threads = wave_SZ();
|
||||
|
||||
if (thread_id % WF_SIZE == lowerID()) {
|
||||
handle->hdp_policy->hdp_flush();
|
||||
/*
|
||||
* Don't let more than one wave in this WG go any further or a
|
||||
* horrible variety of impossible to debug race conditions can occur.
|
||||
*/
|
||||
while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) {
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a tiny bit over-aggressive as it assumes that all of the
|
||||
* active_threads are going to the same PE when calculating whether
|
||||
* we are full.
|
||||
*/
|
||||
// handle->waitCQSpace(active_threads);
|
||||
handle->waitSQSpace(active_threads);
|
||||
}
|
||||
|
||||
/*
|
||||
* Double check we've got the same exec mask (assuming divergence after
|
||||
* the previous if.
|
||||
*/
|
||||
if (active_threads != wave_SZ()) {
|
||||
__builtin_trap();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void MultiThreadImpl::postLock(QueuePair *handle, int pe) {
|
||||
/*
|
||||
* For RC, we can't allow a wave to have different PEs in it, else the
|
||||
* doorbell ringing logic will not work. This little for loop forces
|
||||
* control flow divergence based on the PE. It works well for small
|
||||
* numbers of PEs, but we might want a different solution for large
|
||||
* numbers.
|
||||
*/
|
||||
if (handle->connection_policy.forcePostDivergence()) {
|
||||
for (int i = 0; i < handle->num_cqs; i++) {
|
||||
if (i != pe) {
|
||||
continue;
|
||||
}
|
||||
|
||||
postLock_internal(handle);
|
||||
}
|
||||
} else {
|
||||
postLock_internal(handle);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void WG::postLock(QueuePair *handle, int pe) {
|
||||
handle->hdp_policy->hdp_flush();
|
||||
// handle->waitCQSpace(1);
|
||||
handle->waitSQSpace(1);
|
||||
}
|
||||
|
||||
__device__ void WAVE::postLock(QueuePair *handle, int pe) {
|
||||
handle->hdp_policy->hdp_flush();
|
||||
|
||||
/*
|
||||
* Don't let more than one wave in this WG go any further or a horrible
|
||||
* variety of impossible to debug race conditions can occur.
|
||||
*/
|
||||
while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) {
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a tiny bit over-aggressive as it assumes that all of the
|
||||
* active_threads are going to the same PE when calculating whether
|
||||
* we are full.
|
||||
*/
|
||||
// handle->waitCQSpace(1);
|
||||
handle->waitSQSpace(1);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T SingleThreadImpl::threadAtomicAdd(T *val, T value) {
|
||||
T old_val = *val;
|
||||
*val += value;
|
||||
return old_val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T MultiThreadImpl::threadAtomicAdd(T *val, T value) {
|
||||
return atomicAdd(val, value);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T WG::threadAtomicAdd(T *val, T value) {
|
||||
T old_val = *val;
|
||||
*val += value;
|
||||
return old_val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T WAVE::threadAtomicAdd(T *val, T value) {
|
||||
return atomicAdd(val, value);
|
||||
}
|
||||
|
||||
#define TYPE_GEN(T) \
|
||||
template __device__ T SingleThreadImpl::threadAtomicAdd<T>(T * val, \
|
||||
T value); \
|
||||
template __device__ T MultiThreadImpl::threadAtomicAdd<T>(T * val, T value); \
|
||||
template __device__ T WG::threadAtomicAdd<T>(T * val, T value); \
|
||||
template __device__ T WAVE::threadAtomicAdd<T>(T * val, T value);
|
||||
|
||||
TYPE_GEN(float)
|
||||
TYPE_GEN(double)
|
||||
TYPE_GEN(int)
|
||||
TYPE_GEN(unsigned int)
|
||||
TYPE_GEN(unsigned long long) // NOLINT(runtime/int)
|
||||
|
||||
#define TYPE_BOOL(T) \
|
||||
template __device__ void SingleThreadImpl::finishPost<T>( \
|
||||
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
|
||||
uint16_t le_sq_counter, uint8_t opcode); \
|
||||
template __device__ void MultiThreadImpl::finishPost<T>( \
|
||||
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
|
||||
uint16_t le_sq_counter, uint8_t opcode); \
|
||||
template __device__ void WG::finishPost<T>( \
|
||||
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
|
||||
uint16_t le_sq_counter, uint8_t opcode); \
|
||||
template __device__ void WAVE::finishPost<T>( \
|
||||
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
|
||||
uint16_t le_sq_counter, uint8_t opcode); \
|
||||
template __device__ void MultiThreadImpl::finishPost_internal<T>( \
|
||||
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
|
||||
uint16_t le_sq_counter, uint8_t opcode);
|
||||
|
||||
TYPE_BOOL(true)
|
||||
TYPE_BOOL(false)
|
||||
|
||||
} // namespace rocshmem
|
||||
@@ -1,176 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
|
||||
#define LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "../util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
class QueuePair;
|
||||
|
||||
/*
|
||||
* GPU single-thread policy class. Only a single work-item per work-group
|
||||
* is allowed to call into a rocSHMEM function (unless it is specifically
|
||||
* called out as a collective API. This thread policy is the fastest but
|
||||
* is not as flexible.
|
||||
*/
|
||||
class SingleThreadImpl {
|
||||
public:
|
||||
uint32_t cq_lock = 0;
|
||||
uint32_t sq_lock = 0;
|
||||
|
||||
__device__ void quiet(QueuePair *handle);
|
||||
|
||||
__device__ void quiet_heavy(QueuePair *handle, int pe);
|
||||
|
||||
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
|
||||
int pe, uint16_t le_sq_counter, uint8_t opcode);
|
||||
|
||||
__device__ void postLock(QueuePair *handle, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T threadAtomicAdd(T *val, T value = 1);
|
||||
};
|
||||
|
||||
/*
|
||||
* GPU multi-thread policy class. Multiple work-items per work-group are
|
||||
* allowed to call into a rocSHMEM function. A bit slower than its
|
||||
* single-thread counterpart but it enables a much more flexible user-facing
|
||||
* API.
|
||||
*/
|
||||
class MultiThreadImpl {
|
||||
/*
|
||||
* Per-wg locks for the CQ and the SQ, respectively.
|
||||
*/
|
||||
template <bool cqe>
|
||||
__device__ void finishPost_internal(QueuePair *handle, bool ring_db,
|
||||
int num_wqes, int pe,
|
||||
uint16_t le_sq_counter, uint8_t opcode);
|
||||
|
||||
__device__ void postLock_internal(QueuePair *handle);
|
||||
|
||||
public:
|
||||
uint32_t cq_lock = 0;
|
||||
uint32_t sq_lock = 0;
|
||||
|
||||
__device__ void quiet(QueuePair *handle);
|
||||
|
||||
__device__ void quiet_heavy(QueuePair *handle, int pe);
|
||||
|
||||
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
|
||||
int pe, uint16_t le_sq_counter, uint8_t opcode);
|
||||
|
||||
__device__ void postLock(QueuePair *handle, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T threadAtomicAdd(T *val, T value = 1);
|
||||
};
|
||||
|
||||
/*
|
||||
* Select which one of our thread policies to use at compile time.
|
||||
*/
|
||||
#ifdef USE_THREADS
|
||||
typedef MultiThreadImpl ThreadImpl;
|
||||
#else
|
||||
typedef SingleThreadImpl ThreadImpl;
|
||||
#endif
|
||||
|
||||
class THREAD {
|
||||
public:
|
||||
ThreadImpl threadImpl;
|
||||
|
||||
__device__ void quiet(QueuePair *handle) { threadImpl.quiet(handle); }
|
||||
|
||||
__device__ void quiet_heavy(QueuePair *handle, int pe) {
|
||||
threadImpl.quiet_heavy(handle, pe);
|
||||
}
|
||||
|
||||
__device__ void decQuietCounter(uint32_t *quiet_counter, int num) {
|
||||
threadImpl.decQuietCounter(quiet_counter, num);
|
||||
}
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
|
||||
int pe, uint16_t le_sq_counter, uint8_t opcode) {
|
||||
threadImpl.finishPost<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
|
||||
opcode);
|
||||
}
|
||||
|
||||
__device__ void postLock(QueuePair *handle, int pe) {
|
||||
threadImpl.postLock(handle, pe);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T threadAtomicAdd(T *val, T value = 1) {
|
||||
T tmp = threadImpl.threadAtomicAdd(val, value);
|
||||
return tmp;
|
||||
}
|
||||
};
|
||||
|
||||
class WAVE {
|
||||
public:
|
||||
__device__ void quiet(QueuePair *handle);
|
||||
|
||||
__device__ void quiet_heavy(QueuePair *handle, int pe);
|
||||
|
||||
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
|
||||
int pe, uint16_t le_sq_counter, uint8_t opcode);
|
||||
|
||||
__device__ void postLock(QueuePair *handle, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T threadAtomicAdd(T *val, T value = 1);
|
||||
};
|
||||
|
||||
class WG {
|
||||
public:
|
||||
__device__ void quiet(QueuePair *handle);
|
||||
|
||||
__device__ void quiet_heavy(QueuePair *handle, int pe);
|
||||
|
||||
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
|
||||
|
||||
template <bool cqe>
|
||||
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
|
||||
int pe, uint16_t le_sq_counter, uint8_t opcode);
|
||||
|
||||
__device__ void postLock(QueuePair *handle, int pe);
|
||||
|
||||
template <typename T>
|
||||
__device__ T threadAtomicAdd(T *val, T value = 1);
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
|
||||
@@ -35,7 +35,7 @@ namespace rocshmem {
|
||||
|
||||
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
|
||||
|
||||
rocshmem_team_t get_external_team(GPUIBTeam *team) {
|
||||
rocshmem_team_t get_external_team(IPCTeam *team) {
|
||||
return reinterpret_cast<rocshmem_team_t>(team);
|
||||
}
|
||||
|
||||
@@ -223,7 +223,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team,
|
||||
* Allocate device-side memory for team_world and
|
||||
* construct a IPC team in it
|
||||
*/
|
||||
GPUIBTeam *new_team_obj;
|
||||
IPCTeam *new_team_obj;
|
||||
CHECK_HIP(hipMalloc(&new_team_obj, sizeof(IPCTeam)));
|
||||
new (new_team_obj)
|
||||
IPCTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes,
|
||||
|
||||
@@ -298,4 +298,4 @@ class IPCContext : public Context {
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
|
||||
#endif // LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
* both host access and device access to the memory space.
|
||||
*
|
||||
* The symmetric heaps are visible to network by registering them as
|
||||
* InfiniBand memory regions. Every memory region has a remote key
|
||||
* memory regions. Every memory region has a remote key
|
||||
* which needs to be shared across the network (to access the memory
|
||||
* region).
|
||||
*/
|
||||
|
||||
@@ -29,14 +29,14 @@ namespace rocshmem {
|
||||
|
||||
template <typename T>
|
||||
__host__ void ROHostContext::p(T *dest, T value, int pe) {
|
||||
DPRINTF("Function: gpu_ib_host_p\n");
|
||||
DPRINTF("Function: ro_host_p\n");
|
||||
|
||||
host_interface->p<T>(dest, value, pe, context_window_info);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ T ROHostContext::g(const T *source, int pe) {
|
||||
DPRINTF("Function: gpu_ib_host_g\n");
|
||||
DPRINTF("Function: ro_host_g\n");
|
||||
|
||||
return host_interface->g<T>(source, pe, context_window_info);
|
||||
}
|
||||
@@ -44,7 +44,7 @@ __host__ T ROHostContext::g(const T *source, int pe) {
|
||||
template <typename T>
|
||||
__host__ void ROHostContext::put(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
DPRINTF("Function: gpu_ib_host_put\n");
|
||||
DPRINTF("Function: ro_host_put\n");
|
||||
|
||||
host_interface->put<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
@@ -52,7 +52,7 @@ __host__ void ROHostContext::put(T *dest, const T *source, size_t nelems,
|
||||
template <typename T>
|
||||
__host__ void ROHostContext::get(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
DPRINTF("Function: gpu_ib_host_get\n");
|
||||
DPRINTF("Function: ro_host_get\n");
|
||||
|
||||
host_interface->get<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
@@ -60,7 +60,7 @@ __host__ void ROHostContext::get(T *dest, const T *source, size_t nelems,
|
||||
template <typename T>
|
||||
__host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
DPRINTF("Function: gpu_ib_host_put_nbi\n");
|
||||
DPRINTF("Function: ro_host_put_nbi\n");
|
||||
|
||||
host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
@@ -68,7 +68,7 @@ __host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems,
|
||||
template <typename T>
|
||||
__host__ void ROHostContext::get_nbi(T *dest, const T *source, size_t nelems,
|
||||
int pe) {
|
||||
DPRINTF("Function: gpu_ib_host_get_nbi\n");
|
||||
DPRINTF("Function: ro_host_get_nbi\n");
|
||||
|
||||
host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
|
||||
}
|
||||
@@ -107,7 +107,7 @@ __host__ void ROHostContext::broadcast(T *dest, const T *source, int nelems,
|
||||
int pe_root, int pe_start,
|
||||
int log_pe_stride, int pe_size,
|
||||
long *p_sync) {
|
||||
DPRINTF("Function: gpu_ib_host_broadcast\n");
|
||||
DPRINTF("Function: ro_host_broadcast\n");
|
||||
|
||||
host_interface->broadcast<T>(dest, source, nelems, pe_root, pe_start,
|
||||
log_pe_stride, pe_size, p_sync);
|
||||
|
||||
+2
-8
@@ -36,10 +36,7 @@
|
||||
|
||||
#include "backend_bc.hpp"
|
||||
#include "context_incl.hpp"
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/backend_ib.hpp"
|
||||
#include "gpu_ib/context_ib_tmpl_host.hpp"
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#include "reverse_offload/backend_ro.hpp"
|
||||
#include "reverse_offload/context_ro_tmpl_host.hpp"
|
||||
#else
|
||||
@@ -86,10 +83,7 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
|
||||
|
||||
rocshmem_env_config_init();
|
||||
|
||||
#ifdef USE_GPU_IB
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(GPUIBBackend)));
|
||||
backend = new (backend) GPUIBBackend(comm);
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
|
||||
backend = new (backend) ROBackend(comm);
|
||||
#else
|
||||
|
||||
@@ -49,9 +49,7 @@
|
||||
#include "templates.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
#ifdef USE_GPU_IB
|
||||
#include "gpu_ib/context_ib_tmpl_device.hpp"
|
||||
#elif defined(USE_RO)
|
||||
#ifdef USE_RO
|
||||
#include "reverse_offload/context_ro_tmpl_device.hpp"
|
||||
#else
|
||||
#include "ipc/context_ipc_tmpl_device.hpp"
|
||||
|
||||
@@ -36,10 +36,6 @@ __host__ __device__ Team* get_internal_team(rocshmem_team_t team) {
|
||||
return reinterpret_cast<Team*>(team);
|
||||
}
|
||||
|
||||
GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team) {
|
||||
return reinterpret_cast<GPUIBTeam*>(team);
|
||||
}
|
||||
|
||||
ROTeam* get_internal_ro_team(rocshmem_team_t team) {
|
||||
return reinterpret_cast<ROTeam*>(team);
|
||||
}
|
||||
|
||||
+1
-4
@@ -33,7 +33,6 @@ namespace rocshmem {
|
||||
class Backend;
|
||||
class Team;
|
||||
class ROTeam;
|
||||
class GPUIBTeam;
|
||||
class IPCTeam;
|
||||
|
||||
class TeamInfo {
|
||||
@@ -154,13 +153,11 @@ class Team {
|
||||
*
|
||||
* @note This is required to do some reinterpret_casts.
|
||||
*/
|
||||
BackendType type{BackendType::GPU_IB_BACKEND};
|
||||
BackendType type{BackendType::RO_BACKEND};
|
||||
};
|
||||
|
||||
__host__ __device__ Team* get_internal_team(rocshmem_team_t team);
|
||||
|
||||
GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team);
|
||||
|
||||
ROTeam* get_internal_ro_team(rocshmem_team_t team);
|
||||
|
||||
IPCTeam* get_internal_ipc_team(rocshmem_team_t team);
|
||||
|
||||
Reference in New Issue
Block a user