Removed GPU_IB (#59)

This commit is contained in:
Yiltan
2025-03-24 09:04:52 -04:00
committed by GitHub
parent 1210b6419f
commit 658bf2a3b5
76 changed files with 33 additions and 8539 deletions
-20
View File
@@ -39,9 +39,7 @@ endif()
###############################################################################
option(DEBUG "Enable debug trace" OFF)
option(PROFILE "Enable statistics and timing support" OFF)
option(USE_GPU_IB "Enable GPU_IB conduit." ON)
option(USE_RO "Enable RO conduit." ON)
option(USE_DC "Enable IB dynamically connected transport (DC)" OFF)
option(USE_IPC "Enable IPC support (using HIP)" OFF)
option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
@@ -158,10 +156,6 @@ if (NOT BUILD_TESTS_ONLY)
find_package(hip REQUIRED)
find_package(hsa-runtime64 REQUIRED)
if (USE_GPU_IB)
find_package(Ibverbs REQUIRED)
endif()
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
find_package(Threads REQUIRED)
@@ -188,20 +182,6 @@ if (NOT BUILD_TESTS_ONLY)
hip::host
hsa-runtime64::hsa-runtime64
)
if (USE_GPU_IB)
target_include_directories(
${PROJECT_NAME}
PUBLIC
${IBVERBS_INCLUDE_DIRS}
)
target_link_libraries(
${PROJECT_NAME}
PUBLIC
${IBVERBS_LIBRARIES}
)
endif()
endif()
###############################################################################
+3 -7
View File
@@ -7,11 +7,11 @@ code complexity and enables more fine-grained communication/computation
overlap than traditional host-driven networking.
rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories.
There are currently three backends for rocSHMEM;
IPC, Reverse Offload (RO), and GPU-IB.
There are currently two backends for rocSHMEM;
IPC and Reverse Offload (RO).
The backends primarily differ in their implementations of intra-kernel networking.
Currently, only the IPC backend is supported.
The RO and GPU-IB backends are provided as-is with
The RO backend is provided as-is with
no guarantees of support from AMD or AMD Research.
The IPC backend implements communication primitives using load/store operations issued from the GPU.
@@ -21,10 +21,6 @@ to the host-side runtime, which calls into a traditional MPI or OpenSHMEM
implementation. This forwarding of requests is transparent to the
programmer, who only sees the GPU-side interface.
The GPU InfiniBand (GPU-IB) backend implements a lightweight InfiniBand verbs interface
on the GPU. The GPU itself is responsible for building commands and ringing
the doorbell on the NIC to send network commands.
## Requirements
rocSHMEM base requirements:
-62
View File
@@ -1,62 +0,0 @@
###############################################################################
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
find_package(PkgConfig REQUIRED QUIET)
pkg_check_modules(PC_IBVERBS QUIET libibverbs)
find_path(
IBVERBS_INCLUDE_DIR infiniband/verbs.h
HINTS ${PC_IBVERBS_INCLUDEDIR} ${PC_IBVERBS_INCLUDE_DIRS}
PATH_SUFFIXES include
)
find_library(
IBVERBS_LIBRARY
NAMES ibverbs libibverbs
HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS}
PATH_SUFFIXES lib lib64
)
find_library(
MLX5_LIBRARY
NAMES mlx5 libmlx5
HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS}
PATH_SUFFIXES lib lib64
)
set(
IBVERBS_LIBRARIES
${IBVERBS_LIBRARY} ${MLX5_LIBRARY}
CACHE INTERNAL ""
)
set(
IBVERBS_INCLUDE_DIRS
${IBVERBS_INCLUDE_DIR}
CACHE INTERNAL ""
)
find_package_handle_standard_args(
Ibverbs DEFAULT_MSG IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR
)
mark_as_advanced(IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR)
-2
View File
@@ -1,8 +1,6 @@
#cmakedefine DEBUG
#cmakedefine PROFILE
#cmakedefine USE_GPU_IB
#cmakedefine USE_RO
#cmakedefine USE_DC
#cmakedefine USE_IPC
#cmakedefine USE_THREADS
#cmakedefine USE_SHARED_CTX
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=ON \
-DUSE_IPC=OFF \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DDEBUG=ON \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=ON \
-DUSE_IPC=OFF \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=ON \
-DUSE_IPC=ON \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=ON \
-DUSE_GPU_IB=ON \
-DUSE_DC=ON \
-DUSE_IPC=OFF \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=ON \
-DUSE_IPC=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-2
View File
@@ -17,9 +17,7 @@ cmake \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=OFF \
-DUSE_RO=OFF \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-2
View File
@@ -22,9 +22,7 @@ cmake \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=OFF \
-DUSE_RO=OFF \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=ON \
$src_path
cmake --build . --parallel 8
cmake --install .
-28
View File
@@ -1,28 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_COHERENT_HEAP=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DDEBUG=ON \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
-28
View File
@@ -1,28 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_MANAGED_HEAP=ON \
$src_path
cmake --build . --parallel 8
cmake --install .
@@ -1,28 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DDEBUG=ON \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_MANAGED_HEAP=ON \
$src_path
cmake --build . --parallel 8
cmake --install .
-27
View File
@@ -1,27 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DDEBUG=OFF \
-DPROFILE=ON \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
@@ -1,30 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
-DUSE_HOST_SIDE_HDP_FLUSH=ON\
$src_path
cmake --build . --parallel 8
cmake --install .
@@ -1,30 +0,0 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
set -e
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=ON \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
-DUSE_HOST_SIDE_HDP_FLUSH=ON\
$src_path
cmake --build . --parallel 8
cmake --install .
-2
View File
@@ -17,8 +17,6 @@ cmake \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=OFF \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-2
View File
@@ -17,8 +17,6 @@ cmake \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=OFF \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-2
View File
@@ -17,8 +17,6 @@ cmake \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=OFF \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
+1 -4
View File
@@ -145,12 +145,9 @@ def determine_algos_from_library_config_type(config):
if config['algorithms']:
return config
gpu_ib = re.match('^[rd]c_', config['library_build_config_type'])
thread_single = re.match('.*single.*', config['library_build_config_type'])
if not gpu_ib:
config['algorithms'] = reverse_offload_algorithms
elif thread_single:
if thread_single:
config['algorithms'] = single_thread_algorithms
else:
config['algorithms'] = multi_thread_algorithms
+1 -3
View File
@@ -58,9 +58,7 @@ target_compile_options(
###############################################################################
# ROCSHMEM TARGET FOR BACKENDS
###############################################################################
IF (USE_GPU_IB)
add_subdirectory(gpu_ib)
ELSEIF(USE_RO)
IF (USE_RO)
add_subdirectory(reverse_offload)
ELSE()
add_subdirectory(ipc)
+3 -9
View File
@@ -25,9 +25,7 @@
#include "backend_type.hpp"
#include "context_incl.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/backend_ib.hpp"
#elif defined(USE_RO)
#ifdef USE_RO
#include "reverse_offload/backend_ro.hpp"
#else
#include "ipc/backend_ipc.hpp"
@@ -203,9 +201,7 @@ void Backend::reset_stats() {
}
__device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
#ifdef USE_GPU_IB
return static_cast<GPUIBBackend*>(this)->create_ctx(option, ctx);
#elif defined(USE_RO)
#ifdef USE_RO
return static_cast<ROBackend*>(this)->create_ctx(option, ctx);
#else
return static_cast<IPCBackend*>(this)->create_ctx(option, ctx);
@@ -213,9 +209,7 @@ __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
}
__device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) {
#ifdef USE_GPU_IB
static_cast<GPUIBBackend*>(this)->destroy_ctx(ctx);
#elif defined(USE_RO)
#ifdef USE_RO
static_cast<ROBackend*>(this)->destroy_ctx(ctx);
#else
static_cast<IPCBackend*>(this)->destroy_ctx(ctx);
+1 -1
View File
@@ -278,7 +278,7 @@ class Backend {
* rely on the normal inheritance mechanism to tailor behavior for
* derived backend types.
*/
BackendType type{BackendType::GPU_IB_BACKEND};
BackendType type{BackendType::RO_BACKEND};
/**
* @brief Dumps derived class statistics.
+6 -24
View File
@@ -44,7 +44,7 @@ namespace rocshmem {
* @note Derived classes which use Backend as a base class must add
* themselves to this enum class to support static polymorphism.
*/
enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
enum class BackendType { RO_BACKEND, IPC_BACKEND };
/**
* @brief Helper macro for some dispatch calls
@@ -54,10 +54,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
/**
* @brief Device static dispatch method call.
*/
#ifdef USE_GPU_IB
#define DISPATCH(Func) \
static_cast<GPUIBContext *>(this)->Func;
#elif defined(USE_RO)
#ifdef USE_RO
#define DISPATCH(Func) \
static_cast<ROContext *>(this)->Func;
#else
@@ -68,11 +65,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
/**
* @brief Device static dispatch method call with a return value.
*/
#ifdef USE_GPU_IB
#define DISPATCH_RET(Func) \
auto ret_val = static_cast<GPUIBContext *>(this)->Func; \
return ret_val;
#elif defined(USE_RO)
#ifdef USE_RO
#define DISPATCH_RET(Func) \
auto ret_val = static_cast<ROContext *>(this)->Func; \
return ret_val;
@@ -85,12 +78,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
/**
* @brief Device static dispatch method call with a return type of pointer.
*/
#ifdef USE_GPU_IB
#define DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<GPUIBContext *>(this)->Func; \
return ret_val;
#elif defined(USE_RO)
#ifdef USE_RO
#define DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<ROContext *>(this)->Func; \
@@ -109,9 +97,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
* MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and
* threading semantics of collectives in OpenSHMEM match those of MPI.
*/
#ifdef USE_GPU_IB
#define HOST_DISPATCH(Func) static_cast<GPUIBHostContext *>(this)->Func;
#elif defined(USE_RO)
#ifdef USE_RO
#define HOST_DISPATCH(Func) static_cast<ROHostContext *>(this)->Func;
#else
#define HOST_DISPATCH(Func) static_cast<IPCHostContext *>(this)->Func;
@@ -124,11 +110,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
* threading semantics of collectives in OpenSHMEM match those of MPI.
*/
#ifdef USE_GPU_IB
#define HOST_DISPATCH_RET(Func) \
auto ret_val = static_cast<GPUIBHostContext *>(this)->Func; \
return ret_val;
#elif defined(USE_RO)
#ifdef USE_RO
#define HOST_DISPATCH_RET(Func) \
auto ret_val = static_cast<ROHostContext *>(this)->Func; \
return ret_val;
+1 -4
View File
@@ -26,10 +26,7 @@
#include "context.hpp"
#include "context_tmpl_device.hpp"
#include "context_tmpl_host.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/context_ib_device.hpp"
#include "gpu_ib/context_ib_host.hpp"
#elif defined (USE_RO)
#ifdef USE_RO
#include "reverse_offload/context_ro_device.hpp"
#include "reverse_offload/context_ro_host.hpp"
#else
+1 -3
View File
@@ -25,9 +25,7 @@
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_type.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/context_ib_device.hpp"
#elif defined(USE_RO)
#ifdef USE_RO
#include "reverse_offload/context_ro_device.hpp"
#else
#include "ipc/context_ipc_device.hpp"
+1 -3
View File
@@ -25,9 +25,7 @@
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_type.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/context_ib_host.hpp"
#elif defined(USE_RO)
#ifdef USE_RO
#include "reverse_offload/context_ro_host.hpp"
#else
#include "ipc/context_ipc_host.hpp"
-45
View File
@@ -1,45 +0,0 @@
###############################################################################
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
###############################################################################
# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY
###############################################################################
target_sources(
${PROJECT_NAME}
PRIVATE
backend_ib.cpp
connection.cpp
connection_policy.cpp
context_ib_device.cpp
context_ib_device_coll.cpp
context_ib_host.cpp
debug.cpp
dynamic_connection.cpp
endian.cpp
gpu_ib_team.cpp
network_policy.cpp
qe_dumper.cpp
queue_pair.cpp
reliable_connection.cpp
segment_builder.cpp
thread_policy.cpp
)
-493
View File
@@ -1,493 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "backend_ib.hpp"
#include <endian.h>
#include <mpi.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>
#include <mutex> // NOLINT(build/c++11)
#include "rocshmem/rocshmem.hpp"
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "gpu_ib_team.hpp"
#include "queue_pair.hpp"
#include "../host/host.hpp"
namespace rocshmem {
#define NET_CHECK(cmd) \
{ \
if (cmd != MPI_SUCCESS) { \
fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \
abort(); \
} \
}
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
rocshmem_team_t get_external_team(GPUIBTeam *team) {
return reinterpret_cast<rocshmem_team_t>(team);
}
int get_ls_non_zero_bit(char *bitmask, int mask_length) {
int position = -1;
for (int bit_i = 0; bit_i < mask_length; bit_i++) {
int byte_i = bit_i / CHAR_BIT;
if (bitmask[byte_i] & (1 << (bit_i % CHAR_BIT))) {
position = bit_i;
break;
}
}
return position;
}
GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
std::stringstream sstream(maximum_num_contexts_str);
sstream >> maximum_num_contexts_;
}
num_blocks_ = maximum_num_contexts_;
init_mpi_once(comm);
type = BackendType::GPU_IB_BACKEND;
NET_CHECK(MPI_Comm_dup(backend_comm, &gpu_ib_comm_world));
NET_CHECK(MPI_Comm_size(gpu_ib_comm_world, &num_pes));
NET_CHECK(MPI_Comm_rank(gpu_ib_comm_world, &my_pe));
/* Initialize the host interface */
host_interface =
new HostInterface(hdp_proxy_.get(), gpu_ib_comm_world, &heap);
/*
* Construct default host context independently of the
* default device context (done in the async thread)
* so that host operations can execute regardless of
* device operations.
*/
setup_default_host_ctx();
setup_team_world();
rocshmem_collective_init();
teams_init();
// MPI_Comm_dup(gpu_ib_comm_world, &thread_comm);
thread_comm = gpu_ib_comm_world;
NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
worker_thread_exit = false;
#ifdef USE_HOST_SIDE_HDP_FLUSH
hdp_gpu_cpu_flush_flag_ =
static_cast<unsigned int *>(rocshmem_malloc(sizeof(unsigned int)));
hdp_policy->set_flush_polling_ptr(hdp_gpu_cpu_flush_flag_);
hdp_flush_worker_thread = std::thread(&GPUIBBackend::hdp_flush_poll, this);
// We can now initialize and set the HDP window in the host interface
host_interface->create_hdp_window();
#endif
// commenting out the async thread as there is some issues with ROCm
// this makes the CPU init blocking
// async_thread_ = thread_spawn(this);
thread_func_internal(this);
}
__device__ bool GPUIBBackend::create_ctx(int64_t options,
rocshmem_ctx_t *ctx) {
GPUIBContext *ctx_;
auto pop_result = ctx_free_list.get()->pop_front();
if (!pop_result.success) {
return false;
}
ctx_ = pop_result.value;
ctx->ctx_opaque = ctx_;
return true;
}
void GPUIBBackend::ctx_create(int64_t options, void **ctx) {
GPUIBHostContext *new_ctx = nullptr;
new_ctx = new GPUIBHostContext(this, options);
*ctx = new_ctx;
}
GPUIBHostContext *get_internal_gpu_ib_ctx(Context *ctx) {
return reinterpret_cast<GPUIBHostContext *>(ctx);
}
void GPUIBBackend::ctx_destroy(Context *ctx) {
GPUIBHostContext *gpu_ib_host_ctx = get_internal_gpu_ib_ctx(ctx);
delete gpu_ib_host_ctx;
}
__device__ void GPUIBBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
ctx_free_list.get()->push_back(static_cast<GPUIBContext *>(ctx->ctx_opaque));
}
GPUIBBackend::~GPUIBBackend() {
// need to get this back once ROCm is fixed
// async_thread_.join();
worker_thread_exit = true;
#ifdef USE_HOST_SIDE_HDP_FLUSH
hdp_flush_worker_thread.join();
hdp_policy->set_flush_polling_ptr(nullptr);
rocshmem_free(hdp_gpu_cpu_flush_flag_);
#endif
/**
* Destroy teams infrastructure
* and team world
*/
teams_destroy();
auto *team_world{team_tracker.get_team_world()};
team_world->~Team();
CHECK_HIP(hipFree(team_world));
delete default_host_ctx_;
NET_CHECK(MPI_Comm_free(&gpu_ib_comm_world));
CHECK_HIP(hipFree(default_ctx_->device_qp_proxy));
CHECK_HIP(hipFree(default_ctx_));
default_ctx_ = nullptr;
delete host_interface;
host_interface = nullptr;
networkImpl.networkHostFinalize();
CHECK_HIP(hipFree(ctx_array));
}
__host__ void GPUIBBackend::global_exit(int status) {
MPI_Abort(gpu_ib_comm_world, status);
}
void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team,
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
rocshmem_team_t *new_team) {
/**
* Read the bit mask and find out a common index into
* the pool of available work arrays.
*/
NET_CHECK(MPI_Allreduce(pool_bitmask_, reduced_bitmask_, bitmask_size_,
MPI_CHAR, MPI_BAND, team_comm));
/* Pick the least significant non-zero bit (logical layout) in the reduced
* bitmask */
auto max_num_teams{team_tracker.get_max_num_teams()};
int common_index = get_ls_non_zero_bit(reduced_bitmask_, max_num_teams);
if (common_index < 0) {
/* No team available */
abort();
}
/* Mark the team as taken (by unsetting the bit in the pool bitmask) */
int byte = common_index / CHAR_BIT;
pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT));
/**
* Allocate device-side memory for team_world and
* construct a GPU_IB team in it
*/
GPUIBTeam *new_team_obj;
CHECK_HIP(hipMalloc(&new_team_obj, sizeof(GPUIBTeam)));
new (new_team_obj)
GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes,
my_pe_in_new_team, team_comm, common_index);
*new_team = get_external_team(new_team_obj);
}
void GPUIBBackend::team_destroy(rocshmem_team_t team) {
GPUIBTeam *team_obj = get_internal_gpu_ib_team(team);
/* Mark the pool as available */
int bit = team_obj->pool_index_;
int byte_i = bit / CHAR_BIT;
pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT);
team_obj->~GPUIBTeam();
CHECK_HIP(hipFree(team_obj));
}
void GPUIBBackend::dump_backend_stats() {
networkImpl.dump_backend_stats(&globalStats);
}
void GPUIBBackend::reset_backend_stats() { networkImpl.reset_backend_stats(); }
void GPUIBBackend::initialize_ipc() {
ipcImpl.ipcHostInit(my_pe, heap.get_heap_bases(), thread_comm);
}
void GPUIBBackend::initialize_network() { networkImpl.networkHostSetup(this); }
void GPUIBBackend::setup_default_host_ctx() {
default_host_ctx_ = new GPUIBHostContext(this, 0);
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_;
}
void GPUIBBackend::setup_ctxs() {
/*
* Allocate device-side memory for all context and construct an
* InfiniBand context in it.
*/
CHECK_HIP(
hipMalloc(&ctx_array, sizeof(GPUIBContext) * maximum_num_contexts_));
for (int i = 0; i < maximum_num_contexts_; i++) {
new (&ctx_array[i]) GPUIBContext(this, false, i);
ctx_free_list.get()->push_back(ctx_array + i);
}
}
void GPUIBBackend::setup_default_ctx() {
/*
* Allocate device-side memory for default context and construct an
* InfiniBand context in it.
*/
CHECK_HIP(hipMalloc(&default_ctx_, sizeof(GPUIBContext)));
new (default_ctx_) GPUIBContext(this, true, 0);
/*
* Set the ROCSHMEM_CTX_DEFAULT in constant memory.
*/
int *symbol_address;
CHECK_HIP(hipGetSymbolAddress(reinterpret_cast<void **>(&symbol_address),
HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT)));
TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;
rocshmem_ctx_t ctx_default_host{default_ctx_, tinfo};
hipStream_t stream;
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
CHECK_HIP(hipMemcpyAsync(symbol_address, &ctx_default_host,
sizeof(rocshmem_ctx_t), hipMemcpyDefault, stream));
CHECK_HIP(hipStreamSynchronize(stream));
CHECK_HIP(hipStreamDestroy(stream));
}
void GPUIBBackend::setup_team_world() {
TeamInfo *team_info_wrt_parent, *team_info_wrt_world;
/**
* Allocate device-side memory for team_world and construct a
* GPU_IB team in it.
*/
CHECK_HIP(hipMalloc(&team_info_wrt_parent, sizeof(TeamInfo)));
CHECK_HIP(hipMalloc(&team_info_wrt_world, sizeof(TeamInfo)));
new (team_info_wrt_parent) TeamInfo(nullptr, 0, 1, num_pes);
new (team_info_wrt_world) TeamInfo(nullptr, 0, 1, num_pes);
MPI_Comm team_world_comm;
NET_CHECK(MPI_Comm_dup(gpu_ib_comm_world, &team_world_comm));
GPUIBTeam *team_world{nullptr};
CHECK_HIP(hipMalloc(&team_world, sizeof(GPUIBTeam)));
new (team_world) GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world,
num_pes, my_pe, team_world_comm, 0);
team_tracker.set_team_world(team_world);
/**
* Copy the address to ROCSHMEM_TEAM_WORLD.
*/
ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
}
void GPUIBBackend::init_mpi_once(MPI_Comm comm) {
static std::mutex init_mutex;
const std::lock_guard<std::mutex> lock(init_mutex);
int init_done = 0;
NET_CHECK(MPI_Initialized(&init_done));
if (init_done == 0) {
int provided;
NET_CHECK(
MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided));
}
if (comm == MPI_COMM_NULL) {
NET_CHECK(MPI_Comm_dup(MPI_COMM_WORLD, &backend_comm));
} else {
NET_CHECK(MPI_Comm_dup(comm, &backend_comm));
}
}
std::thread GPUIBBackend::thread_spawn(GPUIBBackend *b) {
return std::thread(&GPUIBBackend::thread_func_internal, this, b);
}
void GPUIBBackend::thread_func_internal(GPUIBBackend *b) {
CHECK_HIP(hipSetDevice(hip_dev_id));
b->initialize_ipc();
b->initialize_network();
b->setup_ctxs();
b->setup_default_ctx();
*(b->done_init) = 1;
}
#ifdef USE_HOST_SIDE_HDP_FLUSH
void GPUIBBackend::hdp_flush_poll() {
while (!worker_thread_exit) {
if (hdp_policy->has_active_flush_request()) {
hdp_policy->hdp_flush();
hdp_policy->clear_active_flush_flag();
}
}
}
#endif
void GPUIBBackend::teams_init() {
/**
* Allocate pools for the teams sync and work arrary from the SHEAP.
*/
auto max_num_teams{team_tracker.get_max_num_teams()};
barrier_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams));
reduce_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams));
bcast_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams));
alltoall_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE * max_num_teams));
/* Accommodating for largest possible data type for pWrk */
pWrk_pool = rocshmem_malloc(
sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams);
pAta_pool = rocshmem_malloc(sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE *
max_num_teams);
/**
* Initialize the sync arrays in the pool with default values.
*/
long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
for (int team_i = 0; team_i < max_num_teams; team_i++) {
barrier_pSync = reinterpret_cast<long *>(
&barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
reduce_pSync = reinterpret_cast<long *>(
&reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = reinterpret_cast<long *>(
&bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
alltoall_pSync = reinterpret_cast<long *>(
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
}
/**
* Initialize bit mask
*
* Logical:
* MSB..........................................................................LSB
* Physical: MSB...1st least significant 8 bits...LSB MSB...2nd least
* signifant 8 bits...LSB
*
* Description shows only a 2-byte long mask but idea extends to any
* arbitrary size.
*/
bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1)
: (max_num_teams / CHAR_BIT);
pool_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
reduced_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
memset(pool_bitmask_, 0, bitmask_size_);
memset(reduced_bitmask_, 0, bitmask_size_);
/* Set all to available except the 0th one (reserved for TEAM_WORLD) */
for (int bit_i = 1; bit_i < max_num_teams; bit_i++) {
int byte_i = bit_i / CHAR_BIT;
pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT);
}
/**
* Make sure that all processing elements have done this before
* continuing.
*/
NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
}
void GPUIBBackend::teams_destroy() {
rocshmem_free(barrier_pSync_pool);
rocshmem_free(reduce_pSync_pool);
rocshmem_free(bcast_pSync_pool);
rocshmem_free(alltoall_pSync_pool);
rocshmem_free(pWrk_pool);
rocshmem_free(pAta_pool);
free(pool_bitmask_);
free(reduced_bitmask_);
}
void GPUIBBackend::rocshmem_collective_init() {
/*
* Allocate heap space for barrier_sync
*/
size_t one_sync_size_bytes{sizeof(*barrier_sync)};
size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
heap.malloc(reinterpret_cast<void **>(&barrier_sync), sync_size_bytes);
/*
* Initialize the barrier synchronization array with default values.
*/
for (int i = 0; i < num_pes; i++) {
barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
}
/*
* Make sure that all processing elements have done this before
* continuing.
*/
NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
}
} // namespace rocshmem
-351
View File
@@ -1,351 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
#define LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
#include "../backend_bc.hpp"
#include "../containers/free_list_impl.hpp"
#include "network_policy.hpp"
#include "../hdp_policy.hpp"
#include "../hdp_proxy.hpp"
#include "../memory/hip_allocator.hpp"
namespace rocshmem {
class HostInterface;
/**
* @class GPUIBBackend backend.hpp
* @brief InfiniBand specific backend.
*
* The InfiniBand (GPUIB) backend enables the device to enqueue network
* requests to InfiniBand queues (with minimal host intervention). The setup
* requires some effort from the host, but the device is able to craft
* InfiniBand requests and send them on its own.
*/
class GPUIBBackend : public Backend {
public:
/**
* @copydoc Backend::Backend(unsigned)
*/
explicit GPUIBBackend(MPI_Comm comm);
/**
* @copydoc Backend::~Backend()
*/
virtual ~GPUIBBackend();
/**
* @brief Abort the application.
*
* @param[in] status Exit code.
*
* @return void.
*
* @note This routine terminates the entire application.
*/
void global_exit(int status) override;
/**
* @copydoc Backend::create_new_team
*/
void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
rocshmem_team_t *new_team) override;
/**
* @copydoc Backend::team_destroy(rocshmem_team_t)
*/
void team_destroy(rocshmem_team_t team) override;
/**
* @copydoc Backend::ctx_create
*/
void ctx_create(int64_t options, void **ctx) override;
__device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
/**
* @copydoc Backend::ctx_destroy
*/
void ctx_destroy(Context *ctx) override;
/**
* @copydoc Backend::ctx_destroy
*/
__device__ void destroy_ctx(rocshmem_ctx_t *ctx);
protected:
/**
* @copydoc Backend::dump_backend_stats()
*/
void dump_backend_stats() override;
/**
* @copydoc Backend::reset_backend_stats()
*/
void reset_backend_stats() override;
/**
* @brief spawn a new thread to perform the rest of initialization
*/
std::thread thread_spawn(GPUIBBackend *b);
/**
* @brief overheads for helper thread to run
*
* @param[in] the thread needs access to the class
*
* @return void
*/
void thread_func_internal(GPUIBBackend *b);
/**
* @brief initialize MPI.
*
* GPUIB relies on MPI just to exchange the connection information.
*
* todo: remove the dependency on MPI and make it generic to PMI-X or just
* to OpenSHMEM to have support for both CPU and GPU
*/
void init_mpi_once(MPI_Comm comm);
/**
* @brief init the network support
*/
void initialize_network();
/**
* @brief Invokes the IPC policy class initialization method.
*
* This method delegates Inter Process Communication (IPC)
* initialization to the appropriate policy class. The initialization
* needs to be exposed to the Backed due to initialization ordering
* constraints. (The symmetric heaps needs to be allocated and
* initialized before this method can be called.)
*
* The policy class encapsulates what the initialization process so
* refer to that class for more details.
*/
void initialize_ipc();
/**
* @brief Allocate and initialize the ROCSHMEM_CTX_DEFAULT variable.
*
* @todo The default_ctx member looks unused after it is copied into
* the ROCSHMEM_CTX_DEFAULT variable.
*/
void setup_default_ctx();
void setup_ctxs();
/**
* @brief Allocate and initialize the default context for host
* operations.
*/
void setup_default_host_ctx();
/**
* @brief Allocate and initialize team world.
*/
void setup_team_world();
/**
* @brief Initialize the resources required to support teams
*/
void teams_init();
/**
* @brief Destruct the resources required to support teams
*/
void teams_destroy();
/**
* @brief Allocate and initialize barrier operation addresses on
* symmetric heap.
*
* When this method completes, the barrier_sync member will be available
* for use.
*/
void rocshmem_collective_init();
#ifdef USE_HOST_SIDE_HDP_FLUSH
/**
* @brief A service thread routine that flushes the hdp cache on behalf of the
* GPU.
*/
void hdp_flush_poll();
/**
* @brief Workers used to poll on the device hdp flush request.
*/
std::thread hdp_flush_worker_thread{};
#endif
/**
* @brief Signals to the worker threads to exist
*/
std::atomic<bool> worker_thread_exit{false};
public:
/**
* @brief The host-facing interface that will be used
* by all contexts of the GPUIBBackend
*/
HostInterface *host_interface{nullptr};
/**
* @brief Handle for raw memory for barrier sync
*/
long *barrier_pSync_pool{nullptr};
/**
* @brief Handle for raw memory for reduce sync
*/
long *reduce_pSync_pool{nullptr};
/**
* @brief Handle for raw memory for broadcast sync
*/
long *bcast_pSync_pool{nullptr};
/**
* @brief Handle for raw memory for alltoall sync
*/
long *alltoall_pSync_pool{nullptr};
/**
* @brief Handle for raw memory for work
*/
void *pWrk_pool{nullptr};
/**
* @brief Handle for raw memory for alltoall
*/
void *pAta_pool{nullptr};
/**
* @brief rocSHMEM's copy of MPI_COMM_WORLD (for interoperability
* with orthogonal MPI usage in an MPI+rocSHMEM program).
*/
MPI_Comm gpu_ib_comm_world{};
MPI_Comm backend_comm{};
/**
* @brief Holds number of blocks used in library
*/
size_t num_blocks_{1};
private:
/**
* @brief Allocates cacheable, device memory for the hdp policy.
*
* @note Internal data ownership is managed by the proxy
*/
HdpProxy<HIPAllocator> hdp_proxy_{};
public:
/**
* @brief Policy choice for two HDP implementations.
*
* @todo Combine HDP related stuff together into a class with a
* reasonable interface. The functionality does not need to exist in
* multiple pieces in the Backend and QueuePair classes. The hdp_rkey,
* hdp_addresses, and hdp_policy fields should all live in the class.
*/
HdpPolicy *hdp_policy{hdp_proxy_.get()};
/**
* @brief Scratchpad for the internal barrier algorithms.
*/
int64_t *barrier_sync{nullptr};
/**
* @brief Compile-time configuration policy for network (IB)
*
*
* The configuration option "USE_SINGLE_NODE" can be enabled to not build
* with network support.
*/
NetworkImpl networkImpl{};
private:
/**
* @brief An array of @ref ROContexts that backs the context FreeList.
*/
GPUIBContext *ctx_array{nullptr};
/**
* @brief A free-list containing contexts.
*/
FreeListProxy<HIPAllocator, GPUIBContext *> ctx_free_list{};
/**
* @brief Holds maximum number of contexts used in library
*/
size_t maximum_num_contexts_{1024};
/**
* @brief The bitmask representing the availability of teams in the pool
*/
char *pool_bitmask_{nullptr};
/**
* @brief Bitmask to store the reduced result of bitmasks on pariticipating
* PEs
*
* With no thread-safety for this bitmask, multithreaded creation of teams is
* not supported.
*/
char *reduced_bitmask_{nullptr};
/**
* @brief Size of the bitmask
*/
int bitmask_size_{-1};
/**
* @brief a helper thread to perform the initialization (non-blocking init)
*/
std::thread async_thread_{};
/**
* @brief Holds a copy of the default context (see OpenSHMEM
* specification).
*
* @todo Remove this member from the backend class. There is another
* copy stored in ROCSHMEM_CTX_DEFAULT.
*/
GPUIBContext *default_ctx_{nullptr};
/**
* @brief Holds a copy of the default context for host functions
*/
GPUIBHostContext *default_host_ctx_{nullptr};
unsigned int* hdp_gpu_cpu_flush_flag_;
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
-54
View File
@@ -1,54 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include <gtest/gtest.h>
#include <mpi.h>
#include "dynamic_connection.hpp"
#include "reliable_connection.hpp"
namespace rocshmem {
// test with different use_ib_hca
// test with different heap size
// test with different sleep
// test with different sq_size
TEST(DynamicConnect, ToNothing) {
DynamicConnection connect;
connect.construct_init(1);
}
// test with different num_dcis
// test with different num_dcts
TEST(ReliableConnect, ToNothing) {
ReliableConnection connect;
connect.construct_init(1);
}
int main(int argc, char **argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
} // namespace rocshmem
-431
View File
@@ -1,431 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "connection.hpp"
#include <mpi.h>
#include <mutex> // NOLINT(build/c++11)
#include <vector>
#include "backend_ib.hpp"
#include "queue_pair.hpp"
#include "../util.hpp"
namespace rocshmem {
int Connection::use_gpu_mem = 0;
int Connection::coherent_cq = 0;
Connection::Connection(GPUIBBackend* b, int k) : backend(b), key_offset(k) {
char* value = nullptr;
if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) {
requested_dev = value;
}
if ((value = getenv("ROCSHMEM_SQ_SIZE"))) {
sq_size = atoi(value);
}
if ((value = getenv("ROCSHMEM_USE_CQ_GPU_MEM")) != nullptr) {
cq_use_gpu_mem = atoi(value);
}
if ((value = getenv("ROCSHMEM_USE_SQ_GPU_MEM")) != nullptr) {
sq_use_gpu_mem = atoi(value);
}
}
Connection::~Connection() { delete ib_state; }
void Connection::reg_mr(void* ptr, size_t size, ibv_mr** mr, bool managed) {
int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
if (managed) {
access |= IBV_ACCESS_ON_DEMAND;
}
*mr = ibv_reg_mr(ib_state->pd, ptr, size, access);
if (*mr == nullptr) {
abort();
}
}
unsigned Connection::total_number_connections() {
int connections;
get_remote_conn(&connections);
return backend->num_blocks_ * connections;
}
void Connection::initialize(int num_block) {
allocate_dynamic_members(num_block);
int ib_devices{0};
dev_list = ibv_get_device_list(&ib_devices);
if (dev_list == nullptr) {
abort();
}
struct ibv_device* ib_dev = dev_list[0];
if (requested_dev != nullptr) {
for (int i = 0; i < ib_devices; i++) {
const char* select_dev = ibv_get_device_name(dev_list[i]);
if (strstr(select_dev, requested_dev) != nullptr) {
ib_dev = dev_list[i];
break;
}
}
}
uint8_t port = 1;
ib_init(ib_dev, port);
int hip_dev_id = 0;
CHECK_HIP(hipGetDevice(&hip_dev_id));
int ib_fork_err = ibv_fork_init();
if (ib_fork_err != 0) printf("error: ibv_fork_init failed \n");
sq_post_dv = static_cast<sq_post_dv_t*>(
malloc(sizeof(sq_post_dv_t) * total_number_connections()));
if (sq_post_dv == nullptr) {
abort();
}
create_qps(port, backend->my_pe, &ib_state->portinfo);
initialize_1(port, num_block);
MPI_Barrier(backend->thread_comm);
free_dynamic_members();
}
void Connection::finalize() {
ibv_free_device_list(dev_list);
int ret = ibv_dereg_mr(backend->networkImpl.heap_mr);
if (ret) {
abort();
}
// comment until rocm 4.5
// ibv_dereg_mr(backend->networkImpl.hdp_mr);
ibv_dereg_mr(backend->networkImpl.mr);
}
void Connection::ib_init(struct ibv_device* ib_dev, uint8_t port) {
ib_state = new ib_state_t;
if (!ib_state) {
abort();
}
ib_state->context = ibv_open_device(ib_dev);
if (!ib_state->context) {
delete ib_state;
abort();
}
ib_state->pd = ibv_alloc_pd(ib_state->context);
if (!ib_state->pd) {
delete ib_state;
abort();
}
ibv_parent_domain_init_attr pattr;
init_parent_domain_attr(&pattr);
ib_state->pd = ibv_alloc_parent_domain(ib_state->context, &pattr);
ibv_query_port(ib_state->context, port, &ib_state->portinfo);
}
template <typename StateType>
void Connection::try_to_modify_qp(ibv_qp* qp, StateType state) {
ibv_modify_qp(qp, &state.exp_qp_attr, state.exp_attr_mask);
}
void Connection::init_qp_status(ibv_qp* qp, uint8_t port) {
try_to_modify_qp<InitQPState>(qp, initqp(port));
}
/**
* rtr stands for 'ready to receive'
*/
void Connection::change_status_rtr(ibv_qp* qp, dest_info_t* dest,
uint8_t port) {
try_to_modify_qp<RtrState>(qp, rtr(dest, port));
}
/**
* rts stands for 'ready to send'
*/
void Connection::change_status_rts(ibv_qp* qp, dest_info_t* dest) {
try_to_modify_qp<RtsState>(qp, rts(dest));
}
void Connection::create_qps(uint8_t port, int my_rank,
ibv_port_attr* ib_port_att) {
create_qps_1();
ibv_qp_cap cap{};
cap.max_send_wr = sq_size;
cap.max_send_sge = 1;
cap.max_inline_data = 4;
QPInitAttr qp_init_attr = qpattr(cap);
size_t qp_size = total_number_connections();
cqs.resize(qp_size);
qps.resize(qp_size);
int cqe = qp_init_attr.attr.cap.max_send_wr;
for (auto& entry : cqs) {
entry = create_cq(ib_state->context, ib_state->pd, cqe);
if (!entry) {
abort();
}
}
create_qps_2(port, my_rank, ib_port_att);
for (int i = 0; i < qps.size(); i++) {
qps[i] =
create_qp(ib_state->pd, ib_state->context, &qp_init_attr.attr, cqs[i]);
if (!qps[i]) {
abort();
}
create_qps_3(port, qps[i], i, ib_port_att);
}
}
void Connection::initialize_gpu_policy(ConnectionImpl** conn,
uint32_t* heap_rkey) {
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(conn), sizeof(ConnectionImpl)));
new (*conn) ConnectionImpl(this, heap_rkey);
}
/*
* Create and write the rdma segment to the SQ
*/
void Connection::set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address,
uint32_t rkey) {
rdma->raddr = htobe64(address);
rdma->rkey = htobe32(rkey);
}
/*
* Retrieve the address of a SQ.
* We used this address to write the WQE directly to the SQ.
*/
uint64_t* Connection::get_address_sq(int i) {
mlx5dv_obj mlx_obj;
mlx5dv_qp qp_out;
mlx_obj.qp.in = qps[i];
mlx_obj.qp.out = &qp_out;
mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP);
return reinterpret_cast<uint64_t*>(qp_out.sq.buf);
}
void* Connection::buf_alloc([[maybe_unused]] struct ibv_pd* pd,
[[maybe_unused]] void* pd_context, size_t size,
[[maybe_unused]] size_t alignment,
[[maybe_unused]] uint64_t resource_type) {
if (use_gpu_mem) {
void* dev_ptr;
if (coherent_cq == 1) {
#if defined USE_COHERENT_HEAP
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&dev_ptr), size));
#else
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
hipDeviceMallocUncached));
#else
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
hipDeviceMallocFinegrained));
#endif
#endif
} else {
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
hipDeviceMallocUncached));
#else
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
hipDeviceMallocFinegrained));
#endif
}
memset(dev_ptr, 0, size);
return dev_ptr;
}
return IBV_ALLOCATOR_USE_DEFAULT;
}
void Connection::buf_release([[maybe_unused]] struct ibv_pd* pd,
[[maybe_unused]] void* pd_context, void* ptr,
[[maybe_unused]] uint64_t resource_type) {
if (use_gpu_mem) {
CHECK_HIP(hipFree(ptr));
} else {
free(ptr);
}
}
void Connection::init_parent_domain_attr(ibv_parent_domain_init_attr* attr1) {
attr1->pd = ib_state->pd;
attr1->td = nullptr;
attr1->comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS;
attr1->alloc = Connection::buf_alloc;
attr1->free = Connection::buf_release;
attr1->pd_context = nullptr;
}
ibv_cq* Connection::create_cq(ibv_context* context, ibv_pd* pd, int cqe) {
use_gpu_mem = cq_use_gpu_mem;
ibv_cq_init_attr_ex cq_attr;
memset(&cq_attr, 0, sizeof(ibv_cq_init_attr_ex));
cq_attr.cqe = cqe;
cq_attr.cq_context = nullptr;
cq_attr.channel = nullptr;
cq_attr.comp_vector = 0;
cq_attr.flags = 0; // see ibv_exp_cq_create_flags
cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_PD;
cq_attr.parent_domain = pd;
coherent_cq = 1;
ibv_cq_ex* cq = ibv_create_cq_ex(context, &cq_attr);
coherent_cq = 0;
if (!cq) {
printf("error in ibv_create_cq_ex: %d %s\n", errno, strerror(errno));
return nullptr;
}
return ibv_cq_ex_to_cq(cq);
}
void Connection::init_gpu_qp_from_connection(QueuePair* gpu_qp,
int conn_num) {
int hip_dev_id = 0;
CHECK_HIP(hipGetDevice(&hip_dev_id));
use_gpu_mem = cq_use_gpu_mem;
mlx5dv_cq cq_out;
mlx5dv_obj mlx_obj;
mlx_obj.cq.in = cqs[conn_num];
mlx_obj.cq.out = &cq_out;
mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
gpu_qp->cq_log_size = log2(cq_out.cqe_cnt);
gpu_qp->cq_size = cq_out.cqe_cnt;
void* gpu_ptr = nullptr;
if (use_gpu_mem) {
gpu_qp->current_cq_q = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
} else {
rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(cq_out.buf),
cq_out.cqe_cnt * 64, &gpu_ptr, hip_dev_id);
gpu_qp->current_cq_q = reinterpret_cast<mlx5_cqe64*>(gpu_ptr);
}
gpu_qp->current_cq_q_H = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(cq_out.dbrec), 64,
&gpu_ptr, hip_dev_id);
gpu_qp->dbrec_cq = reinterpret_cast<volatile uint32_t*>(gpu_ptr);
use_gpu_mem = sq_use_gpu_mem;
mlx5dv_qp qp_out;
mlx_obj.qp.in = qps[conn_num];
mlx_obj.qp.out = &qp_out;
mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP);
gpu_qp->max_nwqe = (qp_out.sq.wqe_cnt);
volatile uint32_t* dbrec_send = qp_out.dbrec + 1;
if (use_gpu_mem) {
gpu_qp->current_sq = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
gpu_qp->dbrec_send = reinterpret_cast<volatile uint32_t*>(dbrec_send);
} else {
gpu_ptr = nullptr;
rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(qp_out.sq.buf),
qp_out.sq.wqe_cnt * 64, &gpu_ptr,
hip_dev_id);
gpu_qp->current_sq = reinterpret_cast<uint64_t*>(gpu_ptr);
rocm_memory_lock_to_fine_grain(
reinterpret_cast<void*>(const_cast<uint32_t*>(dbrec_send)), 32,
&gpu_ptr, hip_dev_id);
gpu_qp->dbrec_send = reinterpret_cast<volatile uint32_t*>(gpu_ptr);
}
gpu_qp->current_sq_H = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
gpu_qp->setDBval(*(reinterpret_cast<uint64_t*>(qp_out.sq.buf)));
rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr,
hip_dev_id);
gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
uint32_t* sq = reinterpret_cast<uint32_t*>(qp_out.sq.buf);
uint32_t ctrl_qp_sq = (reinterpret_cast<uint32_t*>(sq))[1];
gpu_qp->ctrl_qp_sq = ctrl_qp_sq & 0xFFFFFF;
gpu_qp->ctrl_sig = (reinterpret_cast<uint64_t*>(sq))[1];
gpu_qp->rkey = (reinterpret_cast<uint32_t*>(sq))[6 + key_offset];
gpu_qp->lkey = (reinterpret_cast<uint32_t*>(sq))[9 + key_offset];
}
ibv_qp* Connection::create_qp(ibv_pd* pd, ibv_context* context,
ibv_qp_init_attr_ex* qp_attr, ibv_cq* cq) {
use_gpu_mem = sq_use_gpu_mem;
ibv_qp* qp = nullptr;
assert(pd);
assert(context);
assert(qp_attr);
qp_attr->send_cq = cq;
qp_attr->recv_cq = cq;
qp_attr->pd = pd;
qp_attr->comp_mask = IBV_QP_INIT_ATTR_PD;
qp = create_qp_0(context, qp_attr);
if (!qp) {
printf("***** error ibv_create_qp failed %d m %m \n", errno, errno);
ibv_destroy_cq(cq);
}
return qp;
}
} // namespace rocshmem
-259
View File
@@ -1,259 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
#define LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
#include <infiniband/verbs.h>
extern "C" {
#include <infiniband/mlx5dv.h>
}
#include <vector>
#include "rocshmem/rocshmem.hpp"
#include "connection_policy.hpp"
namespace rocshmem {
class GPUIBBackend;
class QueuePair;
class Connection {
protected:
typedef struct ib_state {
struct ibv_context* context;
struct ibv_pd* pd;
struct ibv_mr* mr;
struct ibv_port_attr portinfo;
} ib_state_t;
typedef struct dest_info {
int lid;
int qpn;
int psn;
union ibv_gid gid;
} dest_info_t;
typedef struct heap_info {
void* base_heap;
uint32_t rkey;
} heap_info_t;
struct sq_post_dv_t {
uint64_t segments[16];
uint32_t current_sq;
uint16_t wqe_idx;
};
class State {
public:
ibv_qp_attr exp_qp_attr{};
uint64_t exp_attr_mask{};
};
class InitQPState : public State {
public:
InitQPState() {
exp_qp_attr.qp_state = IBV_QPS_INIT;
exp_qp_attr.qp_access_flags =
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
exp_attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
}
};
class RtrState : public State {
public:
RtrState() {
exp_qp_attr.qp_state = IBV_QPS_RTR;
exp_qp_attr.path_mtu = IBV_MTU_4096;
exp_qp_attr.ah_attr.sl = 1;
exp_qp_attr.max_dest_rd_atomic = 1;
exp_qp_attr.min_rnr_timer = 12;
exp_attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU;
}
};
class RtsState : public State {
public:
RtsState() {
exp_qp_attr.qp_state = IBV_QPS_RTS;
exp_qp_attr.timeout = 14;
exp_qp_attr.retry_cnt = 7;
exp_qp_attr.rnr_retry = 7;
exp_qp_attr.max_rd_atomic = 1;
exp_attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC;
}
};
class QPInitAttr {
public:
explicit QPInitAttr(ibv_qp_cap cap) {
attr.cap = cap;
attr.sq_sig_all = 0;
}
ibv_qp_init_attr_ex attr{};
};
public:
Connection(GPUIBBackend* backend, int key_offset);
virtual ~Connection();
void initialize(int num_block);
void finalize();
virtual void post_wqes() = 0;
void reg_mr(void* ptr, size_t size, ibv_mr** mr, bool is_managed);
virtual void get_remote_conn(int* remote_conn) = 0;
unsigned total_number_connections();
virtual void initialize_rkey_handle(uint32_t** heap_rkey_handle,
ibv_mr* mr) = 0;
virtual void free_rkey_handle(uint32_t* heap_rkey_handle) = 0;
void initialize_gpu_policy(ConnectionImpl** conn, uint32_t* heap_rkey);
/*
* Populate a QueuePair for use on the GPU from the internal IB state.
*/
void init_gpu_qp_from_connection(QueuePair* qp, int conn_num);
protected:
Connection() = default;
virtual InitQPState initqp(uint8_t port) = 0;
virtual RtrState rtr(dest_info_t* dest, uint8_t port) = 0;
virtual RtsState rts(dest_info_t* dest) = 0;
virtual QPInitAttr qpattr(ibv_qp_cap cap) = 0;
void init_qp_status(ibv_qp* qp, uint8_t port);
void change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port);
void change_status_rts(ibv_qp* qp, dest_info_t* dest);
void create_qps(uint8_t port, int my_rank, ibv_port_attr* ib_port_att);
template <typename T>
void try_to_modify_qp(ibv_qp* qp, T state);
virtual void create_qps_1() = 0;
virtual void create_qps_2(int port, int my_rank,
ibv_port_attr* ib_port_att) = 0;
virtual void create_qps_3(int port, ibv_qp* qp, int offset,
ibv_port_attr* ib_port_att) = 0;
virtual ibv_qp* create_qp_0(ibv_context* context,
ibv_qp_init_attr_ex* qp_attr) = 0;
virtual void allocate_dynamic_members(int num_block) = 0;
virtual void free_dynamic_members() = 0;
virtual void initialize_1(int port, int num_block) = 0;
virtual void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah,
int dc_key) = 0;
virtual int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) = 0;
void set_sq_dv(int num_block, int wg_idx, int pe_idx);
/*
* ibv interface functions must be static.
*/
static void* buf_alloc(ibv_pd* pd, void* pd_context, size_t size,
size_t alignment, uint64_t resource_type);
static void buf_release(ibv_pd* pd, void* pd_context, void* ptr,
uint64_t resource_type);
void init_parent_domain_attr(ibv_parent_domain_init_attr* attr);
void set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address, uint32_t rkey);
uint64_t* get_address_sq(int i);
ibv_cq* create_cq(ibv_context* context, ibv_pd* pd, int cqe);
ibv_qp* create_qp(ibv_pd* pd, ibv_context* context,
ibv_qp_init_attr_ex* qp_attr, ibv_cq* rcq);
/*
* TODO: Remove this eventually. Goal is to have backend delegate
* connection stuff to this class, while this class knows nothing about
* GPUs or backends.
*/
GPUIBBackend* backend{nullptr};
uint32_t sq_size{1024};
ib_state_t* ib_state{nullptr};
const int key_offset{0};
sq_post_dv_t* sq_post_dv{nullptr};
std::vector<ibv_cq*> cqs;
std::vector<ibv_qp*> qps;
uint64_t counter_wqe{0};
static int use_gpu_mem;
static int coherent_cq;
int cq_use_gpu_mem{1};
int sq_use_gpu_mem{1};
private:
void init_shmem_handle();
void ib_init(ibv_device* ib_dev, uint8_t port);
char* requested_dev{nullptr};
ibv_device** dev_list{nullptr};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
-81
View File
@@ -1,81 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "connection_policy.hpp"
#include <infiniband/mlx5dv.h>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "dynamic_connection.hpp"
#include "queue_pair.hpp"
#ifdef DEBUG
#define HIP_ENABLE_PRINTF 1
#endif
namespace rocshmem {
RCConnectionImpl::RCConnectionImpl([[maybe_unused]] Connection* conn,
[[maybe_unused]] uint32_t* _vec_rkey) {}
DCConnectionImpl::DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey)
: vec_dct_num(static_cast<DynamicConnection*>(conn)->get_vec_dct_num()),
vec_rkey(_vec_rkey),
vec_lids(static_cast<DynamicConnection*>(conn)->get_vec_lids()) {}
__device__ uint32_t RCConnectionImpl::getNumWqesImpl([
[maybe_unused]] uint8_t opcode) {
return 1;
}
__device__ uint32_t DCConnectionImpl::getNumWqesImpl(uint8_t opcode) {
// FIXME: We assume all threads in wave are performing ATOMIC ops.
// While this might be common, we do not have such restriction
// so need to be fixed.
// Since OFED 5.2, a DC segments uses 48bytes - so with or without
// atomic we need 2 wqes.
// return 2;
return (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS)
? 2
: 1;
}
__device__ bool RCConnectionImpl::updateConnectionSegmentImpl(
[[maybe_unused]] ib_mlx5_base_av_t* wqe, [[maybe_unused]] int pe) {
return false;
}
__device__ bool DCConnectionImpl::updateConnectionSegmentImpl(
ib_mlx5_base_av_t* wqe, int pe) {
wqe->dqp_dct = vec_dct_num[pe];
wqe->rlid = vec_lids[pe];
return true;
}
__device__ void RCConnectionImpl::setRkeyImpl([[maybe_unused]] uint32_t* rkey,
[[maybe_unused]] int pe) {}
__device__ void DCConnectionImpl::setRkeyImpl(uint32_t* rkey, int pe) {
*rkey = vec_rkey[pe];
}
} // namespace rocshmem
-130
View File
@@ -1,130 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
#define LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "infiniband_structs.hpp"
namespace rocshmem {
/*
* CRTP base class for connection type
*/
template <typename Derived>
class ConnectionBase {
public:
/*
* Control segment WQE offset imposed by this connection type.
*/
__device__ int wqeCntrlOffset() {
return static_cast<Derived*>(this)->wqeCntrlOffsetImpl();
}
/*
* Whether or not we need to force PE-level divergence when posting for
* this connection type.
*/
__device__ bool forcePostDivergence() {
return static_cast<Derived*>(this)->forcePostDivergenceImpl();
}
/*
* Number of WQEs produced by this connection type for the given opcode.
*/
__device__ uint32_t getNumWqes(uint8_t opcode) {
return static_cast<Derived*>(this)->getNumWqesImpl(opcode);
}
/*
* Updates the connection-specific segment in the SQ.
*/
__device__ bool updateConnectionSegment(ib_mlx5_base_av_t* wqe, int pe) {
return static_cast<Derived*>(this)->updateConnectionSegmentImpl(wqe, pe);
}
/*
* Set the rkey based on this connection type.
*/
__device__ void setRkey(uint32_t* rkey, int pe) {
static_cast<Derived*>(this)->setRkeyImpl(rkey, pe);
}
};
class Connection;
/*
* Connection policy corresponding to an RC connection type.
*/
class RCConnectionImpl : public ConnectionBase<RCConnectionImpl> {
public:
RCConnectionImpl(Connection* conn, uint32_t* _vec_rkey);
__device__ int wqeCntrlOffsetImpl() { return 0; }
__device__ bool forcePostDivergenceImpl() { return true; }
__device__ uint32_t getNumWqesImpl(uint8_t opcode);
__device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe);
__device__ void setRkeyImpl(uint32_t* rkey, int pe);
};
/*
* Connection policy corresponding to a DC connection type.
*/
class DCConnectionImpl : public ConnectionBase<DCConnectionImpl> {
public:
DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey);
__device__ int wqeCntrlOffsetImpl() { return 1; }
__device__ bool forcePostDivergenceImpl() { return false; }
__device__ uint32_t getNumWqesImpl(uint8_t opcode);
__device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe);
__device__ void setRkeyImpl(uint32_t* rkey, int pe);
private:
uint32_t* vec_dct_num{nullptr};
uint32_t* vec_rkey{nullptr};
uint16_t* vec_lids{nullptr};
};
/*
* Select which one of our connection policies to use at compile time.
*/
#ifdef USE_DC
typedef DCConnectionImpl ConnectionImpl;
#else
typedef RCConnectionImpl ConnectionImpl;
#endif
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
-371
View File
@@ -1,371 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "context_ib_device.hpp"
#include <hip/hip_runtime.h>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "backend_ib.hpp"
#include "queue_pair.hpp"
namespace rocshmem {
__host__ GPUIBContext::GPUIBContext(Backend *backend, bool option, int idx)
: Context(backend, option) {
GPUIBBackend *b{static_cast<GPUIBBackend *>(backend)};
ctx_idx = idx;
networkImpl = b->networkImpl;
base_heap = b->heap.get_heap_bases().data();
networkImpl.networkHostInit(this, idx);
barrier_sync = b->barrier_sync;
ipcImpl_.ipc_bases = b->ipcImpl.ipc_bases;
ipcImpl_.shm_size = b->ipcImpl.shm_size;
}
__device__ void GPUIBContext::ctx_create() {
/* Nothing to do in the GPU_IB backend */
return;
}
/*
* TODO(bpotter): these will go in a policy class based on DC/RC.
* I am not completely sure at this point what else is needed in said class,
* so just leave them up here for now.
*/
__device__ __host__ QueuePair *GPUIBContext::getQueuePair(int pe) {
return networkImpl.getQueuePair(device_qp_proxy, pe);
}
__device__ __host__ int GPUIBContext::getNumQueuePairs() {
return networkImpl.getNumQueuePairs();
}
__device__ __host__ int GPUIBContext::getNumDest() {
return networkImpl.getNumDest();
}
__device__ void GPUIBContext::fence() {
#ifdef USE_SINGLE_NODE
threadfence_system();
#else
for (int k = 0; k < getNumDest(); k++) {
getQueuePair(k)->fence(k);
}
fence_.flush();
#endif
}
__device__ void GPUIBContext::fence(int pe) {
#ifdef USE_SINGLE_NODE
threadfence_system();
#else
getQueuePair(pe)->fence(pe);
fence_.flush();
#endif
}
__device__ void GPUIBContext::putmem_nbi(void *dest, const void *source,
size_t nelems, int pe) {
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset,
const_cast<void *>(source), nelems);
} else {
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
if (!must_send_message) {
return;
}
auto *qp = getQueuePair(pe);
qp->put_nbi<THREAD>(base_heap[pe] + L_offset, source, nelems, pe, true);
}
}
__device__ void GPUIBContext::getmem_nbi(void *dest, const void *source,
size_t nelems, int pe) {
const char *src_typed = reinterpret_cast<const char *>(source);
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
} else {
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
if (!must_send_message) {
return;
}
auto *qp = getQueuePair(pe);
qp->get_nbi<THREAD>(base_heap[pe] + L_offset, dest, nelems, pe, true);
}
}
__device__ void GPUIBContext::quiet() {
#ifdef USE_SINGLE_NODE
threadfence_system();
for (int pe = 0; pe < ipcImpl_.shm_size; pe++) {
if (pe != my_pe) {
ipcImpl_.zero_byte_read(pe);
}
}
#else
for (int k = 0; k < getNumDest(); k++) {
getQueuePair(k)->quiet_single_heavy<THREAD>(k);
}
fence_.flush();
#endif
}
__device__ void *GPUIBContext::shmem_ptr(const void *dest, int pe) {
void *ret = nullptr;
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
void *dst = const_cast<void *>(dest);
uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
int local_pe = pe % ipcImpl_.shm_size;
ret = ipcImpl_.ipc_bases[local_pe] + L_offset;
}
return ret;
}
__device__ void GPUIBContext::threadfence_system() {
int thread_id = get_flat_block_id();
if (thread_id % WF_SIZE == lowerID()) {
#ifdef USE_SINGLE_NODE
// Flush current PE HDP
HdpPolicy::hdp_flush(
reinterpret_cast<unsigned int *>(networkImpl.hdp_address));
// Flush the rest of the HDPs
for (int pe = 0; pe < ipcImpl_.shm_size; pe++) {
auto target_address = networkImpl.hdp_address;
const int value = HdpPolicy::HDP_FLUSH_VAL;
if (pe != my_pe) {
const int value = HdpPolicy::HDP_FLUSH_VAL;
auto mapped_address =
shmem_ptr(reinterpret_cast<void *>(target_address), pe);
__hip_atomic_store(static_cast<int *>(mapped_address), value,
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
}
#else
getQueuePair(my_pe)->hdp_policy->flushCoherency();
#endif
}
__threadfence_system();
}
__device__ void GPUIBContext::getmem(void *dest, const void *source,
size_t nelems, int pe) {
const char *src_typed = reinterpret_cast<const char *>(source);
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
} else {
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
if (!must_send_message) {
return;
}
auto *qp = getQueuePair(pe);
qp->get_nbi_cqe<THREAD>(base_heap[pe] + L_offset, dest, nelems, pe, true);
qp->quiet_single<THREAD>();
}
fence_.flush();
}
__device__ void GPUIBContext::putmem(void *dest, const void *source,
size_t nelems, int pe) {
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset,
const_cast<void *>(source), nelems);
threadfence_system();
ipcImpl_.zero_byte_read(pe);
} else {
bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
if (!must_send_message) {
return;
}
auto *qp = getQueuePair(pe);
qp->put_nbi_cqe<THREAD>(base_heap[pe] + L_offset, source, nelems, pe, true);
qp->quiet_single<THREAD>();
}
fence_.flush();
}
/******************************************************************************
************************ WORKGROUP/WAVE-LEVEL RMA API ************************
*****************************************************************************/
__device__ void GPUIBContext::putmem_nbi_wg(void *dest, const void *source,
size_t nelems, int pe) {
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset,
const_cast<void *>(source), nelems);
} else {
if (is_thread_zero_in_block()) {
auto *qp = getQueuePair(pe);
qp->put_nbi<WG>(base_heap[pe] + L_offset, source, nelems, pe, true);
}
}
__syncthreads();
}
__device__ void GPUIBContext::putmem_nbi_wave(void *dest, const void *source,
size_t nelems, int pe) {
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset,
const_cast<void *>(source), nelems);
} else {
if (is_thread_zero_in_wave()) {
auto *qp = getQueuePair(pe);
qp->put_nbi<WAVE>(base_heap[pe] + L_offset, source, nelems, pe, true);
}
}
}
__device__ void GPUIBContext::putmem_wg(void *dest, const void *source,
size_t nelems, int pe) {
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset,
const_cast<void *>(source), nelems);
__syncthreads();
threadfence_system();
ipcImpl_.zero_byte_read(pe);
} else {
auto *qp = getQueuePair(pe);
if (is_thread_zero_in_block()) {
qp->put_nbi_cqe<WG>(base_heap[pe] + L_offset, source, nelems, pe, true);
}
qp->quiet_single<WG>();
}
__syncthreads();
fence_.flush();
}
__device__ void GPUIBContext::putmem_wave(void *dest, const void *source,
size_t nelems, int pe) {
uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
auto *qp = getQueuePair(pe);
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset,
const_cast<void *>(source), nelems);
threadfence_system();
ipcImpl_.zero_byte_read(pe);
} else {
if (is_thread_zero_in_wave()) {
qp->put_nbi_cqe<WAVE>(base_heap[pe] + L_offset, source, nelems, pe, true);
}
qp->quiet_single<WAVE>();
}
fence_.flush();
}
__device__ void GPUIBContext::getmem_wg(void *dest, const void *source,
size_t nelems, int pe) {
const char *src_typed = reinterpret_cast<const char *>(source);
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
auto *qp = getQueuePair(pe);
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
} else {
if (is_thread_zero_in_block()) {
qp->get_nbi_cqe<WG>(base_heap[pe] + L_offset, dest, nelems, pe, true);
}
qp->quiet_single<WG>();
}
__syncthreads();
fence_.flush();
}
__device__ void GPUIBContext::getmem_wave(void *dest, const void *source,
size_t nelems, int pe) {
const char *src_typed = reinterpret_cast<const char *>(source);
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
auto *qp = getQueuePair(pe);
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset,
nelems);
} else {
if (is_thread_zero_in_wave()) {
qp->get_nbi_cqe<WAVE>(base_heap[pe] + L_offset, dest, nelems, pe, true);
}
qp->quiet_single<WAVE>();
}
fence_.flush();
}
__device__ void GPUIBContext::getmem_nbi_wg(void *dest, const void *source,
size_t nelems, int pe) {
const char *src_typed = reinterpret_cast<const char *>(source);
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
} else {
if (is_thread_zero_in_block()) {
auto *qp = getQueuePair(pe);
qp->get_nbi<WG>(base_heap[pe] + L_offset, dest, nelems, pe, true);
}
}
__syncthreads();
}
__device__ void GPUIBContext::getmem_nbi_wave(void *dest, const void *source,
size_t nelems, int pe) {
const char *src_typed = reinterpret_cast<const char *>(source);
uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
int local_pe = pe % ipcImpl_.shm_size;
ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset,
nelems);
} else {
if (is_thread_zero_in_wave()) {
auto *qp = getQueuePair(pe);
qp->get_nbi<WAVE>(base_heap[pe] + L_offset, dest, nelems, pe, true);
}
}
}
} // namespace rocshmem
-304
View File
@@ -1,304 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
#include "../context.hpp"
#include "memory_builder_policy.hpp"
#include "network_policy.hpp"
namespace rocshmem {
class QueuePair;
class GPUIBContext : public Context {
public:
__host__ GPUIBContext(Backend *b, bool option, int idx);
__device__ __host__ QueuePair *getQueuePair(int pe);
__device__ __host__ int getNumQueuePairs();
__device__ __host__ int getNumDest();
__device__ __attribute__((noinline)) void threadfence_system();
__device__ void ctx_create();
__device__ void ctx_destroy();
__device__ void putmem(void *dest, const void *source, size_t nelems, int pe);
__device__ void getmem(void *dest, const void *source, size_t nelems, int pe);
__device__ void putmem_nbi(void *dest, const void *source, size_t nelems,
int pe);
__device__ void getmem_nbi(void *dest, const void *source, size_t size,
int pe);
__device__ void fence();
__device__ void fence(int pe);
__device__ void quiet();
__device__ void *shmem_ptr(const void *dest, int pe);
__device__ void barrier_all();
__device__ void sync_all();
__device__ void sync(rocshmem_team_t team);
template <typename T>
__device__ void amo_add(void *dst, T value, int pe);
template <typename T>
__device__ void amo_set(void *dst, T value, int pe);
template <typename T>
__device__ T amo_swap(void *dst, T value, int pe);
template <typename T>
__device__ T amo_fetch_and(void *dst, T value, int pe);
template <typename T>
__device__ void amo_and(void *dst, T value, int pe);
template <typename T>
__device__ T amo_fetch_or(void *dst, T value, int pe);
template <typename T>
__device__ void amo_or(void *dst, T value, int pe);
template <typename T>
__device__ T amo_fetch_xor(void *dst, T value, int pe);
template <typename T>
__device__ void amo_xor(void *dst, T value, int pe);
template <typename T>
__device__ void amo_cas(void *dst, T value, T cond, int pe);
template <typename T>
__device__ T amo_fetch_add(void *dst, T value, int pe);
template <typename T>
__device__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
template <typename T>
__device__ void p(T *dest, T value, int pe);
template <typename T>
__device__ T g(const T *source, int pe);
template <typename T, ROCSHMEM_OP Op>
__device__ void to_all(T *dest, const T *source, int nreduce, int PE_start,
int logPE_stride, int PE_size, T *pWrk,
long *pSync); // NOLINT(runtime/int)
template <typename T, ROCSHMEM_OP Op>
__device__ void to_all(rocshmem_team_t team, T *dest, const T *source,
int nreduce);
template <typename T>
__device__ void put(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void get(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T>
__device__ void broadcast(T *dest, const T *source, int nelems, int pe_root,
int pe_start, int log_pe_stride, int pe_size,
long *p_sync); // NOLINT(runtime/int)
template <typename T>
__device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void alltoall_broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void alltoall_brucks(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void alltoall_gcen2(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect_broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect_brucks(rocshmem_team_t team, T *dest,
const T *source, int nelems);
template <typename T>
__device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source,
int nelems);
template <typename T>
__device__ void fcollect_gcen2(rocshmem_team_t team, T *dest,
const T *source, int nelems);
__device__ void putmem_wg(void *dest, const void *source, size_t nelems,
int pe);
__device__ void getmem_wg(void *dest, const void *source, size_t nelems,
int pe);
__device__ void putmem_nbi_wg(void *dest, const void *source, size_t nelems,
int pe);
__device__ void getmem_nbi_wg(void *dest, const void *source, size_t size,
int pe);
__device__ void putmem_wave(void *dest, const void *source, size_t nelems,
int pe);
__device__ void getmem_wave(void *dest, const void *source, size_t nelems,
int pe);
__device__ void putmem_nbi_wave(void *dest, const void *source, size_t nelems,
int pe);
__device__ void getmem_nbi_wave(void *dest, const void *source, size_t size,
int pe);
template <typename T>
__device__ void put_wg(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void put_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void get_wg(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void get_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void put_wave(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void put_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void get_wave(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
private:
template <typename T, ROCSHMEM_OP Op>
__device__ void internal_direct_allreduce(
T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
int PE_size, T *pWrk,
long *pSync); // NOLINT(runtime/int)
template <typename T, ROCSHMEM_OP Op>
__device__ void internal_ring_allreduce(T *dst, const T *src, int nelems,
int PE_start, int logPE_stride,
int PE_size, T *pWrk,
long *pSync, // NOLINT(runtime/int)
int n_seg, int seg_size,
int chunk_size);
template <typename T>
__device__ void internal_put_broadcast(T *dst, const T *src, int nelems,
int pe_root, int PE_start,
int logPE_stride, int PE_size,
long *pSync); // NOLINT(runtime/int)
template <typename T>
__device__ void internal_get_broadcast(T *dst, const T *src, int nelems,
int pe_root,
long *pSync); // NOLINT(runtime/int)
__device__ void internal_direct_barrier(int pe, int PE_start, int stride,
int n_pes, int64_t *pSync);
__device__ void internal_atomic_barrier(int pe, int PE_start, int stride,
int n_pes, int64_t *pSync);
__device__ void internal_sync(int pe, int PE_start, int stride, int PE_size,
int64_t *pSync);
__device__ void quiet_single(int cq_num);
public:
/*
* Collection of queue pairs that are currently checked out by this
* context from GPUIBBackend.
*/
// FIXME: keep it private and destroy in destructor for better
// encapsulation.
QueuePair *device_qp_proxy{nullptr};
/*
* Array of char * pointers corresponding to the heap base pointers VA for
* each PE that we can communicate with.
*/
char *const *base_heap{nullptr};
/*
* Buffer used to store the results of a *_g operation. These ops do not
* provide a destination buffer, so the runtime must manage one.
*/
char *g_ret{nullptr};
NetworkImpl networkImpl{};
/*
* Temporary scratchpad memory used by internal barrier algorithms.
*/
int64_t *barrier_sync{nullptr};
int ctx_idx;
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
-122
View File
@@ -1,122 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "rocshmem/rocshmem.hpp"
#include "../context_incl.hpp"
#include "context_ib_tmpl_device.hpp"
#include "../util.hpp"
namespace rocshmem {
__device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start,
int stride, int n_pes,
int64_t *pSync) {
int64_t flag_val = 1;
if (pe == PE_start) {
// Go through all PE offsets (except current offset = 0)
// and wait until they all reach
for (size_t i = 1; i < n_pes; i++) {
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
pSync[i] = ROCSHMEM_SYNC_VALUE;
}
threadfence_system();
// Announce to other PEs that all have reached
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
put_nbi(&pSync[0], &flag_val, 1, j);
}
} else {
// Mark current PE offset as reached
size_t pe_offset = (pe - PE_start) / stride;
put_nbi(&pSync[pe_offset], &flag_val, 1, PE_start);
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
__device__ void GPUIBContext::internal_atomic_barrier(int pe, int PE_start,
int stride, int n_pes,
int64_t *pSync) {
int64_t flag_val = 1;
if (pe == PE_start) {
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
put_nbi(&pSync[0], &flag_val, 1, j);
}
} else {
amo_add<int64_t>(&pSync[0], flag_val, PE_start);
wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
pSync[0] = ROCSHMEM_SYNC_VALUE;
threadfence_system();
}
}
// Uses PE values that are relative to world
__device__ void GPUIBContext::internal_sync(int pe, int PE_start, int stride,
int PE_size, int64_t *pSync) {
__syncthreads();
if (is_thread_zero_in_block()) {
if (PE_size < 64) {
internal_direct_barrier(pe, PE_start, stride, PE_size, pSync);
} else {
internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync);
}
}
__threadfence();
__syncthreads();
}
__device__ void GPUIBContext::sync(rocshmem_team_t team) {
GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
/**
* Ensure that the stride is a multiple of 2 for GPU_IB.
* TODO: enable GPU_IB to work with non-powers-of-2 strides
* and remove this assert.
*/
assert((dbl_log_pe_stride - log_pe_stride) == 0);
int pe = team_obj->my_pe_in_world;
int pe_start = team_obj->tinfo_wrt_world->pe_start;
int pe_stride = (1 << log_pe_stride);
int pe_size = team_obj->num_pes;
internal_sync(pe, pe_start, pe_stride, pe_size, barrier_sync);
}
__device__ void GPUIBContext::sync_all() {
internal_sync(my_pe, 0, 1, num_pes, barrier_sync);
}
__device__ void GPUIBContext::barrier_all() {
if (is_thread_zero_in_block()) {
quiet();
}
sync_all();
__syncthreads();
}
} // namespace rocshmem
-85
View File
@@ -1,85 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "context_ib_host.hpp"
#include <mpi.h>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../backend_type.hpp"
#include "../context_incl.hpp"
#include "backend_ib.hpp"
#include "../host/host.hpp"
namespace rocshmem {
__host__ GPUIBHostContext::GPUIBHostContext(Backend *backend,
[[maybe_unused]] int64_t options)
: Context(backend, true) {
GPUIBBackend *b{static_cast<GPUIBBackend *>(backend)};
host_interface = b->host_interface;
context_window_info = host_interface->acquire_window_context();
}
__host__ GPUIBHostContext::~GPUIBHostContext() {
host_interface->release_window_context(context_window_info);
}
__host__ void GPUIBHostContext::putmem_nbi(void *dest, const void *source,
size_t nelems, int pe) {
host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info);
}
__host__ void GPUIBHostContext::getmem_nbi(void *dest, const void *source,
size_t nelems, int pe) {
host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info);
}
__host__ void GPUIBHostContext::putmem(void *dest, const void *source,
size_t nelems, int pe) {
host_interface->putmem(dest, source, nelems, pe, context_window_info);
}
__host__ void GPUIBHostContext::getmem(void *dest, const void *source,
size_t nelems, int pe) {
host_interface->getmem(dest, source, nelems, pe, context_window_info);
}
__host__ void GPUIBHostContext::fence() {
host_interface->fence(context_window_info);
}
__host__ void GPUIBHostContext::quiet() {
host_interface->quiet(context_window_info);
}
__host__ void GPUIBHostContext::sync_all() {
host_interface->sync_all(context_window_info);
}
__host__ void GPUIBHostContext::barrier_all() {
host_interface->barrier_all(context_window_info);
}
} // namespace rocshmem
-149
View File
@@ -1,149 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
#include "../context.hpp"
namespace rocshmem {
class GPUIBHostContext : public Context {
public:
__host__ GPUIBHostContext(Backend *b, int64_t options);
__host__ ~GPUIBHostContext();
template <typename T>
__host__ void p(T *dest, T value, int pe);
template <typename T>
__host__ T g(const T *source, int pe);
template <typename T>
__host__ void put(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__host__ void get(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__host__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
template <typename T>
__host__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
__host__ void putmem(void *dest, const void *source, size_t nelems, int pe);
__host__ void getmem(void *dest, const void *source, size_t nelems, int pe);
__host__ void putmem_nbi(void *dest, const void *source, size_t nelems,
int pe);
__host__ void getmem_nbi(void *dest, const void *source, size_t size, int pe);
template <typename T>
__host__ void amo_add(void *dst, T value, int pe);
template <typename T>
__host__ void amo_cas(void *dst, T value, T cond, int pe);
template <typename T>
__host__ T amo_fetch_add(void *dst, T value, int pe);
template <typename T>
__host__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
__host__ void fence();
__host__ void quiet();
__host__ void barrier_all();
__host__ void sync_all();
template <typename T>
__host__ void broadcast(T *dest, const T *source, int nelems, int pe_root,
int pe_start, int log_pe_stride, int pe_size,
long *p_sync); // NOLINT(runtime/int)
template <typename T>
__host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
int nelems, int pe_root);
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
int log_pe_stride, int pe_size, T *p_wrk,
long *p_sync); // NOLINT(runtime/int)
template <typename T, ROCSHMEM_OP Op>
__host__ void to_all(rocshmem_team_t team, T *dest, const T *source,
int nreduce);
template <typename T>
__host__ void wait_until(T *ivars, int cmp, T val);
template <typename T>
__host__ size_t wait_until_any(T *ivars, size_t nelems,
const int *status,
int cmp, T val);
template <typename T>
__host__ void wait_until_all(T *ivars, size_t nelems,
const int *status,
int cmp, T val);
template <typename T>
__host__ size_t wait_until_some(T *ivars, size_t nelems,
size_t* indices,
const int *status,
int cmp, T val);
template <typename T>
__host__ void wait_until_all_vector(T *ivars, size_t nelems,
const int *status,
int cmp, T* vals);
template <typename T>
__host__ size_t wait_until_any_vector(T *ivars, size_t nelems,
const int *status,
int cmp, T* vals);
template <typename T>
__host__ size_t wait_until_some_vector(T *ivars, size_t nelems,
size_t* indices,
const int *status,
int cmp, T* vals);
template <typename T>
__host__ int test(T *ivars, int cmp, T val);
public:
/* Pointer to the backend's host interface */
HostInterface *host_interface{nullptr};
/* An MPI Window implements a context */
WindowInfo *context_window_info{nullptr};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
File diff suppressed because it is too large Load Diff
-173
View File
@@ -1,173 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../host/host_templates.hpp"
namespace rocshmem {
template <typename T>
__host__ void GPUIBHostContext::p(T *dest, T value, int pe) {
host_interface->p<T>(dest, value, pe, context_window_info);
}
template <typename T>
__host__ T GPUIBHostContext::g(const T *source, int pe) {
return host_interface->g<T>(source, pe, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::put(T *dest, const T *source, size_t nelems,
int pe) {
host_interface->put<T>(dest, source, nelems, pe, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::get(T *dest, const T *source, size_t nelems,
int pe) {
host_interface->get<T>(dest, source, nelems, pe, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::put_nbi(T *dest, const T *source, size_t nelems,
int pe) {
host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::get_nbi(T *dest, const T *source, size_t nelems,
int pe) {
host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::amo_add(void *dst, T value, int pe) {
host_interface->amo_add(dst, value, pe, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::amo_cas(void *dst, T value, T cond, int pe) {
host_interface->amo_cas(dst, value, cond, pe, context_window_info);
}
template <typename T>
__host__ T GPUIBHostContext::amo_fetch_add(void *dst, T value, int pe) {
return host_interface->amo_fetch_add(dst, value, pe, context_window_info);
}
template <typename T>
__host__ T GPUIBHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
return host_interface->amo_fetch_cas(dst, value, cond, pe,
context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::broadcast(
T *dest, const T *source, int nelems, int pe_root, int pe_start,
int log_pe_stride, int pe_size,
long *p_sync) { // NOLINT(runtime/int)
host_interface->broadcast<T>(dest, source, nelems, pe_root, pe_start,
log_pe_stride, pe_size, p_sync);
}
template <typename T>
__host__ void GPUIBHostContext::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems,
int pe_root) {
host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
}
template <typename T, ROCSHMEM_OP Op>
__host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce,
int pe_start, int log_pe_stride,
int pe_size, T *p_wrk,
long *p_sync) { // NOLINT(runtime/int)
host_interface->to_all<T, Op>(dest, source, nreduce, pe_start, log_pe_stride,
pe_size, p_wrk, p_sync);
}
template <typename T, ROCSHMEM_OP Op>
__host__ void GPUIBHostContext::to_all(rocshmem_team_t team, T *dest,
const T *source, int nreduce) {
host_interface->to_all<T, Op>(team, dest, source, nreduce);
}
template <typename T>
__host__ void GPUIBHostContext::wait_until(T *ivars, int cmp, T val) {
host_interface->wait_until<T>(ivars, cmp, val, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::wait_until_all(T *ivars, size_t nelems,
const int* status,
int cmp, T val) {
host_interface->wait_until_all<T>(ivars, nelems, status, cmp, val, context_window_info);
}
template <typename T>
__host__ size_t GPUIBHostContext::wait_until_any(T *ivars, size_t nelems,
const int* status,
int cmp, T val) {
return host_interface->wait_until_any<T>(ivars, nelems, status, cmp, val, context_window_info);
}
template <typename T>
__host__ size_t GPUIBHostContext::wait_until_some(T *ivars, size_t nelems,
size_t* indices,
const int* status,
int cmp, T val) {
return host_interface->wait_until_some<T>(ivars, nelems, indices, status, cmp, val, context_window_info);
}
template <typename T>
__host__ void GPUIBHostContext::wait_until_all_vector(T *ivars, size_t nelems,
const int* status,
int cmp, T* vals) {
host_interface->wait_until_all_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
}
template <typename T>
__host__ size_t GPUIBHostContext::wait_until_any_vector(T *ivars, size_t nelems,
const int* status,
int cmp, T* vals) {
return host_interface->wait_until_any_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
}
template <typename T>
__host__ size_t GPUIBHostContext::wait_until_some_vector(T *ivars, size_t nelems,
size_t* indices,
const int* status,
int cmp, T* vals) {
return host_interface->wait_until_some_vector<T>(ivars, nelems, indices, status, cmp, vals, context_window_info);
}
template <typename T>
__host__ int GPUIBHostContext::test(T *ivars, int cmp, T val) {
return host_interface->test<T>(ivars, cmp, val, context_window_info);
}
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
-39
View File
@@ -1,39 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "rocshmem/rocshmem_debug.hpp"
#include "qe_dumper.hpp"
namespace rocshmem {
void debug_print_cq(int dest_pe, int src_wg, int cqe_index) {
QeDumper dumper(dest_pe, src_wg, cqe_index);
dumper.dump_cq();
}
void debug_print_sq(int dest_pe, int src_wg, int wqe_index) {
QeDumper dumper(dest_pe, src_wg, wqe_index);
dumper.dump_sq();
}
} // namespace rocshmem
-381
View File
@@ -1,381 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "dynamic_connection.hpp"
#include <mpi.h>
#include "backend_ib.hpp"
namespace rocshmem {
DynamicConnection::DynamicConnection(GPUIBBackend* b) : Connection(b, 4) {
char* value = nullptr;
if ((value = getenv("ROCSHMEM_NUM_DCIs"))) {
num_dcis = atoi(value);
}
if ((value = getenv("ROCSHMEM_NUM_DCT"))) {
num_dct = atoi(value);
}
}
DynamicConnection::~DynamicConnection() {
CHECK_HIP(hipFree(vec_lids));
CHECK_HIP(hipFree(vec_dct_num));
}
ibv_qp_init_attr_ex DynamicConnection::dct_qp_init_attr(
ibv_cq* cq, ibv_srq* srq, [[maybe_unused]] uint8_t port) const {
ibv_qp_init_attr_ex attr{};
attr.comp_mask = IBV_QP_INIT_ATTR_PD;
attr.pd = ib_state->pd;
attr.recv_cq = cq;
attr.send_cq = cq;
attr.srq = srq;
attr.qp_type = IBV_QPT_DRIVER;
return attr;
}
mlx5dv_qp_init_attr DynamicConnection::dct_dv_init_attr() {
mlx5dv_qp_init_attr dv_attr{};
dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC;
dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCT;
dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY;
return dv_attr;
}
Connection::InitQPState DynamicConnection::initqp(uint8_t port) {
InitQPState initqp{};
initqp.exp_qp_attr.port_num = port;
initqp.exp_qp_attr.pkey_index = 0;
initqp.exp_qp_attr.qp_access_flags = 0;
return initqp;
}
Connection::RtrState DynamicConnection::rtr([[maybe_unused]] dest_info_t* dest,
uint8_t port) {
RtrState rtr{};
rtr.exp_qp_attr.ah_attr.is_global = 1;
rtr.exp_qp_attr.ah_attr.port_num = port;
rtr.exp_qp_attr.max_dest_rd_atomic = 0;
rtr.exp_qp_attr.min_rnr_timer = 0;
return rtr;
}
Connection::RtsState DynamicConnection::rts([
[maybe_unused]] dest_info_t* dest) {
RtsState rts{};
rts.exp_attr_mask |= IBV_QP_SQ_PSN;
return rts;
}
void DynamicConnection::connect_dci(ibv_qp* qp, uint8_t port) {
init_qp_status(qp, port);
change_status_rtr(qp, nullptr, port);
change_status_rts(qp, nullptr);
}
/*
* create a DCT and get is to ready state
*/
void DynamicConnection::create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq,
uint8_t port) {
auto init_attr = dct_qp_init_attr(cq, srq, port);
auto dv_attr = dct_dv_init_attr();
auto dct = mlx5dv_create_qp(ib_state->context, &init_attr, &dv_attr);
if (dct == nullptr) {
printf("Failed to create dct \n");
abort();
}
ibv_qp_attr qp_attr{};
qp_attr.qp_state = IBV_QPS_INIT;
qp_attr.port_num = port;
qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
int attr_mask =
IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
int ret = ibv_modify_qp(dct, &qp_attr, attr_mask);
if (ret) {
abort();
}
qp_attr.qp_state = IBV_QPS_RTR;
qp_attr.path_mtu = IBV_MTU_4096;
qp_attr.min_rnr_timer = 7;
qp_attr.ah_attr.is_global = 1;
qp_attr.ah_attr.grh.hop_limit = 1;
qp_attr.ah_attr.grh.traffic_class = 0;
qp_attr.ah_attr.grh.sgid_index = 0;
qp_attr.ah_attr.port_num = port;
attr_mask = IBV_QP_STATE | IBV_QP_MIN_RNR_TIMER | IBV_QP_AV | IBV_QP_PATH_MTU;
ret = ibv_modify_qp(dct, &qp_attr, attr_mask);
if (ret) {
abort();
}
*dct_num = dct->qp_num;
}
/*
* @brief create a qp (DCI qp) using DEVX
*/
ibv_qp* DynamicConnection::create_qp_0(ibv_context* context,
ibv_qp_init_attr_ex* qp_attr) {
ibv_qp* qp;
qp_attr->qp_type = IBV_QPT_DRIVER;
mlx5dv_qp_init_attr dv_attr{};
dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC;
dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI;
dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY;
qp = mlx5dv_create_qp(context, qp_attr, &dv_attr);
return qp;
}
void DynamicConnection::create_qps_1() {
ibv_srq_init_attr srq_init_attr{};
srq_init_attr.attr.max_wr = 1;
srq_init_attr.attr.max_sge = 1;
srq = ibv_create_srq(ib_state->pd, &srq_init_attr);
if (!srq) {
abort();
}
dct_cq = ibv_create_cq(ib_state->context, 100, nullptr, nullptr, 0);
if (!dct_cq) {
abort();
}
}
void DynamicConnection::create_qps_2(int port, int my_rank,
ibv_port_attr* ib_port_att) {
for (int i = 0; i < num_dct; i++) {
int32_t dct_num;
create_dct(&dct_num, dct_cq, srq, port);
dct_num = htobe32(dct_num);
dcts_num[my_rank * num_dct + i] = dct_num;
}
lids[my_rank] = htobe16(ib_port_att->lid);
}
void DynamicConnection::create_qps_3(
int port, ibv_qp* qp, [[maybe_unused]] int offset,
[[maybe_unused]] ibv_port_attr* ib_port_att) {
return connect_dci(qp, port);
}
void DynamicConnection::get_remote_conn(int* remote_conn) {
*remote_conn = num_dcis;
}
void DynamicConnection::allocate_dynamic_members([
[maybe_unused]] int num_wg) {
size_t num_pes_size_bytes = sizeof(uint16_t) * backend->num_pes;
lids = reinterpret_cast<uint16_t*>(malloc(num_pes_size_bytes));
if (lids == nullptr) {
abort();
}
size_t num_dcts = num_dct * backend->num_pes;
size_t num_dcts_size_bytes = sizeof(uint32_t) * num_dcts;
dcts_num = reinterpret_cast<uint32_t*>(malloc(num_dcts_size_bytes));
if (dcts_num == nullptr) {
abort();
}
}
/*
* get the wqe_av information from tyhe ibv_ah
* rely on DEVX to extract the AV. We use the AV to create
* the DC segment
*/
void DynamicConnection::dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av) {
mlx5dv_obj dv;
mlx5dv_ah dah;
dv.ah.in = ah;
dv.ah.out = &dah;
mlx5dv_init_obj(&dv, MLX5DV_OBJ_AH);
memcpy(mlx5_av, dah.av, sizeof(mlx5_wqe_av));
}
void DynamicConnection::free_dynamic_members() {
free(lids);
free(dcts_num);
}
void DynamicConnection::initialize_1(int port, [[maybe_unused]] int num_wg) {
MPI_Allgather(MPI_IN_PLACE, sizeof(int32_t) * num_dct, MPI_CHAR, dcts_num,
sizeof(int32_t) * num_dct, MPI_CHAR, backend->thread_comm);
MPI_Allgather(MPI_IN_PLACE, sizeof(int16_t), MPI_CHAR, lids, sizeof(int16_t),
MPI_CHAR, backend->thread_comm);
hipStream_t stream;
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&vec_dct_num),
sizeof(int32_t) * num_dct * backend->num_pes));
CHECK_HIP(hipMemcpyAsync(vec_dct_num, dcts_num,
sizeof(int32_t) * num_dct * backend->num_pes,
hipMemcpyHostToDevice, stream));
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&vec_lids),
sizeof(int16_t) * backend->num_pes));
CHECK_HIP(hipMemcpyAsync(vec_lids, lids, sizeof(int16_t) * backend->num_pes,
hipMemcpyHostToDevice, stream));
struct ibv_ah_attr ah_attr;
memset(&ah_attr, 0, sizeof(ah_attr));
ah_attr.is_global = 1;
ah_attr.dlid = ib_state->portinfo.lid;
ah_attr.sl = 1;
ah_attr.src_path_bits = 0;
ah_attr.port_num = port;
ah = ibv_create_ah(ib_state->pd, &ah_attr);
if (ah == nullptr) {
abort();
}
dc_get_av(ah, &mlx5_av);
CHECK_HIP(hipStreamSynchronize(stream));
CHECK_HIP(hipStreamDestroy(stream));
}
void DynamicConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle,
ibv_mr* mr) {
CHECK_HIP(hipMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes));
(*heap_rkey_handle)[backend->my_pe] = htobe32(mr->rkey);
}
void DynamicConnection::free_rkey_handle(uint32_t* heap_rkey_handle) {
CHECK_HIP(hipFree(heap_rkey_handle));
}
Connection::QPInitAttr DynamicConnection::qpattr(ibv_qp_cap cap) {
QPInitAttr qpattr(cap);
return qpattr;
}
/*
* Create and write the DC segment to SQ.
* We get all the info needed from the mlx5_wqe_av that we extract from ibv_ah.
*/
void DynamicConnection::set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg,
uint64_t dc_key, uint32_t dct_num,
uint8_t ext, mlx5_wqe_av* mlx5_av) {
dc_seg->av.key.dc_key = htobe64(dc_key);
dc_seg->av.dqp_dct = htobe32(((uint32_t)ext << 31 | dct_num));
dc_seg->av.stat_rate_sl = mlx5_av->stat_rate_sl;
dc_seg->av.fl_mlid = mlx5_av->fl_mlid;
dc_seg->av.rlid = mlx5_av->rlid;
}
/*
* create a DC wqe and post it to the SQ
* we rely on mlx5dv functions to ceate the ctrl and data
* segments but we use our own function to write teh DC and rdma segments
*/
void DynamicConnection::post_dv_dc_wqe(int remote_conn) {
mlx5_wqe_ctrl_seg* ctrl;
mlx5_wqe_datagram_seg* dc_seg;
mlx5_wqe_raddr_seg* rdma;
mlx5_wqe_data_seg* data;
for (int i = 0; i < remote_conn; i++) {
uint64_t* ptr = get_address_sq(i);
const uint32_t nb_post = 4 * sq_size;
for (uint16_t index = 0; index < nb_post; index++) {
uint8_t op_mod = 0;
uint8_t op_code = 8;
uint32_t qp_num = qps[i]->qp_num;
uint8_t fm_ce_se = 0;
uint8_t ds = 4;
ctrl = reinterpret_cast<mlx5_wqe_ctrl_seg*>(ptr);
mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds, 0,
0);
ptr = ptr + 2;
uint32_t dct_num = dcts_num[i];
uint8_t ext = 1;
dc_seg = reinterpret_cast<mlx5_wqe_datagram_seg*>(ptr);
set_dgram_seg(dc_seg, (uint64_t)DC_IB_KEY, dct_num, ext, &mlx5_av);
ptr = ptr + 2;
uint64_t address = 0;
uint32_t rkey = 0;
rdma = reinterpret_cast<mlx5_wqe_raddr_seg*>(ptr);
set_rdma_seg(rdma, address, rkey);
ptr = ptr + 2;
uint32_t lkey = backend->networkImpl.heap_mr->lkey;
data = reinterpret_cast<mlx5_wqe_data_seg*>(ptr);
mlx5dv_set_data_seg(data, 1, lkey, 0);
ptr = ptr + 2;
}
}
}
// TODO(bpotter): remove redundancies with the other derived class
void DynamicConnection::post_wqes() {
int remote_conn;
get_remote_conn(&remote_conn);
remote_conn *= backend->num_blocks_;
post_dv_dc_wqe(remote_conn);
}
void DynamicConnection::initialize_wr_fields([[maybe_unused]] ibv_send_wr* wr,
[[maybe_unused]] ibv_ah* ah,
[[maybe_unused]] int dc_key) {}
int DynamicConnection::get_sq_dv_offset([[maybe_unused]] int pe_idx,
[[maybe_unused]] int num_qps,
int wg_idx) {
return wg_idx;
}
} // namespace rocshmem
-122
View File
@@ -1,122 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
#define LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
#include "connection.hpp"
namespace rocshmem {
class DynamicConnection : public Connection {
public:
explicit DynamicConnection(GPUIBBackend* backend);
~DynamicConnection() override;
void get_remote_conn(int* remote_conn) override;
void post_wqes() override;
void initialize_rkey_handle(uint32_t** heap_rkey_handle,
ibv_mr* mr) override;
void free_rkey_handle(uint32_t* heap_rkey_handle) override;
uint32_t* get_vec_dct_num() const { return vec_dct_num; }
uint16_t* get_vec_lids() const { return vec_lids; }
private:
InitQPState initqp(uint8_t port) override;
RtrState rtr(dest_info_t* dest, uint8_t port) override;
RtsState rts(dest_info_t* dest) override;
QPInitAttr qpattr(ibv_qp_cap cap) override;
void connect_dci(ibv_qp* qp, uint8_t port);
void create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq, uint8_t port);
ibv_qp_init_attr_ex dct_qp_init_attr(ibv_cq* cq, ibv_srq* srq,
uint8_t port) const;
mlx5dv_qp_init_attr dct_dv_init_attr();
void dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av);
void set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg, uint64_t dc_key,
uint32_t dct_num, uint8_t ext, mlx5_wqe_av* av);
void set_data_seg(mlx5_wqe_data_seg* data_seg, uint32_t lkey);
void post_dv_dc_wqe(int remote_conn);
void create_qps_1() override;
void create_qps_2(int port, int my_rank,
ibv_port_attr* ib_port_att) override;
void create_qps_3(int port, ibv_qp* qp, int offset,
ibv_port_attr* ib_port_att) override;
ibv_qp* create_qp_0(ibv_context* context,
ibv_qp_init_attr_ex* qp_attr) override;
void allocate_dynamic_members(int num_wg) override;
void free_dynamic_members() override;
void initialize_1(int port, int num_wg) override;
void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override;
int get_sq_dv_offset(int pe_idx, int32_t num_qps, int wg_idx) override;
int num_dcis{1};
int num_dct{1};
static constexpr int DC_IB_KEY{0x1ee7a330};
uint32_t* dcts_num{nullptr};
uint16_t* lids{nullptr};
mlx5_wqe_av mlx5_av{};
ibv_ah* ah{nullptr};
ibv_srq* srq{nullptr};
ibv_cq* dct_cq{nullptr};
uint32_t* vec_dct_num{nullptr};
uint16_t* vec_lids{nullptr};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
-79
View File
@@ -1,79 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "endian.hpp"
namespace rocshmem {
template <typename T>
__device__ void swap_endian_store(T *dst, const T val) {
typedef union U {
T val;
uint8_t bytes[sizeof(T)];
} union_type;
union_type src;
union_type dst_tmp;
src.val = val;
std::reverse_copy(src.bytes, src.bytes + sizeof(T), dst_tmp.bytes);
*dst = dst_tmp.val;
}
template <>
__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) {
uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) |
((val >> 8) & 0x00FF00FF00FF00FFULL);
new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) |
((new_val >> 16) & 0x0000FFFF0000FFFFULL);
*dst = (new_val << 32) | (new_val >> 32);
}
template <>
__device__ void swap_endian_store(int64_t *dst, const int64_t val) {
swap_endian_store(reinterpret_cast<uint64_t *>(dst), (const uint64_t)val);
}
template <>
__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) {
uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
*dst = (new_val << 16) | (new_val >> 16);
}
template <>
__device__ void swap_endian_store(int32_t *dst, const int32_t val) {
swap_endian_store(reinterpret_cast<uint32_t *>(dst), (const uint32_t)val);
}
template <>
__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) {
*dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF);
}
template <>
__device__ void swap_endian_store(int16_t *dst, const int16_t val) {
swap_endian_store(reinterpret_cast<uint16_t *>(dst), (const uint16_t)val);
}
} // namespace rocshmem
-53
View File
@@ -1,53 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
#define LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
#include <hip/hip_runtime.h>
namespace rocshmem {
template <typename T>
__device__ void swap_endian_store(T *dst, const T val);
template <>
__device__ void swap_endian_store(uint64_t *dst, const uint64_t val);
template <>
__device__ void swap_endian_store(int64_t *dst, const int64_t val);
template <>
__device__ void swap_endian_store(uint32_t *dst, const uint32_t val);
template <>
__device__ void swap_endian_store(int32_t *dst, const int32_t val);
template <>
__device__ void swap_endian_store(uint16_t *dst, const uint16_t val);
template <>
__device__ void swap_endian_store(int16_t *dst, const int16_t val);
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
-56
View File
@@ -1,56 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "gpu_ib_team.hpp"
#include "../backend_type.hpp"
#include "backend_ib.hpp"
namespace rocshmem {
GPUIBTeam::GPUIBTeam(Backend *backend, TeamInfo *team_info_parent,
TeamInfo *team_info_world, int num_pes, int my_pe,
MPI_Comm mpi_comm, int pool_index)
: Team(backend, team_info_parent, team_info_world, num_pes, my_pe,
mpi_comm) {
type = BackendType::GPU_IB_BACKEND;
const GPUIBBackend *b = static_cast<const GPUIBBackend *>(backend);
pool_index_ = pool_index;
barrier_pSync =
&(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
reduce_pSync =
&(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
alltoall_pSync =
&(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
pAta = reinterpret_cast<char *>(b->pAta_pool) +
ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
}
GPUIBTeam::~GPUIBTeam() {}
} // namespace rocshmem
-50
View File
@@ -1,50 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
#define LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
#include "../team.hpp"
namespace rocshmem {
class GPUIBTeam : public Team {
public:
GPUIBTeam(Backend* handle, TeamInfo* team_info_wrt_parent,
TeamInfo* team_info_wrt_world, int num_pes, int my_pe,
MPI_Comm team_comm, int pool_index);
virtual ~GPUIBTeam();
long* barrier_pSync{nullptr};
long* reduce_pSync{nullptr};
long* bcast_pSync{nullptr};
long* alltoall_pSync{nullptr};
void* pWrk{nullptr};
void* pAta{nullptr};
int pool_index_{-1};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
-49
View File
@@ -1,49 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
#define LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
#include <infiniband/mlx5dv.h>
namespace rocshmem {
typedef struct ib_mlx5_base_av {
uint64_t dc_key;
uint32_t dqp_dct;
uint8_t stat_rate_sl;
uint8_t fl_mlid;
uint16_t rlid;
} ib_mlx5_base_av_t;
union mlx5_segment {
mlx5_wqe_ctrl_seg ctrl_seg;
mlx5_wqe_raddr_seg raddr_seg;
mlx5_wqe_atomic_seg atomic_seg;
mlx5_wqe_data_seg data_seg;
mlx5_wqe_inl_data_seg inl_data_seg;
ib_mlx5_base_av_t base_av;
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
-78
View File
@@ -1,78 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
#define LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
#include <hip/hip_runtime.h>
#include <utility>
namespace rocshmem {
class GPUIBContext;
class MemoryBuilderPolicyWrapper {
public:
__device__ MemoryBuilderPolicyWrapper() = default;
__device__ ~MemoryBuilderPolicyWrapper() {
if (wrapped_policy_) {
delete wrapped_policy_;
}
}
template <typename T>
__device__ MemoryBuilderPolicyWrapper(T&& policy)
: wrapped_policy_(new Wrapper<T>(std::forward<T>(policy))) {}
__device__ void operator()(GPUIBContext* context) {
return (*wrapped_policy_)(context);
}
private:
class PolicyBase {
public:
__device__ virtual void operator()(GPUIBContext* context) = 0;
__device__ virtual ~PolicyBase() {}
};
template <typename T>
class Wrapper : public PolicyBase {
public:
__device__ Wrapper(const T& t) : wrapped_policy_(t) {}
__device__ void operator()(GPUIBContext* context) override {
return wrapped_policy_(context);
}
private:
T wrapped_policy_;
};
PolicyBase* wrapped_policy_;
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
-500
View File
@@ -1,500 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "network_policy.hpp"
#include <mpi.h>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../atomic_return.hpp"
#include "../context_incl.hpp"
#include "backend_ib.hpp"
#include "connection.hpp"
#include "dynamic_connection.hpp"
#include "queue_pair.hpp"
#include "reliable_connection.hpp"
namespace rocshmem {
void NetworkOnImpl::dump_backend_stats(ROCStats *globalStats) {
/*
* TODO(bpotter): Refactor this into the Stats class to remove the ifdef
*/
#ifdef PROFILE
int statblocks = connection->total_number_connections();
uint64_t cycles_ring_sq_db = 0;
uint64_t cycles_update_wqe = 0;
uint64_t cycles_poll_cq = 0;
uint64_t cycles_next_cq = 0;
uint64_t cycles_init = gpu_qps[statblocks - 1].profiler.getStat(INIT);
uint64_t cycles_finalize = gpu_qps[statblocks - 1].profiler.getStat(FINALIZE);
uint64_t total_quiet_count = 0;
uint64_t total_db_count = 0;
uint64_t total_wqe_count = 0;
for (int i = 0; i < statblocks; i++) {
cycles_ring_sq_db += gpu_qps[i].profiler.getStat(RING_SQ_DB);
cycles_update_wqe += gpu_qps[i].profiler.getStat(UPDATE_WQE);
cycles_poll_cq += gpu_qps[i].profiler.getStat(POLL_CQ);
cycles_next_cq += gpu_qps[i].profiler.getStat(NEXT_CQ);
total_quiet_count += gpu_qps[i].profiler.getStat(QUIET_COUNT);
total_db_count += gpu_qps[i].profiler.getStat(DB_COUNT);
total_wqe_count += gpu_qps[i].profiler.getStat(WQE_COUNT);
}
double us_ring_sq_db = cycles_ring_sq_db / gpu_clock_freq_mhz;
double us_update_wqe = cycles_update_wqe / gpu_clock_freq_mhz;
double us_poll_cq = cycles_poll_cq / gpu_clock_freq_mhz;
double us_next_cq = cycles_next_cq / gpu_clock_freq_mhz;
double us_init = cycles_init / gpu_clock_freq_mhz;
double us_finalize = cycles_finalize / gpu_clock_freq_mhz;
const int FIELD_WIDTH = 20;
const int FLOAT_PRECISION = 2;
printf("Counts: Internal Quiets %lu DB Rings %lu WQE Posts %lu\n",
total_quiet_count, total_db_count, total_wqe_count);
printf("\n%*s%*s%*s%*s%*s%*s\n", FIELD_WIDTH + 1, "Init (us)",
FIELD_WIDTH + 1, "Finalize (us)", FIELD_WIDTH + 1, "Ring SQ DB (us)",
FIELD_WIDTH + 1, "Update WQE (us)", FIELD_WIDTH + 1, "Poll CQ (us)",
FIELD_WIDTH + 1, "Next CQ (us)");
uint64_t totalFinalize = globalStats->getStat(NUM_FINALIZE);
printf("%*.*f %*.*f %*.*f %*.*f %*.*f %*.*f\n", FIELD_WIDTH, FLOAT_PRECISION,
us_init / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION,
us_finalize / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION,
us_ring_sq_db / total_db_count, FIELD_WIDTH, FLOAT_PRECISION,
us_update_wqe / total_wqe_count, FIELD_WIDTH, FLOAT_PRECISION,
us_poll_cq / total_quiet_count, FIELD_WIDTH, FLOAT_PRECISION,
us_next_cq / total_quiet_count);
#endif
}
void NetworkOnImpl::reset_backend_stats() {
int statblocks = connection->total_number_connections();
for (size_t i = 0; i < statblocks; i++) {
gpu_qps[i].profiler.resetStats();
}
}
void NetworkOnImpl::exchange_hdp_info(HdpPolicy *hdp_policy,
MPI_Comm thread_comm) {
/*
* Using Connection class, register the host-side hdp flush address
* with the InfiniBand network.
*/
connection->reg_mr(hdp_policy->get_hdp_flush_ptr(), 32, &hdp_mr, false);
/*
* Allocate device-side memory for the remote HDP keys.
*/
CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_rkey),
num_pes * sizeof(uint32_t)));
/*
* Allocate device-side memory for the remote HDP addresses.
*/
CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_address),
num_pes * sizeof(uintptr_t)));
/*
* Allocate host-side memory to exchange hdp keys using MPI_Allgather.
*/
uint32_t *host_hdp_cpy =
reinterpret_cast<uint32_t *>(malloc(num_pes * sizeof(uint32_t)));
if (host_hdp_cpy == nullptr) {
abort();
}
/*
* Allocate host-side memory to exchange hdp addresses using
* MPI_Allgather.
*/
uint32_t **host_hdp_address_cpy =
reinterpret_cast<uint32_t **>(malloc(num_pes * sizeof(uint32_t *)));
if (host_hdp_address_cpy == nullptr) {
free(host_hdp_cpy);
abort();
}
/*
* This processing element writes its personal HDP key and HDP address
* into the host-side arrays which were just allocated.
*/
int my_rank = my_pe;
host_hdp_cpy[my_rank] = htobe32(hdp_mr->rkey);
host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr();
/*
* Do all-to-all exchange of our HDP key with other processing elements.
*/
MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_hdp_cpy,
sizeof(uint32_t), MPI_CHAR, thread_comm);
/*
* Do all-to-all exchange of our HDP address with other processing
* elements.
*/
MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy,
sizeof(uint32_t *), MPI_CHAR, thread_comm);
/*
* Copy the recently exchanged HDP keys to device memory.
*/
hipStream_t stream;
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
CHECK_HIP(hipMemcpyAsync(hdp_rkey, host_hdp_cpy, num_pes * sizeof(uint32_t),
hipMemcpyHostToDevice, stream));
/*
* Copy the recently exchanged HDP addresses to device memory.
*/
CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy,
num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice,
stream));
CHECK_HIP(hipStreamSynchronize(stream));
CHECK_HIP(hipStreamDestroy(stream));
/*
* Free the host-side resources used to exchange HDP resources
* between processing elements.
*/
free(host_hdp_cpy);
free(host_hdp_address_cpy);
}
void NetworkOnImpl::setup_atomic_region() {
/*
* Allocate fine-grained device-side memory for the atomic return
* region.
*/
allocate_atomic_region(&atomic_ret, num_blocks);
/*
* Register the atomic return region on the InfiniBand network.
*/
connection->reg_mr(atomic_ret->atomic_base_ptr,
sizeof(uint64_t) * max_nb_atomic * num_blocks, &mr, false);
/*
* Set member variable from class.
*/
atomic_ret->atomic_lkey = htobe32(mr->lkey);
}
void NetworkOnImpl::heap_memory_rkey(char *local_heap_base, size_t heap_size,
MPI_Comm thread_comm, bool is_managed) {
/*
* Allocate host-side memory to hold remote keys for all processing
* elements.
*/
const size_t rkeys_size = sizeof(uint32_t) * num_pes;
uint32_t *host_rkey_cpy = reinterpret_cast<uint32_t *>(malloc(rkeys_size));
if (host_rkey_cpy == nullptr) {
abort();
}
/*
* Using the Connection class, register the symmetric heap with the
* InfiniBand network.
*/
void *base_heap = local_heap_base;
connection->reg_mr(base_heap, heap_size, &heap_mr, is_managed);
/*
* Using the memory region from the prior heap memory registration,
* allocate and initialize some device-side memory to hold the remote
* keys for the symmetric heap base.
*
* Only the device-side memory entry for this processing element will be
* updated with the key for the heap memory region.
*/
connection->initialize_rkey_handle(&heap_rkey, heap_mr);
/*
* Copy the device-side heap base remote key array to the host-side
* heap base remote key array.
*/
hipStream_t stream;
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
CHECK_HIP(hipMemcpyAsync(host_rkey_cpy, heap_rkey, rkeys_size,
hipMemcpyDeviceToHost, stream));
CHECK_HIP(hipStreamSynchronize(stream));
/*
* Do all-to-all exchange of symmetric heap base remote key between the
* processing elements.
*/
MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_rkey_cpy,
sizeof(uint32_t), MPI_CHAR, thread_comm);
/*
* Copy the recently updated host-side heap base remote key array back
* to the device-side memory.
*/
CHECK_HIP(hipMemcpyAsync(heap_rkey, host_rkey_cpy, rkeys_size,
hipMemcpyHostToDevice, stream));
CHECK_HIP(hipStreamSynchronize(stream));
CHECK_HIP(hipStreamDestroy(stream));
/*
* Free the host-side resources used to do the processing element
* exchange of keys and addresses for the symmetric heap base.
*/
free(host_rkey_cpy);
/*
* Initialize this member variable to hold the InfiniBand memory
* region's local key.
*/
lkey = heap_mr->lkey;
}
void NetworkOnImpl::setup_gpu_qps(GPUIBBackend *B) {
/*
* Determine how many connections are needed.
* The number of connections depends on the connection type and the
* number of workgroups.
*/
int connections;
connection->get_remote_conn(&connections);
connections *= num_blocks;
/*
* Allocate device-side memory for the queue pairs.
*/
CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * connections));
/*
* For every connection, initialize the QueuePair.
*/
for (int i = 0; i < connections; i++) {
new (&gpu_qps[i]) QueuePair(B);
connection->init_gpu_qp_from_connection(&gpu_qps[i], i);
}
}
void NetworkOnImpl::rocshmem_g_init(SymmetricHeap *heap_handle,
MPI_Comm thread_comm) {
init_g_ret(heap_handle, thread_comm, num_blocks, &g_ret);
}
__host__ void NetworkOnImpl::networkHostSetup(GPUIBBackend *B) {
num_pes = B->num_pes;
my_pe = B->my_pe;
num_blocks = B->num_blocks_;
#ifdef USE_DC
connection = new DynamicConnection(B);
#else
connection = new ReliableConnection(B);
#endif
connection->initialize(B->num_blocks_);
exchange_hdp_info(B->hdp_policy, B->thread_comm);
const auto &heap_bases{B->heap.get_heap_bases()};
heap_memory_rkey(heap_bases[my_pe], B->heap.get_size(), B->thread_comm,
B->heap.is_managed());
// The earliest we can allow the main thread to launch a kernel to
// avoid potential deadlock
network_init_done = true;
setup_atomic_region();
connection->initialize_gpu_policy(&connection_policy, heap_rkey);
rocshmem_g_init(&B->heap, B->thread_comm);
connection->post_wqes();
setup_gpu_qps(B);
}
__host__ void NetworkOnImpl::networkHostFinalize() {
CHECK_HIP(hipFree(hdp_rkey));
hdp_rkey = nullptr;
CHECK_HIP(hipFree(hdp_address));
hdp_address = nullptr;
CHECK_HIP(hipFree(atomic_ret));
atomic_ret = nullptr;
CHECK_HIP(hipFree(gpu_qps));
gpu_qps = nullptr;
CHECK_HIP(hipFree(connection_policy));
connection_policy = nullptr;
connection->free_rkey_handle(heap_rkey);
connection->finalize();
delete connection;
connection = nullptr;
}
__host__ void NetworkOnImpl::networkHostInit(GPUIBContext *ctx, int buffer_id) {
int remote_conn = getNumQueuePairs();
CHECK_HIP(hipMalloc(&ctx->device_qp_proxy, remote_conn * sizeof(QueuePair)));
for (int i = 0; i < getNumQueuePairs(); i++) {
/*
* RC gpu_qp is actually [NUM_PE][NUM_BLOCK] qps but is flattened.
* Each num_pe entry contains num_block QPs connected to that PE.
* For RC, we need to iterate gpu_qp[i][buffer_id] to collect a
* single QP for each connected PE in order to build context.
* For DC, NUM_PE = 1 so can just use buffer_id directly.
*/
int offset = num_blocks * i + buffer_id;
new (ctx->getQueuePair(i)) QueuePair(gpu_qps[offset]);
auto *qp = ctx->getQueuePair(i);
qp->global_qp = &gpu_qps[offset];
qp->num_cqs = getNumQueuePairs();
qp->atomic_ret.atomic_base_ptr =
&atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id];
qp->base_heap = ctx->base_heap;
}
ctx->g_ret = g_ret;
}
__device__ void NetworkOnImpl::networkGpuInit(GPUIBContext *ctx,
int buffer_id) {
for (int i = 0; i < getNumQueuePairs(); i++) {
int offset = num_blocks * i + buffer_id;
auto *qp = ctx->getQueuePair(i);
new (qp) QueuePair(gpu_qps[offset]);
qp->global_qp = &gpu_qps[offset];
qp->num_cqs = getNumQueuePairs();
qp->atomic_ret.atomic_base_ptr =
&atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id];
qp->base_heap = ctx->base_heap;
}
ctx->g_ret = g_ret;
}
__device__ __host__ QueuePair *NetworkOnImpl::getQueuePair(QueuePair *qp_handle,
int pe) {
#ifdef USE_DC
return qp_handle;
#else
return &qp_handle[pe];
#endif
}
__device__ __host__ int NetworkOnImpl::getNumQueuePairs() {
#ifdef USE_DC
return 1;
#else
return num_pes;
#endif
}
void NetworkOffImpl::networkHostSetup(GPUIBBackend *B) {
num_pes = B->num_pes;
my_pe = B->my_pe;
num_blocks = B->num_blocks_;
exchange_hdp_info(B->hdp_policy, B->thread_comm);
}
void NetworkOffImpl::exchange_hdp_info(HdpPolicy *hdp_policy,
MPI_Comm thread_comm) {
#ifdef USE_SINGLE_NODE
// We are using the symmetric heap for the HDP flush ptr
hdp_address = reinterpret_cast<uintptr_t *>(hdp_policy->get_hdp_flush_ptr());
#else
/*
* Allocate device-side memory for the remote HDP addresses.
*/
CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_address),
num_pes * sizeof(uintptr_t)));
/*
* Allocate host-side memory to exchange hdp keys using MPI_Allgather.
*/
uint32_t *host_hdp_cpy =
reinterpret_cast<uint32_t *>(malloc(num_pes * sizeof(uint32_t)));
if (host_hdp_cpy == nullptr) {
abort();
}
/*
* Allocate host-side memory to exchange hdp addresses using
* MPI_Allgather.
*/
uint32_t **host_hdp_address_cpy =
reinterpret_cast<uint32_t **>(malloc(num_pes * sizeof(uint32_t *)));
if (host_hdp_address_cpy == nullptr) {
free(host_hdp_cpy);
abort();
}
/*
* This processing element writes its personal HDP address
* into the host-side array which were just allocated.
*/
int my_rank = my_pe;
host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr();
/*
* Do all-to-all exchange of our HDP address with other processing
* elements.
*/
MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy,
sizeof(uint32_t *), MPI_CHAR, thread_comm);
/*
* Copy the recently exchanged HDP addresses to device memory.
*/
hipStream_t stream;
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy,
num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice,
stream));
CHECK_HIP(hipStreamSynchronize(stream));
CHECK_HIP(hipStreamDestroy(stream));
/*
* Free the host-side resources used to exchange HDP resources
* between processing elements.
*/
free(host_hdp_cpy);
free(host_hdp_address_cpy);
#endif
}
void NetworkOffImpl::networkHostFinalize() {
#ifndef USE_SINGLE_NODE
CHECK_HIP(hipFree(hdp_address));
#endif
hdp_address = nullptr;
}
} // namespace rocshmem
-357
View File
@@ -1,357 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
#define LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
#include <hip/hip_runtime.h>
#include <mpi.h>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "connection_policy.hpp"
#include "queue_pair.hpp"
#include "../hdp_policy.hpp"
#include "../memory/symmetric_heap.hpp"
#include "../stats.hpp"
#include "../util.hpp"
struct ibv_mr;
struct hdp_reg_t;
namespace rocshmem {
struct atomic_ret_t;
class GPUIBBackend;
class GPUIBContext;
class GPUIBHostContext;
class Connection;
class NetworkOnImpl {
public:
void dump_backend_stats(ROCStats *globalStats);
void reset_backend_stats();
/**
* @brief setup the network resources and initialization for the
* GPUIBBackend
*/
__host__ void networkHostSetup(GPUIBBackend *B);
/**
* @brief deallocate and close the network resources
*/
__host__ void networkHostFinalize();
/**
* @brief initialize the network resources for each context
*/
__host__ void networkHostInit(GPUIBContext *ctx, int buffer_id);
/**
* @brief initialize the network resources for each context on GPU side
*/
__device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id);
/**
* @brief returns the QP for the targeted pe
*/
__device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe);
/**
* @brief returns the numbers of QPs used per the calling PE
*/
__device__ __host__ int getNumQueuePairs();
/**
* @brief returns the number of PEs accessible via network
*/
__device__ __host__ int getNumDest() { return num_pes; }
static uint32_t externSharedBytes(int num_pes) {
int remote_conn{1};
#ifndef USE_DC
remote_conn = num_pes;
#endif
return remote_conn * sizeof(QueuePair);
}
protected:
/**
* @brief flag to indicated that the helper thread reach this milestone
*/
volatile bool network_init_done{false};
void heap_memory_rkey(char *local_heap_base, size_t heap_size,
MPI_Comm thread_comm, bool is_managed);
/**
* @brief Exchange HDP information between all processing elements.
*
* Each device has a Host Data Path (HDP) associated with it must be
* manually controlled when using fine-grained memory accesses. (The
* symmetric heap is allocated with fine-grained memory to support both
* host memory accesses and device memory accesses.) The HDP can be
* cleared by accessing an address on the device. These addresses must be
* shared across the network (to support updates on remote accesses).
*
* These HDPs are visible to the network by registering them as
* InfiniBand memory regions. Every memory region has a remote key
* which needs to be shared across the network (to access the memory
* region).
*
* This method is responsible to allocating and initializing the
* library's HDP device-side memory and running the all-to-all exchange
* to share both the keys and addresses.
*
* @todo Implement HDP policy class methods to hide most of this
* method. The guts should be encapsulated in the policy class and
* not exposed here in the backend. Within the policy class methods,
* create helper function to improve code reuse regarding the many
* data transfers.
*/
void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm);
/**
* @brief Allocate and initialize the atomic region.
*
* The atomic region is used by the atomic operations which have return
* values. The library user does not need to provide an address for the
* return value so we are forced to do it on their behalf.
*
* The atomic_ret member is initialized upon completion of this method.
*/
void setup_atomic_region();
/**
* @brief Allocate and initialize device-side queue pair objects.
*
* Upon completion, the gpu_qps member will be initialized.
*/
void setup_gpu_qps(GPUIBBackend *B);
/**
* @brief Allocate and initialize device-side memory that will be used for
* the return of g shmem ops (eg: shmem_int_g)
*/
void rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm);
/**
* @brief The backend delegates some InfiniBand connection setup to
* the Connection class.
*/
Connection *connection{nullptr};
public:
/**
* @brief Number of PEs. Get directly from the GPUIBBackend.
*/
int num_pes{0};
/**
* @brief This PE's rank.
*/
int my_pe{-1};
/**
* @brief Number of WG that will be performing communication
*/
int num_blocks{0};
/**
* @brief Holds InfiniBand remote keys for HDP memory regions.
*
* The member holds a C-array allocation for remote keys (from
* InfiniBand memory registrations) for remote HDP registers. The C-array
* has one entry for each processing element (indexed by processing
* element ID).
*
* @todo Remove duplication between the backend class and the QueuePair
* class. QueuePair stores a copy of this member too. The backend
* class does not do much besides initialize this data structure and
* hold it until the QueuePair can consume it.
*/
uint32_t *hdp_rkey{nullptr};
/**
* @brief Holds HDP register addresses for each processing element.
*
* The Host Data Path (HDP) addresses are used to clear a buffer
* which interferes with memory visibility of accesses to fine-grained
* allocations.
*
* The member holds a C-array allocation for the register addresses.
* The C-array has one entry for each processing element (indexed by
* processing element ID).
*
* @todo Remove duplication between the backend class and the QueuePair
* class. QueuePair stores a copy of this member too. The backend
* class does not do much besides initialize this data structure and
* hold it until the QueuePair can consume it.
*/
uintptr_t *hdp_address{nullptr};
/**
* @brief Handle for the HDP memory region.
*/
ibv_mr *hdp_mr{nullptr};
/**
* @brief Set of QueuePairs used by device to do networking.
*
* The member is used during Context creation.
*
* @todo What we really need here is a collection of Contexts that can
* either be copied into LDS or used directly by the GPU depending on
* what type of context it is (shareable, serialized, or private).
* No need to pool up QueuePairs, they can just be managed by their
* owning Context. Should then consider pushing into base class since
* it's not gpu-ib specific.
*/
QueuePair *gpu_qps{nullptr};
/**
* @brief C-array of symmetric heap base pointers.
*
* A C-array of char* pointers corresponding to the heap base pointers
* virtual address for each processing element that we can communicate
* with.
*/
uint32_t *heap_rkey{nullptr};
/**
* @brief Handle for the symmetric heap memory region.
*/
ibv_mr *heap_mr{nullptr};
/**
* @brief Local key for the symmetric heap memory region.
*/
uint32_t lkey{0};
/**
* @brief Control struct for atomic memory region.
*
* The atomic region is used by the atomic operations which have return
* values. The library user does not need to provide an address for the
* return value so we are forced to do it on their behalf.
*/
atomic_ret_t *atomic_ret{nullptr};
/**
* @brief Handle for the atomic memory region.
*
* @todo Provide more descriptive variable name.
*/
ibv_mr *mr{nullptr};
/**
* @brief Buffer used to store the results of a *_g operation.
*
* These operations do not provide a destination buffer so the runtime
* must manage one.
*/
char *g_ret{nullptr};
/**
* @brief Compile-time configuration policy for InfiniBand connections.
*
* The configuration option "USE_DC" can be enabled to create
* Dynamic connection types. By default, Reliable connections are
* created.
*/
ConnectionImpl *connection_policy{nullptr};
};
// clang-format off
NOWARN(-Wunused-parameter,
class NetworkOffImpl {
public:
void dump_backend_stats(ROCStats *globalStats) { }
void reset_backend_stats() { }
__host__ void networkHostSetup(GPUIBBackend *B);
__host__ void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm);
__host__ void networkHostFinalize();
__host__ void networkHostInit(GPUIBContext *ctx, int buffer_id) {}
__device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id) {}
__device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe) {
return nullptr;
}
__device__ __host__ int getNumQueuePairs() { return 0; }
__device__ __host__ int getNumDest() { return 0; }
static uint32_t externSharedBytes(int num_pes) { return 0; }
public:
int num_pes{0};
int my_pe{-1};
int num_blocks{0};
uint32_t *hdp_rkey{nullptr};
uintptr_t *hdp_address{nullptr};
ibv_mr *hdp_mr{nullptr};
QueuePair *gpu_qps{nullptr};
uint32_t *heap_rkey{nullptr};
ibv_mr *heap_mr{nullptr};
uint32_t lkey{0};
atomic_ret_t *atomic_ret{nullptr};
ibv_mr *mr{nullptr};
char *g_ret{nullptr};
ConnectionImpl *connection_policy{nullptr};
};
)
// clang-format on
/*
* Select which one of our IPC policies to use at compile time.
*/
#ifdef USE_SINGLE_NODE
typedef NetworkOffImpl NetworkImpl;
#else
typedef NetworkOnImpl NetworkImpl;
#endif
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
-79
View File
@@ -1,79 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "qe_dumper.hpp"
namespace rocshmem {
QeDumper::QeDumper(int dest_pe, int src_wg, int index)
: dest_pe_(dest_pe), src_wg_(src_wg), index_(index) {
void* temp = malloc(sizeof(GPUIBBackend*));
gpu_backend_ = static_cast<GPUIBBackend*>(temp);
GPUIBBackend* device_backend_proxy_address;
CHECK_HIP(hipGetSymbolAddress(
reinterpret_cast<void**>(&device_backend_proxy_address),
HIP_SYMBOL(device_backend_proxy)));
CHECK_HIP(hipMemcpy(&gpu_backend_, device_backend_proxy_address,
sizeof(GPUIBBackend*), hipMemcpyDeviceToHost));
int qp_offset = gpu_backend_->num_blocks_ * dest_pe_ + src_wg_;
qp_ = &(gpu_backend_->networkImpl.gpu_qps[qp_offset]);
}
QeDumper::~QeDumper() {
/*if (gpu_backend_) {
free(gpu_backend_);
}*/
}
void QeDumper::dump_cq() {
type_ = "CQ";
auto* raw_cqe = &(qp_->current_cq_q_H[index_]);
raw_u64_ = reinterpret_cast<uint64_t*>(raw_cqe);
dump_uint64_(8);
}
void QeDumper::dump_sq() {
type_ = "SQ";
auto* raw_sqe = &(qp_->current_sq_H[index_ * 8]);
raw_u64_ = reinterpret_cast<uint64_t*>(raw_sqe);
dump_uint64_(8);
}
void QeDumper::dump_uint64_(size_t num_elems) const {
printf("%s(%d, %d, %d) *** = ", type_.c_str(), dest_pe_, src_wg_, index_);
for (size_t i = 0; i < num_elems; i++) {
printf(" %lx ", raw_u64_[i]);
}
printf("done %s\n", type_.c_str());
}
} // namespace rocshmem
-66
View File
@@ -1,66 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
#define LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
#include <hip/hip_runtime.h>
#include <infiniband/mlx5dv.h>
#include <string>
#include "backend_ib.hpp"
#include "queue_pair.hpp"
namespace rocshmem {
class QeDumper {
public:
QeDumper(int dest_pe, int src_wg, int index);
~QeDumper();
void dump_cq();
void dump_sq();
private:
void dump_uint64_(size_t num_elems) const;
int dest_pe_{-1};
int src_wg_{-1};
int index_{-1};
GPUIBBackend* gpu_backend_{nullptr};
std::string type_{};
QueuePair* qp_{nullptr};
uint64_t* raw_u64_{nullptr};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
-437
View File
@@ -1,437 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "queue_pair.hpp"
#include <hip/hip_runtime.h>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_ib.hpp"
#include "endian.hpp"
#include "segment_builder.hpp"
#include "../util.hpp"
namespace rocshmem {
QueuePair::QueuePair(GPUIBBackend *backend)
: hdp_policy(backend->hdp_policy),
connection_policy(*backend->networkImpl.connection_policy) {
hdp_rkey = backend->networkImpl.hdp_rkey;
hdp_address = backend->networkImpl.hdp_address;
atomic_ret.atomic_lkey = backend->networkImpl.atomic_ret->atomic_lkey;
atomic_ret.atomic_counter = 0;
}
__device__ QueuePair::~QueuePair() {
uint64_t start = profiler.startTimer();
global_qp->sq_counter = sq_counter;
global_qp->local_sq_cnt = local_sq_cnt;
global_qp->cq_consumer_counter = cq_consumer_counter;
global_qp->current_sq = current_sq;
global_qp->current_cq_q = current_cq_q;
global_qp->sq_overflow = sq_overflow;
global_qp->quiet_counter = quiet_counter;
profiler.endTimer(start, FINALIZE);
global_qp->profiler.accumulateStats(profiler);
__syncthreads();
}
__device__ uint8_t QueuePair::get_cq_error_syndrome(mlx5_cqe64 *cqe_entry) {
mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
return cqe_err->syndrome;
}
__device__ void QueuePair::ring_doorbell(uint64_t db_val) {
swap_endian_store(const_cast<uint32_t *>(dbrec_send),
reinterpret_cast<uint32_t>(sq_counter));
STORE(db.ptr, db_val);
db.uint ^= 256;
}
__device__ void QueuePair::set_completion_flag_on_wqe(int num_wqes) {
uint64_t *wqe = &current_sq[8 * ((sq_counter - num_wqes) % max_nwqe)];
uint8_t *wqe_ce = reinterpret_cast<uint8_t *>(wqe) + 11;
*wqe_ce = 8;
}
template <>
__device__ void QueuePair::update_wqe_ce_single<false>(int num_wqes) {
if (sq_counter % max_nwqe == (max_nwqe - 2)) {
set_completion_flag_on_wqe(num_wqes);
quiet_counter++;
}
}
template <>
__device__ void QueuePair::update_wqe_ce_single<true>(int num_wqes) {
set_completion_flag_on_wqe(num_wqes);
quiet_counter++;
}
template <>
__device__ void QueuePair::update_wqe_ce_thread<false>(int num_wqes) {}
template <>
__device__ void QueuePair::update_wqe_ce_thread<true>(int num_wqes) {
set_completion_flag_on_wqe(num_wqes);
atomicAdd(&quiet_counter, 1);
}
__device__ void QueuePair::compute_db_val_opcode(uint64_t *db_val,
uint16_t dbrec_val,
uint8_t opcode) {
uint64_t opcode64 = opcode;
opcode64 = opcode64 << 24 & 0x000000FFFF000000;
uint64_t dbrec = dbrec_val << 8;
dbrec = dbrec & 0x0000000000FFFF00;
uint64_t val = *db_val;
val = val & 0xFFFFFFFFFF0000FF;
*db_val = val | dbrec | opcode64;
}
template <class level>
__device__ void QueuePair::quiet_internal() {
/*
* If there are nothing to quiet, just return early.
*/
uint32_t quiet_val = quiet_counter;
if (!quiet_val) {
return;
}
profiler.incStat(QUIET_COUNT);
uint64_t start = profiler.startTimer();
/*
* Generate a pointer to the completion queue entry.
*/
cq_consumer_counter = cq_consumer_counter + quiet_val - 1;
uint32_t indx = (cq_consumer_counter % cq_size);
mlx5_cqe64 *cqe_entry = &current_cq_q[indx];
/*
* Access the op_own value in the completion queue entry.
*/
int val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
uint8_t val_op_own = val_ld;
/*
* If the completion queue entry is not valid, wait for it to become so.
*/
while (!((val_op_own & 0x1) == ((cq_consumer_counter >> cq_log_size) & 1)) ||
((val_op_own) >> 4) == 0xF) {
val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
val_op_own = val_ld;
}
/*
* Grab the opcode from the op_own field and report if it is an error.
*/
uint8_t opcode = val_op_own >> 4;
if (opcode != 0) {
uint8_t syndrome = get_cq_error_syndrome(cqe_entry);
mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
GPU_DPRINTF("QUIET ERROR: signature %d opcode_qpn %llx wqe_cnt %llx \n",
syndrome, cqe_err->s_wqe_opcode_qpn, cqe_err->wqe_counter);
}
/*
* Decrement the quiet count by the amount determined at the beginning
* of this method.
*
* bpotter - There are two areas of concern in this method for me.
* 1) In multithreaded builds, we may need to make this method a critical
* section to prevent data races on these variables.
*
* 2) Is there a data race in the API if a one remote process calls quiet
* while another process continues adding events? Is it ever possible for
* a quiet to complete, but the quiet_counter decrement here is not set
* to zero?
*/
level L;
L.decQuietCounter(&quiet_counter, quiet_val);
profiler.endTimer(start, POLL_CQ);
start = profiler.startTimer();
/*
* Increment the trailing index counter which tracks our spot in the
* completion queue.
*/
cq_consumer_counter++;
swap_endian_store(const_cast<uint32_t *>(dbrec_cq), cq_consumer_counter);
profiler.endTimer(start, NEXT_CQ);
}
template <class level>
__device__ void QueuePair::quiet_single() {
level L;
L.quiet(this);
}
template <class level>
__device__ void QueuePair::quiet_single_heavy(int pe) {
level L;
L.quiet_heavy(this, pe);
}
template <class level, bool cqe>
__device__ void QueuePair::update_posted_wqe_generic(
int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool ring_db,
uint64_t atomic_ret_pos, bool zero_byte_rd) {
uint64_t start = profiler.startTimer();
level L;
L.postLock(this, pe);
uint32_t num_wqes = connection_policy.getNumWqes(opcode);
// Get the index for my thread's put in the SQ.
uint64_t my_sq_counter = L.threadAtomicAdd(&sq_counter, num_wqes);
uint64_t my_sq_index = my_sq_counter % max_nwqe;
// 16-bit little endian version of the SQ index needed to build the cntrl
// segment in the WQE.
uint16_t le_sq_counter;
uint16_t sq_counter_u16 = my_sq_counter;
swap_endian_store(&le_sq_counter, sq_counter_u16);
bool flag = sq_overflow;
uint32_t lkey_in_stack_frame = lkey;
uint32_t rkey_in_stack_frame = rkey;
uint32_t ctrl_qp_sq_in_stack_frame = ctrl_qp_sq;
uint64_t ctrl_sig_in_stack_frame = ctrl_sig;
connection_policy.setRkey(&rkey_in_stack_frame, pe);
if (opcode == MLX5_OPCODE_RDMA_WRITE && !size) {
rkey_in_stack_frame = hdp_rkey[pe];
size = 4;
}
/*
* Build out all the segments required for my WQE(s) based on the
* operation, starting at my_sq_index into the SQ. SegmentBuilder will
* keep track of placing the segments in the correct location.
*/
SegmentBuilder seg_build(my_sq_index, current_sq);
seg_build.update_cntrl_seg(opcode, le_sq_counter, ctrl_qp_sq_in_stack_frame,
ctrl_sig_in_stack_frame, &connection_policy,
zero_byte_rd);
seg_build.update_connection_seg(pe, &connection_policy);
seg_build.update_rdma_seg(raddr, rkey_in_stack_frame);
if (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS) {
seg_build.update_atomic_data_seg(atomic_data, atomic_cmp);
size = 8;
lkey_in_stack_frame = atomic_ret.atomic_lkey;
laddr = &atomic_ret.atomic_base_ptr[atomic_ret_pos];
}
if (size <= inline_threshold && opcode == MLX5_OPCODE_RDMA_WRITE) {
seg_build.update_inl_data_seg(laddr, size);
} else {
seg_build.update_data_seg(laddr, size, lkey_in_stack_frame);
}
profiler.incStat(WQE_COUNT);
profiler.endTimer(start, UPDATE_WQE);
start = profiler.startTimer();
L.template finishPost<cqe>(this, ring_db, num_wqes, pe, le_sq_counter,
opcode);
profiler.incStat(DB_COUNT);
profiler.endTimer(start, RING_SQ_DB);
}
/******************************************************************************
****************************** SHMEM INTERFACE *******************************
*****************************************************************************/
template <class level>
__device__ void QueuePair::put_nbi(void *dest, const void *source,
size_t nelems, int pe, bool db_ring) {
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
update_posted_wqe_generic<level, false>(
pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
}
template <class level>
__device__ void QueuePair::put_nbi_cqe(void *dest, const void *source,
size_t nelems, int pe, bool db_ring) {
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
update_posted_wqe_generic<level, true>(
pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
}
template <class level>
__device__ void QueuePair::get_nbi(void *dest, const void *source,
size_t nelems, int pe, bool db_ring) {
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
update_posted_wqe_generic<level, false>(
pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
}
template <class level>
__device__ void QueuePair::get_nbi_cqe(void *dest, const void *source,
size_t nelems, int pe, bool db_ring) {
uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
update_posted_wqe_generic<level, true>(
pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
}
template <class level>
__device__ void QueuePair::zero_b_rd(int pe) {
uintptr_t *dst = reinterpret_cast<uintptr_t *>(base_heap[pe]);
update_posted_wqe_generic<level, true>(pe, 0, nullptr, dst,
MLX5_OPCODE_RDMA_READ, 0, 0, true, 0,
true); // enable 0_byte read op
}
__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t value,
int64_t cond, int pe, bool db_ring,
uint8_t atomic_op) {
THREAD TH;
uint64_t pos = TH.threadAtomicAdd(
reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
&atomic_ret.atomic_counter));
pos = pos % max_nb_atomic;
int64_t *atomic_base_ptr =
reinterpret_cast<int64_t *>(atomic_ret.atomic_base_ptr);
int64_t *load_address = &atomic_base_ptr[pos];
*load_address = -100;
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
atomic_op, value, cond, db_ring, pos);
quiet_single<THREAD>();
while (uncached_load(load_address) == -100) {
}
int64_t ret = *load_address;
__threadfence();
return ret;
}
__device__ void QueuePair::atomic_nofetch(void *dest, int64_t value,
int64_t cond, int pe, bool db_ring,
uint8_t atomic_op) {
THREAD TH;
uint64_t pos = TH.threadAtomicAdd(
reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
&atomic_ret.atomic_counter));
pos = pos % max_nb_atomic;
uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
atomic_op, value, cond, db_ring, pos);
quiet_single<THREAD>();
}
__device__ void QueuePair::fence(int pe) {
// TODO(khamidou): should this be replaced by a zero_byte_rd?
// FIXME: the relaxed ordering requires an intervening read to order
// prior operations.
auto remote_hdp_uncast = hdp_address[pe];
uintptr_t *remote_hdp = reinterpret_cast<uintptr_t *>(remote_hdp_uncast);
update_posted_wqe_generic<THREAD, true>(
pe, 0, nullptr, remote_hdp, MLX5_OPCODE_RDMA_WRITE, 0, 0, true, 0);
}
__device__ void QueuePair::waitCQSpace(int num_msgs) {
// We cannot post more outstanding requests than the completion queue
// size. Force a quiet if we are out of space.
if ((quiet_counter + num_msgs) >= cq_size) {
GPU_DPRINTF(
"*** inside post_cq forcing flush: outstanding %d "
"adding %d cq_size %d\n",
quiet_counter, num_msgs, cq_size);
// TODO(khamidou): More targeted flush would be better here.
quiet_single<THREAD>();
}
}
__device__ void QueuePair::waitSQSpace(int num_msgs) {
// We cannot post more outstanding requests than the Send queue
// size. Force a quiet if we are out of space.
local_sq_cnt += num_msgs;
int div = local_sq_cnt / max_nwqe;
if (div > 0) {
GPU_DPRINTF(
"*** inside waitSQSpace forcing flush to overrun the SQ"
" sq_counter %d adding %d quiet_conter %d \n",
sq_counter, num_msgs, max_nwqe, quiet_counter);
quiet_single<THREAD>();
local_sq_cnt = local_sq_cnt % max_nwqe;
}
}
void QueuePair::setDBval(uint64_t val) { db_val = val; }
#define THREAD_LEVEL_GEN(T) \
template __device__ void QueuePair::put_nbi<T>( \
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
template __device__ void QueuePair::put_nbi_cqe<T>( \
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
template __device__ void QueuePair::get_nbi<T>( \
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
template __device__ void QueuePair::get_nbi_cqe<T>( \
void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
template __device__ void QueuePair::zero_b_rd<T>(int pe); \
template __device__ void QueuePair::quiet_single<T>(); \
template __device__ void QueuePair::quiet_single_heavy<T>(int pe); \
template __device__ void QueuePair::quiet_internal<T>();
THREAD_LEVEL_GEN(THREAD)
THREAD_LEVEL_GEN(WG)
THREAD_LEVEL_GEN(WAVE)
} // namespace rocshmem
-431
View File
@@ -1,431 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
#define LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
/**
* @file queue_pair.hpp
*
* @section DESCRIPTION
* An IB QueuePair (SQ and CQ) that the device can use to perform network
* operations. Most important rocSHMEM operations are performed by this
* class.
*/
#include <infiniband/mlx5dv.h>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../atomic_return.hpp"
#include "connection_policy.hpp"
#include "thread_policy.hpp"
#include "../hdp_policy.hpp"
#include "../stats.hpp"
namespace rocshmem {
class GPUIBBackend;
enum gpu_ib_stats {
RING_SQ_DB = 0,
UPDATE_WQE,
POLL_CQ,
NEXT_CQ,
QUIET_COUNT,
DB_COUNT,
WQE_COUNT,
MEM_WAIT,
INIT,
FINALIZE,
GPU_IB_NUM_STATS
};
typedef union db_reg {
uint64_t *ptr;
uintptr_t uint;
} db_reg_t;
class QueuePair {
public:
/**
* @brief Constructor.
*
* @param[in] backend Backend needed for member access.
*/
explicit QueuePair(GPUIBBackend *backend);
/**
* @brief Destructor.
*/
__device__ ~QueuePair();
/**
* @brief Inspect completion queue and possibly wait for free space.
*
* @param[in] num_msgs Number of entries needing space in completion queue.
*/
__device__ void waitCQSpace(int num_msgs);
/**
* @brief Inspect send queue and possibly wait for free space.
*
* @param[in] num_msgs Number of entries needing space in send queue.
*/
__device__ void waitSQSpace(int num_msgs);
/**
* @brief Create and enqueue a non-blocking put work queue entry (wqe).
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*
* @param[in] dest Destination address for data transmission.
* @param[in] source Source address for data transmission.
* @param[in] nelems Size in bytes of data transmission.
* @param[in] pe Destination processing element of data transmission.
* @param[in] db_ring Denotes whether send queue door bell should be rung.
*/
template <class level>
__device__ void put_nbi(void *dest, const void *source, size_t nelems, int pe,
bool db_ring);
/**
* @brief Create and enqueue a non-blocking put work queue entry (wqe).
*
* @note This variant differs from put_nbi by requesting that a completion
* queue entry is generated in the completion queue.
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*
* @param[in] dest Destination address for data transmission.
* @param[in] source Source address for data transmission.
* @param[in] nelems Size in bytes of data transmission.
* @param[in] pe Destination processing element of data transmission.
* @param[in] db_ring Denotes whether send queue door bell should be rung.
*/
template <class level>
__device__ void put_nbi_cqe(void *dest, const void *source, size_t nelems,
int pe, bool db_ring);
/**
* @brief Consume a completion queue entry from this queue pair's
* completion queue.
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*/
template <class level>
__device__ void quiet_single();
/**
* @brief Send a zero-byte read to enforce ordering and then consume
* a completion queue entry from this queue pair's completion queue.
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*
* @param[in] pe Processing element id to send the zero_b_rd.
*/
template <class level>
__device__ void quiet_single_heavy(int pe);
/**
* @brief Create and enqueue a HDP flush work queue entry on the remote PE.
*
* @param[in] pe Processing element id to send the HDP flush operation.
*
* TODO(@khamidou): does this require a zero_b_rd to enforce write ordering
* The HDP flush is itself a write. Could this write be reordered with
* respect to other write on the network and arrive out-of-order?
*/
__device__ void fence(int pe);
/**
* @brief Create and enqueue a non-blocking get work queue entry (wqe).
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*
* @param[in] dest Destination address for data transmission.
* @param[in] source Source address for data transmission.
* @param[in] nelems Size in bytes of data transmission.
* @param[in] pe Destination processing element of data transmission.
* @param[in] db_ring Denotes whether send queue door bell should be rung.
*/
template <class level>
__device__ void get_nbi(void *dest, const void *source, size_t nelems, int pe,
bool db_ring);
/**
* @brief Create and enqueue a non-blocking get work queue entry (wqe).
*
* @note This variant differs from get_nbi by requesting that a completion
* queue entry is generated in the completion queue.
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*
* @param[in] dest Destination address for data transmission.
* @param[in] source Source address for data transmission.
* @param[in] nelems Size in bytes of data transmission.
* @param[in] pe Destination processing element of data transmission.
* @param[in] db_ring Denotes whether send queue door bell should be rung.
*/
template <class level>
__device__ void get_nbi_cqe(void *dest, const void *source, size_t nelems,
int pe, bool db_ring);
/**
* @brief Create and enqueue a zero-byte read to enforce write ordering.
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*
* @param[in] pe Processing element id to send the zero_b_rd.
*/
template <class level>
__device__ void zero_b_rd(int pe);
/**
* @brief Create and enqueue an atomic fetch work queue entry (wqe).
*
* @param[in] dest Destination address for data transmission.
* @param[in] value Data value for the atomic operation.
* @param[in] cond Used in atomic comparisons.
* @param[in] pe Destination processing element of data transmission.
* @param[in] db_ring Denotes whether send queue door bell should be rung.
* @param[in] atomic_op The atomic operation to perform.
*
* @return An atomic value
*/
__device__ int64_t atomic_fetch(void *dest, int64_t value, int64_t cond,
int pe, bool db_ring, uint8_t atomic_op);
/**
* @brief Create and enqueue an atomic fetch work queue entry (wqe).
*
* @param[in] dest Destination address for data transmission.
* @param[in] value Data value for the atomic operation.
* @param[in] cond Used in atomic comparisons.
* @param[in] pe Destination processing element of data transmission.
* @param[in] db_ring Denotes whether send queue door bell should be rung.
* @param[in] atomic_op The atomic operation to perform.
*/
__device__ void atomic_nofetch(void *dest, int64_t value, int64_t cond,
int pe, bool db_ring, uint8_t atomic_op);
/**
* @brief Helper method to set the doorbell's value.
*
* @param[in] val Desired value for the doorbell.
*/
void setDBval(uint64_t val);
protected:
/**
* @brief Helper method to build work requests for the send queue.
*
* @tparam level Implements specific behaviors for thread, warp, block access.
* @tparam cqe Flag to optionally generate cqes.
*
* @param[in] pe Destination processing element of data transmission.
* @param[in] size Size in bytes of data transmission.
* @param[in] laddr Local address.
* @param[in] raddr Remote address.
* @param[in] opcode Operation to be performed.
* @param[in] atomic_data An atomic data value to be used.
* @param[in] atomic_cmp An atomic comparison operation to be performed.
* @param[in] ring_db Boolean denoting if doorbell should be rung.
* @param[in] atomic_ret_pos Index into atomic return structure.
* @param[in] zero_byte_rd Boolean if zero byte read should be used.
*/
template <class level, bool cqe>
__device__ __attribute__((noinline)) void update_posted_wqe_generic(
int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool ring_db,
uint64_t atomic_ret_pos, bool zero_byte_rd = false);
/**
* @brief Helper method to drain completion queue entries.
*
* @tparam level Implements specific behaviors for thread, warp, block access.
*
*/
template <class level>
__device__ __attribute__((noinline)) void quiet_internal();
/**
* @brief Helper method to compute doorbell value opcode which is used to
* ring the doorbell.
*
* @param[in,out] db_val
* @param[in] dbrec_val
* @param[in] opcode
*/
__device__ void compute_db_val_opcode(uint64_t *db_val, uint16_t dbrec_val,
uint8_t opcode);
/**
* @brief Helper method that sets the field in a work queue entry to
* generate a completion entry in the completion queue.
*
* @param num_wqes Number of work entries this completion entry represents.
*/
__device__ void set_completion_flag_on_wqe(int num_wqes);
/**
* @brief Helper method to update fields for the work queue entry.
*
* @tparam cqe Flag to optionally generate cqes.
*
* @note Single variant is meant to be callable by a block leader.
*/
template <bool cqe>
__device__ void update_wqe_ce_single(int num_wqes);
/**
* @brief Helper method to update fields for the work queue entry.
*
* @tparam cqe Flag to optionally generate cqes.
*
* @note Thread variant is meant to be callable by multiple threads.
*/
template <bool cqe>
__device__ void update_wqe_ce_thread(int num_wqes);
/**
* @brief Helper method to ring the doorbell
*
* @param[in] db_val Doorbell value is written by method.
*/
__device__ void ring_doorbell(uint64_t db_val);
/**
* @brief Helper method to extract syndrome field from cqe.
*
* @param[in] cq_entry Completion queue entry.
*/
__device__ uint8_t get_cq_error_syndrome(mlx5_cqe64 *cq_entry);
private:
const int inline_threshold{8};
/* TODO(bpotter): Most of these should be private/protected */
public:
#ifdef PROFILE
typedef Stats<GPU_IB_NUM_STATS> GPUIBStats;
#else
typedef NullStats<GPU_IB_NUM_STATS> GPUIBStats;
#endif
/*
* Pointer to the hardware doorbell register for the QP.
*/
db_reg_t db{};
/*
* Base pointer of this QP's SQ
* TODO(bpotter): Use the correct struct type for this.
*/
uint64_t *current_sq{nullptr};
uint64_t *current_sq_H{nullptr};
/*
* Base pointer of this QP's CQ
*/
mlx5_cqe64 *current_cq_q{nullptr};
mlx5_cqe64 *current_cq_q_H{nullptr};
/*
* Pointer to the doorbell record for this SQ.
*/
volatile uint32_t *dbrec_send{nullptr};
/*
* Pointer to the doorbell record for the CQ.
*/
volatile uint32_t *dbrec_cq{nullptr};
uint32_t *hdp_rkey{nullptr};
uintptr_t *hdp_address{nullptr};
HdpPolicy *hdp_policy{};
atomic_ret_t atomic_ret{};
ThreadImpl threadImpl{};
ConnectionImpl connection_policy;
char *const *base_heap{nullptr};
/*
* Current index into the SQ (non-modulo size).
*/
uint32_t sq_counter{0};
uint32_t local_sq_cnt{0};
/*
* Number of outstanding messages on this QP that need to be completed
* during a quiet operation.
*/
uint32_t quiet_counter{0};
int num_cqs{0};
/*
* Current index into the SQ (non-module size).
*/
uint32_t cq_consumer_counter{0};
uint16_t cq_log_size{0};
uint16_t cq_size{0};
uint32_t ctrl_qp_sq{0};
uint64_t ctrl_sig{0};
uint32_t rkey{0};
uint32_t lkey{0};
GPUIBStats profiler{};
uint16_t max_nwqe{0};
bool sq_overflow{0};
uint64_t db_val{};
/*
* Pointer to the QP in global memory that this QP is copied from. When
* this QP is destroyed, the dynamic (indicies, stats, etc) in the
* global_qp are updated.
*/
QueuePair *global_qp{nullptr};
friend SingleThreadImpl;
friend MultiThreadImpl;
friend THREAD;
friend WG;
friend WAVE;
friend RCConnectionImpl;
friend DCConnectionImpl;
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
-201
View File
@@ -1,201 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "reliable_connection.hpp"
#include <mpi.h>
#include "backend_ib.hpp"
namespace rocshmem {
ReliableConnection::ReliableConnection(GPUIBBackend* b) : Connection(b, 0) {}
ReliableConnection::~ReliableConnection() {}
Connection::InitQPState ReliableConnection::initqp(uint8_t port) {
InitQPState init{};
init.exp_qp_attr.qp_access_flags =
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
init.exp_qp_attr.port_num = port;
init.exp_attr_mask |= IBV_QP_ACCESS_FLAGS;
return init;
}
Connection::RtrState ReliableConnection::rtr(dest_info_t* dest, uint8_t port) {
RtrState rtr{};
rtr.exp_qp_attr.dest_qp_num = dest->qpn;
rtr.exp_qp_attr.rq_psn = dest->psn;
rtr.exp_qp_attr.ah_attr.port_num = port;
if (ib_state->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND) {
rtr.exp_qp_attr.ah_attr.dlid = dest->lid;
} else {
rtr.exp_qp_attr.ah_attr.is_global = 1;
rtr.exp_qp_attr.ah_attr.grh.dgid = dest->gid;
rtr.exp_qp_attr.ah_attr.grh.sgid_index = 0;
rtr.exp_qp_attr.ah_attr.grh.hop_limit = 1;
}
rtr.exp_attr_mask |= IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
return rtr;
}
Connection::RtsState ReliableConnection::rts(dest_info_t* dest) {
RtsState rts{};
rts.exp_qp_attr.sq_psn = dest->psn;
rts.exp_attr_mask |= IBV_QP_SQ_PSN;
return rts;
}
ibv_qp* ReliableConnection::create_qp_0(ibv_context* context,
ibv_qp_init_attr_ex* qp_attr) {
return ibv_create_qp_ex(context, qp_attr);
}
void ReliableConnection::create_qps_1() { }
void ReliableConnection::create_qps_2(int port, int my_rank,
ibv_port_attr* ib_port_att) { }
void ReliableConnection::create_qps_3(int port, ibv_qp* qp, int offset,
ibv_port_attr* ib_port_att) {
init_qp_status(qp, port);
all_qp[offset].lid = ib_port_att->lid;
all_qp[offset].qpn = qp->qp_num;
all_qp[offset].psn = 0;
union ibv_gid gid;
ibv_query_gid(ib_state->context, port, 0, &gid);
all_qp[offset].gid = gid;
}
void ReliableConnection::get_remote_conn(int* remote_conn) {
*remote_conn = backend->num_pes;
}
void ReliableConnection::allocate_dynamic_members(int num_blocks) {
all_qp.resize(backend->num_pes * num_blocks);
}
void ReliableConnection::free_dynamic_members() {
}
void ReliableConnection::initialize_1(int port, int num_blocks) {
MPI_Alltoall(MPI_IN_PLACE, sizeof(dest_info_t) * num_blocks, MPI_CHAR,
all_qp.data(), sizeof(dest_info_t) * num_blocks, MPI_CHAR,
backend->thread_comm);
for (int i = 0; i < qps.size(); i++) {
change_status_rtr(qps[i], &all_qp[i], port);
}
MPI_Barrier(backend->thread_comm);
for (int i = 0; i < qps.size(); i++) {
change_status_rts(qps[i], &all_qp[i]);
}
}
void ReliableConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle,
ibv_mr* mr) {
CHECK_HIP(
hipHostMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes));
(*heap_rkey_handle)[backend->my_pe] = mr->rkey;
}
void ReliableConnection::free_rkey_handle(uint32_t* heap_rkey_handle) {
CHECK_HIP(hipHostFree(heap_rkey_handle));
}
Connection::QPInitAttr ReliableConnection::qpattr(ibv_qp_cap cap) {
QPInitAttr qpattr(cap);
qpattr.attr.qp_type = IBV_QPT_RC;
return qpattr;
}
void ReliableConnection::post_dv_rc_wqe(int remote_conn) {
mlx5_wqe_ctrl_seg* ctrl;
mlx5_wqe_raddr_seg* rdma;
mlx5_wqe_data_seg* data;
for (int i = 0; i < remote_conn; i++) {
int num_blocks = backend->num_blocks_;
for (int j = 0; j < num_blocks; j++) {
int qp_index = i * num_blocks + j;
uint64_t* ptr = get_address_sq(qp_index);
const uint16_t nb_post = 1; // 4 * sq_size;
for (uint16_t index = 0; index < nb_post; index++) {
uint8_t op_mod = 0;
uint8_t op_code = 8;
uint32_t qp_num = qps[qp_index]->qp_num;
uint8_t fm_ce_se = 0;
uint8_t ds = 3;
ctrl = reinterpret_cast<mlx5_wqe_ctrl_seg*>(ptr);
mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds,
0, 0);
ptr = ptr + 2;
rdma = reinterpret_cast<mlx5_wqe_raddr_seg*>(ptr);
const auto& heap_bases = backend->heap.get_heap_bases();
auto temp = heap_bases[(backend->my_pe + 1) % 2];
uint64_t r_address = reinterpret_cast<uint64_t>(temp);
uint32_t rkey = backend->networkImpl.heap_rkey[i];
set_rdma_seg(rdma, r_address, rkey);
ptr = ptr + 2;
data = reinterpret_cast<mlx5_wqe_data_seg*>(ptr);
uint32_t lkey = backend->networkImpl.heap_mr->lkey;
temp = heap_bases[backend->my_pe];
uint64_t address = reinterpret_cast<uint64_t>(temp);
mlx5dv_set_data_seg(data, 1, lkey, address);
ptr = ptr + 4;
}
}
}
}
// TODO(bpotter): remove redundancies with the other derived class
void ReliableConnection::post_wqes() {
int remote_conn;
get_remote_conn(&remote_conn);
post_dv_rc_wqe(remote_conn);
}
void ReliableConnection::initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah,
int dc_key) {}
int ReliableConnection::get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) {
return pe_idx * num_qps + wg_idx;
}
} // namespace rocshmem
-84
View File
@@ -1,84 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
#define LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
#include <vector>
#include "connection.hpp"
namespace rocshmem {
class ReliableConnection : public Connection {
public:
explicit ReliableConnection(GPUIBBackend* backend);
~ReliableConnection() override;
void get_remote_conn(int* remote_conn) override;
void post_wqes() override;
void initialize_rkey_handle(uint32_t** heap_rkey_handle,
ibv_mr* mr) override;
void free_rkey_handle(uint32_t* heap_rkey_handle) override;
private:
InitQPState initqp(uint8_t port) override;
RtrState rtr(dest_info_t* dest, uint8_t port) override;
RtsState rts(dest_info_t* dest) override;
QPInitAttr qpattr(ibv_qp_cap cap) override;
void create_qps_1() override;
void create_qps_2(int port, int my_rank,
ibv_port_attr* ib_port_att) override;
void create_qps_3(int port, ibv_qp* qp, int offset,
ibv_port_attr* ib_port_att) override;
ibv_qp* create_qp_0(ibv_context* context,
ibv_qp_init_attr_ex* qp_attr) override;
void allocate_dynamic_members(int num_wg) override;
void free_dynamic_members() override;
void initialize_1(int port, int num_wg) override;
void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override;
int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) override;
std::vector<dest_info_t> all_qp;
void post_dv_rc_wqe(int remote_conn);
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
-138
View File
@@ -1,138 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "segment_builder.hpp"
#include "../util.hpp"
#include "endian.hpp"
namespace rocshmem {
__device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) {
mlx5_segment *base_ptr = static_cast<mlx5_segment *>(base);
size_t segment_offset = SEGMENTS_PER_WQE * wqe_idx;
seg_ptr = &base_ptr[segment_offset];
}
__device__ void SegmentBuilder::update_cntrl_seg(
uint8_t opcode, uint16_t wqe_idx, uint32_t ctrl_qp_sq, uint64_t ctrl_sig,
ConnectionImpl *connection_policy, bool zero_byte_rd) {
mlx5_wqe_ctrl_seg ctrl_seg;
ctrl_seg.opmod_idx_opcode = (opcode << 24) | (wqe_idx << 8);
uint32_t DS = 2;
if (zero_byte_rd == false) {
DS = (opcode == MLX5_OPCODE_RDMA_WRITE || opcode == MLX5_OPCODE_RDMA_READ)
? 3
: 4;
}
DS += connection_policy->wqeCntrlOffset();
ctrl_seg.qpn_ds = (DS << 24) | ctrl_qp_sq;
ctrl_seg.signature = ctrl_sig;
ctrl_seg.fm_ce_se = ctrl_sig >> 24;
ctrl_seg.imm = ctrl_sig >> 32;
memcpy(&seg_ptr->ctrl_seg, &ctrl_seg, sizeof(mlx5_wqe_ctrl_seg));
seg_ptr++;
}
__device__ void SegmentBuilder::update_atomic_data_seg(uint64_t atomic_data,
uint64_t atomic_cmp) {
mlx5_wqe_atomic_seg atomic_seg;
swap_endian_store(reinterpret_cast<uint64_t *>(&atomic_seg.swap_add),
atomic_data);
swap_endian_store(reinterpret_cast<uint64_t *>(&atomic_seg.compare),
atomic_cmp);
memcpy(&seg_ptr->atomic_seg, &atomic_seg, sizeof(mlx5_wqe_atomic_seg));
seg_ptr++;
}
__device__ void SegmentBuilder::update_rdma_seg(uintptr_t *raddr,
uint32_t rkey) {
mlx5_wqe_raddr_seg raddr_seg;
raddr_seg.rkey = rkey;
swap_endian_store(reinterpret_cast<uint64_t *>(&raddr_seg.raddr),
reinterpret_cast<uint64_t>(raddr));
memcpy(&seg_ptr->raddr_seg, &raddr_seg, sizeof(mlx5_wqe_raddr_seg));
seg_ptr++;
}
__device__ void SegmentBuilder::update_data_seg(uintptr_t *laddr, int32_t size,
uint32_t lkey) {
if (laddr == nullptr) {
return;
}
mlx5_wqe_data_seg data_seg;
data_seg.lkey = lkey;
swap_endian_store(&data_seg.byte_count, size & 0x7FFFFFFFU);
swap_endian_store(reinterpret_cast<uint64_t *>(&data_seg.addr),
reinterpret_cast<uint64_t>(laddr));
memcpy(&seg_ptr->data_seg, &data_seg, sizeof(mlx5_wqe_data_seg));
seg_ptr++;
}
__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr,
int32_t size) {
mlx5_wqe_inl_data_seg inl_data_seg;
swap_endian_store(&inl_data_seg.byte_count, (size & 0x3FF) | 0x80000000);
// Assume fence HDP flush
// TODO(khamidou): Rework fence interface to avoid this
size_t field_size{sizeof(mlx5_wqe_inl_data_seg)};
if (!laddr) {
uint8_t flush_val = 1;
memcpy(&inl_data_seg + 1, &flush_val, sizeof(flush_val));
field_size += sizeof(flush_val);
} else {
memcpy(&inl_data_seg + 1, laddr, size);
field_size += size;
}
memcpy(&seg_ptr->inl_data_seg, &inl_data_seg, field_size);
seg_ptr++;
}
__device__ void SegmentBuilder::update_connection_seg(
int pe, ConnectionImpl *conn_policy) {
if (conn_policy->updateConnectionSegmentImpl(&seg_ptr->base_av, pe)) {
seg_ptr++;
}
}
} // namespace rocshmem
-64
View File
@@ -1,64 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
#define LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
#include <infiniband/mlx5dv.h>
#include "connection_policy.hpp"
#include "infiniband_structs.hpp"
#include "../util.hpp"
namespace rocshmem {
class SegmentBuilder {
public:
__device__ SegmentBuilder(uint64_t wqe_idx, void *base);
__device__ void update_cntrl_seg(uint8_t opcode, uint16_t wqe_idx,
uint32_t ctrl_qp_sq, uint64_t ctrl_sig,
ConnectionImpl *connection_policy,
bool zero_byte_rd);
__device__ void update_connection_seg(int pe,
ConnectionImpl *connection_policy);
__device__ void update_atomic_data_seg(uint64_t atomic_data,
uint64_t atomic_cmp);
__device__ void update_rdma_seg(uintptr_t *raddr, uint32_t rkey);
__device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size);
__device__ void update_data_seg(uintptr_t *laddr, int32_t size,
uint32_t lkey);
private:
const int SEGMENTS_PER_WQE = 4;
mlx5_segment *seg_ptr;
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
-358
View File
@@ -1,358 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "thread_policy.hpp"
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "queue_pair.hpp"
namespace rocshmem {
__device__ void SingleThreadImpl::quiet(QueuePair *handle) {
handle->quiet_internal<THREAD>();
}
__device__ void SingleThreadImpl::quiet_heavy(QueuePair *handle, int pe) {
handle->zero_b_rd<THREAD>(pe);
handle->quiet_internal<THREAD>();
}
__device__ void MultiThreadImpl::quiet(QueuePair *handle) {
int thread_id = get_flat_block_id();
/*
* Each WF selects one thread to perform the quiet. Only one thread
* per WG is allowed to do a quiet at once to avoid races with the CQ.
*/
if (thread_id % WF_SIZE == lowerID()) {
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
}
handle->quiet_internal<THREAD>();
__threadfence();
handle->threadImpl.cq_lock = 0;
}
}
__device__ void MultiThreadImpl::quiet_heavy(QueuePair *handle, int pe) {
int thread_id = get_flat_block_id();
/*
* Each WF selects one thread to perform the quiet. Only one thread
* per WG is allowed to do a quiet at once to avoid races with the CQ.
*/
if (thread_id % WF_SIZE == lowerID()) {
// zero_byte read
handle->zero_b_rd<THREAD>(pe);
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
}
handle->quiet_internal<THREAD>();
__threadfence();
handle->threadImpl.cq_lock = 0;
}
}
__device__ void WG::quiet(QueuePair *handle) { handle->quiet_internal<WG>(); }
__device__ void WG::quiet_heavy(QueuePair *handle, int pe) {
handle->zero_b_rd<THREAD>(pe);
handle->quiet_internal<WG>();
}
__device__ void WAVE::quiet(QueuePair *handle) {
int thread_id = get_flat_block_id();
/*
* Each WF selects one thread to perform the quiet. Only one thread
* per WG is allowed to do a quiet at once to avoid races with the CQ.
*/
if (thread_id % WF_SIZE == 0) {
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
}
handle->quiet_internal<WAVE>();
__threadfence();
handle->threadImpl.cq_lock = 0;
}
}
__device__ void WAVE::quiet_heavy(QueuePair *handle, int pe) {
int thread_id = get_flat_block_id();
/*
* Each WF selects one thread to perform the quiet. Only one thread
* per WG is allowed to do a quiet at once to avoid races with the CQ.
*/
if (thread_id % WF_SIZE == 0) {
// post a zero-byte read
handle->zero_b_rd<THREAD>(pe);
while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
}
handle->quiet_internal<WAVE>();
__threadfence();
handle->threadImpl.cq_lock = 0;
}
}
__device__ void SingleThreadImpl::decQuietCounter(uint32_t *quiet_counter,
int num) {
*quiet_counter -= num;
}
__device__ void MultiThreadImpl::decQuietCounter(uint32_t *quiet_counter,
int num) {
atomicSub(quiet_counter, num);
}
__device__ void WG::decQuietCounter(uint32_t *quiet_counter, int num) {
*quiet_counter -= num;
}
__device__ void WAVE::decQuietCounter(uint32_t *quiet_counter, int num) {
*quiet_counter -= num;
}
template <bool cqe>
__device__ void SingleThreadImpl::finishPost(QueuePair *handle, bool ring_db,
int num_wqes, int pe,
uint16_t le_sq_counter,
uint8_t opcode) {
if (ring_db) {
uint64_t db_val = handle->db_val;
handle->compute_db_val_opcode(&db_val, le_sq_counter, opcode);
handle->update_wqe_ce_single<cqe>(num_wqes);
handle->ring_doorbell(db_val);
}
}
template <bool cqe>
__device__ void MultiThreadImpl::finishPost(QueuePair *handle, bool ring_db,
int num_wqes, int pe,
uint16_t le_sq_counter,
uint8_t opcode) {
/*
* For RC, we can't allow a wave to have different PEs in it, else the
* doorbell ringing logic will not work. This little for loop forces
* control flow divergence based on the PE. It works well for small
* numbers of PEs, but we might want a different solution for large
* numbers.
*/
if (handle->connection_policy.forcePostDivergence()) {
for (int i = 0; i < handle->num_cqs; i++) {
if (i != pe) {
continue;
}
finishPost_internal<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
opcode);
}
} else {
finishPost_internal<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
opcode);
}
}
template <bool cqe>
__device__ void MultiThreadImpl::finishPost_internal(QueuePair *handle,
bool ring_db, int num_wqes,
int pe,
uint16_t le_sq_counter,
uint8_t opcode) {
/*
* Assuming here that postLock locks out all wavefronts in this WG but
* one, and that this will select a single thread in the wavefront.
*/
__threadfence();
if (get_flat_block_id() % WF_SIZE == lowerID()) {
if (ring_db) {
uint64_t db_val =
handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
handle->max_nwqe)];
handle->update_wqe_ce_thread<true>(num_wqes);
handle->ring_doorbell(db_val);
}
handle->threadImpl.sq_lock = 0;
}
}
template <bool cqe>
__device__ void WG::finishPost(QueuePair *handle, bool ring_db, int num_wqes,
int pe, uint16_t le_sq_counter, uint8_t opcode) {
if (ring_db) {
uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
handle->max_nwqe)];
handle->update_wqe_ce_single<cqe>(num_wqes);
handle->ring_doorbell(db_val);
}
}
template <bool cqe>
__device__ void WAVE::finishPost(QueuePair *handle, bool ring_db, int num_wqes,
int pe, uint16_t le_sq_counter,
uint8_t opcode) {
if (ring_db) {
uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
handle->max_nwqe)];
handle->update_wqe_ce_thread<cqe>(num_wqes);
handle->ring_doorbell(db_val);
}
handle->threadImpl.sq_lock = 0;
}
__device__ void SingleThreadImpl::postLock(QueuePair *handle, int pe) {
handle->hdp_policy->hdp_flush();
// handle->waitCQSpace(1);
handle->waitSQSpace(1);
}
__device__ void MultiThreadImpl::postLock_internal(QueuePair *handle) {
int thread_id = get_flat_block_id();
int active_threads = wave_SZ();
if (thread_id % WF_SIZE == lowerID()) {
handle->hdp_policy->hdp_flush();
/*
* Don't let more than one wave in this WG go any further or a
* horrible variety of impossible to debug race conditions can occur.
*/
while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) {
}
/*
* This is a tiny bit over-aggressive as it assumes that all of the
* active_threads are going to the same PE when calculating whether
* we are full.
*/
// handle->waitCQSpace(active_threads);
handle->waitSQSpace(active_threads);
}
/*
* Double check we've got the same exec mask (assuming divergence after
* the previous if.
*/
if (active_threads != wave_SZ()) {
__builtin_trap();
}
}
__device__ void MultiThreadImpl::postLock(QueuePair *handle, int pe) {
/*
* For RC, we can't allow a wave to have different PEs in it, else the
* doorbell ringing logic will not work. This little for loop forces
* control flow divergence based on the PE. It works well for small
* numbers of PEs, but we might want a different solution for large
* numbers.
*/
if (handle->connection_policy.forcePostDivergence()) {
for (int i = 0; i < handle->num_cqs; i++) {
if (i != pe) {
continue;
}
postLock_internal(handle);
}
} else {
postLock_internal(handle);
}
}
__device__ void WG::postLock(QueuePair *handle, int pe) {
handle->hdp_policy->hdp_flush();
// handle->waitCQSpace(1);
handle->waitSQSpace(1);
}
__device__ void WAVE::postLock(QueuePair *handle, int pe) {
handle->hdp_policy->hdp_flush();
/*
* Don't let more than one wave in this WG go any further or a horrible
* variety of impossible to debug race conditions can occur.
*/
while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) {
}
/*
* This is a tiny bit over-aggressive as it assumes that all of the
* active_threads are going to the same PE when calculating whether
* we are full.
*/
// handle->waitCQSpace(1);
handle->waitSQSpace(1);
}
template <typename T>
__device__ T SingleThreadImpl::threadAtomicAdd(T *val, T value) {
T old_val = *val;
*val += value;
return old_val;
}
template <typename T>
__device__ T MultiThreadImpl::threadAtomicAdd(T *val, T value) {
return atomicAdd(val, value);
}
template <typename T>
__device__ T WG::threadAtomicAdd(T *val, T value) {
T old_val = *val;
*val += value;
return old_val;
}
template <typename T>
__device__ T WAVE::threadAtomicAdd(T *val, T value) {
return atomicAdd(val, value);
}
#define TYPE_GEN(T) \
template __device__ T SingleThreadImpl::threadAtomicAdd<T>(T * val, \
T value); \
template __device__ T MultiThreadImpl::threadAtomicAdd<T>(T * val, T value); \
template __device__ T WG::threadAtomicAdd<T>(T * val, T value); \
template __device__ T WAVE::threadAtomicAdd<T>(T * val, T value);
TYPE_GEN(float)
TYPE_GEN(double)
TYPE_GEN(int)
TYPE_GEN(unsigned int)
TYPE_GEN(unsigned long long) // NOLINT(runtime/int)
#define TYPE_BOOL(T) \
template __device__ void SingleThreadImpl::finishPost<T>( \
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
uint16_t le_sq_counter, uint8_t opcode); \
template __device__ void MultiThreadImpl::finishPost<T>( \
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
uint16_t le_sq_counter, uint8_t opcode); \
template __device__ void WG::finishPost<T>( \
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
uint16_t le_sq_counter, uint8_t opcode); \
template __device__ void WAVE::finishPost<T>( \
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
uint16_t le_sq_counter, uint8_t opcode); \
template __device__ void MultiThreadImpl::finishPost_internal<T>( \
QueuePair * handle, bool ring_db, int num_wqes, int pe, \
uint16_t le_sq_counter, uint8_t opcode);
TYPE_BOOL(true)
TYPE_BOOL(false)
} // namespace rocshmem
-176
View File
@@ -1,176 +0,0 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
#define LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "../util.hpp"
namespace rocshmem {
class QueuePair;
/*
* GPU single-thread policy class. Only a single work-item per work-group
* is allowed to call into a rocSHMEM function (unless it is specifically
* called out as a collective API. This thread policy is the fastest but
* is not as flexible.
*/
class SingleThreadImpl {
public:
uint32_t cq_lock = 0;
uint32_t sq_lock = 0;
__device__ void quiet(QueuePair *handle);
__device__ void quiet_heavy(QueuePair *handle, int pe);
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
template <bool cqe>
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
int pe, uint16_t le_sq_counter, uint8_t opcode);
__device__ void postLock(QueuePair *handle, int pe);
template <typename T>
__device__ T threadAtomicAdd(T *val, T value = 1);
};
/*
* GPU multi-thread policy class. Multiple work-items per work-group are
* allowed to call into a rocSHMEM function. A bit slower than its
* single-thread counterpart but it enables a much more flexible user-facing
* API.
*/
class MultiThreadImpl {
/*
* Per-wg locks for the CQ and the SQ, respectively.
*/
template <bool cqe>
__device__ void finishPost_internal(QueuePair *handle, bool ring_db,
int num_wqes, int pe,
uint16_t le_sq_counter, uint8_t opcode);
__device__ void postLock_internal(QueuePair *handle);
public:
uint32_t cq_lock = 0;
uint32_t sq_lock = 0;
__device__ void quiet(QueuePair *handle);
__device__ void quiet_heavy(QueuePair *handle, int pe);
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
template <bool cqe>
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
int pe, uint16_t le_sq_counter, uint8_t opcode);
__device__ void postLock(QueuePair *handle, int pe);
template <typename T>
__device__ T threadAtomicAdd(T *val, T value = 1);
};
/*
* Select which one of our thread policies to use at compile time.
*/
#ifdef USE_THREADS
typedef MultiThreadImpl ThreadImpl;
#else
typedef SingleThreadImpl ThreadImpl;
#endif
class THREAD {
public:
ThreadImpl threadImpl;
__device__ void quiet(QueuePair *handle) { threadImpl.quiet(handle); }
__device__ void quiet_heavy(QueuePair *handle, int pe) {
threadImpl.quiet_heavy(handle, pe);
}
__device__ void decQuietCounter(uint32_t *quiet_counter, int num) {
threadImpl.decQuietCounter(quiet_counter, num);
}
template <bool cqe>
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
int pe, uint16_t le_sq_counter, uint8_t opcode) {
threadImpl.finishPost<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
opcode);
}
__device__ void postLock(QueuePair *handle, int pe) {
threadImpl.postLock(handle, pe);
}
template <typename T>
__device__ T threadAtomicAdd(T *val, T value = 1) {
T tmp = threadImpl.threadAtomicAdd(val, value);
return tmp;
}
};
class WAVE {
public:
__device__ void quiet(QueuePair *handle);
__device__ void quiet_heavy(QueuePair *handle, int pe);
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
template <bool cqe>
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
int pe, uint16_t le_sq_counter, uint8_t opcode);
__device__ void postLock(QueuePair *handle, int pe);
template <typename T>
__device__ T threadAtomicAdd(T *val, T value = 1);
};
class WG {
public:
__device__ void quiet(QueuePair *handle);
__device__ void quiet_heavy(QueuePair *handle, int pe);
__device__ void decQuietCounter(uint32_t *quiet_counter, int num);
template <bool cqe>
__device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
int pe, uint16_t le_sq_counter, uint8_t opcode);
__device__ void postLock(QueuePair *handle, int pe);
template <typename T>
__device__ T threadAtomicAdd(T *val, T value = 1);
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
+2 -2
View File
@@ -35,7 +35,7 @@ namespace rocshmem {
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
rocshmem_team_t get_external_team(GPUIBTeam *team) {
rocshmem_team_t get_external_team(IPCTeam *team) {
return reinterpret_cast<rocshmem_team_t>(team);
}
@@ -223,7 +223,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team,
* Allocate device-side memory for team_world and
* construct a IPC team in it
*/
GPUIBTeam *new_team_obj;
IPCTeam *new_team_obj;
CHECK_HIP(hipMalloc(&new_team_obj, sizeof(IPCTeam)));
new (new_team_obj)
IPCTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes,
+1 -1
View File
@@ -298,4 +298,4 @@ class IPCContext : public Context {
} // namespace rocshmem
#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
#endif // LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_
+1 -1
View File
@@ -35,7 +35,7 @@
* both host access and device access to the memory space.
*
* The symmetric heaps are visible to network by registering them as
* InfiniBand memory regions. Every memory region has a remote key
* memory regions. Every memory region has a remote key
* which needs to be shared across the network (to access the memory
* region).
*/
+7 -7
View File
@@ -29,14 +29,14 @@ namespace rocshmem {
template <typename T>
__host__ void ROHostContext::p(T *dest, T value, int pe) {
DPRINTF("Function: gpu_ib_host_p\n");
DPRINTF("Function: ro_host_p\n");
host_interface->p<T>(dest, value, pe, context_window_info);
}
template <typename T>
__host__ T ROHostContext::g(const T *source, int pe) {
DPRINTF("Function: gpu_ib_host_g\n");
DPRINTF("Function: ro_host_g\n");
return host_interface->g<T>(source, pe, context_window_info);
}
@@ -44,7 +44,7 @@ __host__ T ROHostContext::g(const T *source, int pe) {
template <typename T>
__host__ void ROHostContext::put(T *dest, const T *source, size_t nelems,
int pe) {
DPRINTF("Function: gpu_ib_host_put\n");
DPRINTF("Function: ro_host_put\n");
host_interface->put<T>(dest, source, nelems, pe, context_window_info);
}
@@ -52,7 +52,7 @@ __host__ void ROHostContext::put(T *dest, const T *source, size_t nelems,
template <typename T>
__host__ void ROHostContext::get(T *dest, const T *source, size_t nelems,
int pe) {
DPRINTF("Function: gpu_ib_host_get\n");
DPRINTF("Function: ro_host_get\n");
host_interface->get<T>(dest, source, nelems, pe, context_window_info);
}
@@ -60,7 +60,7 @@ __host__ void ROHostContext::get(T *dest, const T *source, size_t nelems,
template <typename T>
__host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems,
int pe) {
DPRINTF("Function: gpu_ib_host_put_nbi\n");
DPRINTF("Function: ro_host_put_nbi\n");
host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
}
@@ -68,7 +68,7 @@ __host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems,
template <typename T>
__host__ void ROHostContext::get_nbi(T *dest, const T *source, size_t nelems,
int pe) {
DPRINTF("Function: gpu_ib_host_get_nbi\n");
DPRINTF("Function: ro_host_get_nbi\n");
host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
}
@@ -107,7 +107,7 @@ __host__ void ROHostContext::broadcast(T *dest, const T *source, int nelems,
int pe_root, int pe_start,
int log_pe_stride, int pe_size,
long *p_sync) {
DPRINTF("Function: gpu_ib_host_broadcast\n");
DPRINTF("Function: ro_host_broadcast\n");
host_interface->broadcast<T>(dest, source, nelems, pe_root, pe_start,
log_pe_stride, pe_size, p_sync);
+2 -8
View File
@@ -36,10 +36,7 @@
#include "backend_bc.hpp"
#include "context_incl.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/backend_ib.hpp"
#include "gpu_ib/context_ib_tmpl_host.hpp"
#elif defined(USE_RO)
#ifdef USE_RO
#include "reverse_offload/backend_ro.hpp"
#include "reverse_offload/context_ro_tmpl_host.hpp"
#else
@@ -86,10 +83,7 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
rocshmem_env_config_init();
#ifdef USE_GPU_IB
CHECK_HIP(hipHostMalloc(&backend, sizeof(GPUIBBackend)));
backend = new (backend) GPUIBBackend(comm);
#elif defined(USE_RO)
#ifdef USE_RO
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
backend = new (backend) ROBackend(comm);
#else
+1 -3
View File
@@ -49,9 +49,7 @@
#include "templates.hpp"
#include "util.hpp"
#ifdef USE_GPU_IB
#include "gpu_ib/context_ib_tmpl_device.hpp"
#elif defined(USE_RO)
#ifdef USE_RO
#include "reverse_offload/context_ro_tmpl_device.hpp"
#else
#include "ipc/context_ipc_tmpl_device.hpp"
-4
View File
@@ -36,10 +36,6 @@ __host__ __device__ Team* get_internal_team(rocshmem_team_t team) {
return reinterpret_cast<Team*>(team);
}
GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team) {
return reinterpret_cast<GPUIBTeam*>(team);
}
ROTeam* get_internal_ro_team(rocshmem_team_t team) {
return reinterpret_cast<ROTeam*>(team);
}
+1 -4
View File
@@ -33,7 +33,6 @@ namespace rocshmem {
class Backend;
class Team;
class ROTeam;
class GPUIBTeam;
class IPCTeam;
class TeamInfo {
@@ -154,13 +153,11 @@ class Team {
*
* @note This is required to do some reinterpret_casts.
*/
BackendType type{BackendType::GPU_IB_BACKEND};
BackendType type{BackendType::RO_BACKEND};
};
__host__ __device__ Team* get_internal_team(rocshmem_team_t team);
GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team);
ROTeam* get_internal_ro_team(rocshmem_team_t team);
IPCTeam* get_internal_ipc_team(rocshmem_team_t team);