From 658bf2a3b5e0f1b7ec7fa837529a947502b69bd3 Mon Sep 17 00:00:00 2001 From: Yiltan Date: Mon, 24 Mar 2025 09:04:52 -0400 Subject: [PATCH] Removed GPU_IB (#59) --- CMakeLists.txt | 20 - README.md | 10 +- cmake/Modules/FindIbverbs.cmake | 62 - cmake/rocshmem_config.h.in | 2 - scripts/build_configs/dc_multi | 27 - scripts/build_configs/dc_multi_debug | 27 - scripts/build_configs/dc_multi_ipc | 27 - scripts/build_configs/dc_multi_profile | 27 - scripts/build_configs/dc_single | 27 - scripts/build_configs/ipc_single | 2 - scripts/build_configs/ipc_tests_only | 2 - scripts/build_configs/rc_multi | 27 - scripts/build_configs/rc_multi_debug | 27 - scripts/build_configs/rc_multi_wf_coal | 27 - scripts/build_configs/rc_single | 28 - scripts/build_configs/rc_single_debug | 27 - scripts/build_configs/rc_single_managed | 28 - scripts/build_configs/rc_single_managed_debug | 28 - scripts/build_configs/rc_single_profile | 27 - scripts/build_configs/rc_single_single_node | 30 - .../build_configs/rc_single_single_node_debug | 30 - scripts/build_configs/ro_ipc | 2 - scripts/build_configs/ro_net | 2 - scripts/build_configs/ro_net_debug | 2 - scripts/functional_tests/driver.py | 5 +- src/CMakeLists.txt | 4 +- src/backend_bc.cpp | 12 +- src/backend_bc.hpp | 2 +- src/backend_type.hpp | 30 +- src/context_incl.hpp | 5 +- src/context_tmpl_device.hpp | 4 +- src/context_tmpl_host.hpp | 4 +- src/gpu_ib/CMakeLists.txt | 45 - src/gpu_ib/backend_ib.cpp | 493 -------- src/gpu_ib/backend_ib.hpp | 351 ------ src/gpu_ib/connect_tests.cpp | 54 - src/gpu_ib/connection.cpp | 431 ------- src/gpu_ib/connection.hpp | 259 ---- src/gpu_ib/connection_policy.cpp | 81 -- src/gpu_ib/connection_policy.hpp | 130 -- src/gpu_ib/context_ib_device.cpp | 371 ------ src/gpu_ib/context_ib_device.hpp | 304 ----- src/gpu_ib/context_ib_device_coll.cpp | 122 -- src/gpu_ib/context_ib_host.cpp | 85 -- src/gpu_ib/context_ib_host.hpp | 149 --- src/gpu_ib/context_ib_tmpl_device.hpp | 1097 ----------------- src/gpu_ib/context_ib_tmpl_host.hpp | 173 --- src/gpu_ib/debug.cpp | 39 - src/gpu_ib/dynamic_connection.cpp | 381 ------ src/gpu_ib/dynamic_connection.hpp | 122 -- src/gpu_ib/endian.cpp | 79 -- src/gpu_ib/endian.hpp | 53 - src/gpu_ib/gpu_ib_team.cpp | 56 - src/gpu_ib/gpu_ib_team.hpp | 50 - src/gpu_ib/infiniband_structs.hpp | 49 - src/gpu_ib/memory_builder_policy.hpp | 78 -- src/gpu_ib/network_policy.cpp | 500 -------- src/gpu_ib/network_policy.hpp | 357 ------ src/gpu_ib/qe_dumper.cpp | 79 -- src/gpu_ib/qe_dumper.hpp | 66 - src/gpu_ib/queue_pair.cpp | 437 ------- src/gpu_ib/queue_pair.hpp | 431 ------- src/gpu_ib/reliable_connection.cpp | 201 --- src/gpu_ib/reliable_connection.hpp | 84 -- src/gpu_ib/segment_builder.cpp | 138 --- src/gpu_ib/segment_builder.hpp | 64 - src/gpu_ib/thread_policy.cpp | 358 ------ src/gpu_ib/thread_policy.hpp | 176 --- src/ipc/backend_ipc.cpp | 4 +- src/ipc/context_ipc_device.hpp | 2 +- src/memory/symmetric_heap.hpp | 2 +- src/reverse_offload/context_ro_tmpl_host.hpp | 14 +- src/rocshmem.cpp | 10 +- src/rocshmem_gpu.cpp | 4 +- src/team.cpp | 4 - src/team.hpp | 5 +- 76 files changed, 33 insertions(+), 8539 deletions(-) delete mode 100644 cmake/Modules/FindIbverbs.cmake delete mode 100755 scripts/build_configs/dc_multi delete mode 100755 scripts/build_configs/dc_multi_debug delete mode 100755 scripts/build_configs/dc_multi_ipc delete mode 100755 scripts/build_configs/dc_multi_profile delete mode 100755 scripts/build_configs/dc_single delete mode 100755 scripts/build_configs/rc_multi delete mode 100755 scripts/build_configs/rc_multi_debug delete mode 100755 scripts/build_configs/rc_multi_wf_coal delete mode 100755 scripts/build_configs/rc_single delete mode 100755 scripts/build_configs/rc_single_debug delete mode 100755 scripts/build_configs/rc_single_managed delete mode 100755 scripts/build_configs/rc_single_managed_debug delete mode 100755 scripts/build_configs/rc_single_profile delete mode 100755 scripts/build_configs/rc_single_single_node delete mode 100755 scripts/build_configs/rc_single_single_node_debug delete mode 100644 src/gpu_ib/CMakeLists.txt delete mode 100644 src/gpu_ib/backend_ib.cpp delete mode 100644 src/gpu_ib/backend_ib.hpp delete mode 100644 src/gpu_ib/connect_tests.cpp delete mode 100644 src/gpu_ib/connection.cpp delete mode 100644 src/gpu_ib/connection.hpp delete mode 100644 src/gpu_ib/connection_policy.cpp delete mode 100644 src/gpu_ib/connection_policy.hpp delete mode 100644 src/gpu_ib/context_ib_device.cpp delete mode 100644 src/gpu_ib/context_ib_device.hpp delete mode 100644 src/gpu_ib/context_ib_device_coll.cpp delete mode 100644 src/gpu_ib/context_ib_host.cpp delete mode 100644 src/gpu_ib/context_ib_host.hpp delete mode 100644 src/gpu_ib/context_ib_tmpl_device.hpp delete mode 100644 src/gpu_ib/context_ib_tmpl_host.hpp delete mode 100644 src/gpu_ib/debug.cpp delete mode 100644 src/gpu_ib/dynamic_connection.cpp delete mode 100644 src/gpu_ib/dynamic_connection.hpp delete mode 100644 src/gpu_ib/endian.cpp delete mode 100644 src/gpu_ib/endian.hpp delete mode 100644 src/gpu_ib/gpu_ib_team.cpp delete mode 100644 src/gpu_ib/gpu_ib_team.hpp delete mode 100644 src/gpu_ib/infiniband_structs.hpp delete mode 100644 src/gpu_ib/memory_builder_policy.hpp delete mode 100644 src/gpu_ib/network_policy.cpp delete mode 100644 src/gpu_ib/network_policy.hpp delete mode 100644 src/gpu_ib/qe_dumper.cpp delete mode 100644 src/gpu_ib/qe_dumper.hpp delete mode 100644 src/gpu_ib/queue_pair.cpp delete mode 100644 src/gpu_ib/queue_pair.hpp delete mode 100644 src/gpu_ib/reliable_connection.cpp delete mode 100644 src/gpu_ib/reliable_connection.hpp delete mode 100644 src/gpu_ib/segment_builder.cpp delete mode 100644 src/gpu_ib/segment_builder.hpp delete mode 100644 src/gpu_ib/thread_policy.cpp delete mode 100644 src/gpu_ib/thread_policy.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 599aeada61..9d29220c5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,9 +39,7 @@ endif() ############################################################################### option(DEBUG "Enable debug trace" OFF) option(PROFILE "Enable statistics and timing support" OFF) -option(USE_GPU_IB "Enable GPU_IB conduit." ON) option(USE_RO "Enable RO conduit." ON) -option(USE_DC "Enable IB dynamically connected transport (DC)" OFF) option(USE_IPC "Enable IPC support (using HIP)" OFF) option(USE_THREADS "Enable workgroup threads to share network queues" OFF) option(USE_WF_COAL "Enable wavefront message coalescing" OFF) @@ -158,10 +156,6 @@ if (NOT BUILD_TESTS_ONLY) find_package(hip REQUIRED) find_package(hsa-runtime64 REQUIRED) - if (USE_GPU_IB) - find_package(Ibverbs REQUIRED) - endif() - set(CMAKE_THREAD_PREFER_PTHREAD TRUE) set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) @@ -188,20 +182,6 @@ if (NOT BUILD_TESTS_ONLY) hip::host hsa-runtime64::hsa-runtime64 ) - - if (USE_GPU_IB) - target_include_directories( - ${PROJECT_NAME} - PUBLIC - ${IBVERBS_INCLUDE_DIRS} - ) - - target_link_libraries( - ${PROJECT_NAME} - PUBLIC - ${IBVERBS_LIBRARIES} - ) - endif() endif() ############################################################################### diff --git a/README.md b/README.md index ed8bff6821..cc21118ede 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,11 @@ code complexity and enables more fine-grained communication/computation overlap than traditional host-driven networking. rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories. -There are currently three backends for rocSHMEM; -IPC, Reverse Offload (RO), and GPU-IB. +There are currently two backends for rocSHMEM; +IPC and Reverse Offload (RO). The backends primarily differ in their implementations of intra-kernel networking. Currently, only the IPC backend is supported. -The RO and GPU-IB backends are provided as-is with +The RO backend is provided as-is with no guarantees of support from AMD or AMD Research. The IPC backend implements communication primitives using load/store operations issued from the GPU. @@ -21,10 +21,6 @@ to the host-side runtime, which calls into a traditional MPI or OpenSHMEM implementation. This forwarding of requests is transparent to the programmer, who only sees the GPU-side interface. -The GPU InfiniBand (GPU-IB) backend implements a lightweight InfiniBand verbs interface -on the GPU. The GPU itself is responsible for building commands and ringing -the doorbell on the NIC to send network commands. - ## Requirements rocSHMEM base requirements: diff --git a/cmake/Modules/FindIbverbs.cmake b/cmake/Modules/FindIbverbs.cmake deleted file mode 100644 index 46c17f31a5..0000000000 --- a/cmake/Modules/FindIbverbs.cmake +++ /dev/null @@ -1,62 +0,0 @@ -############################################################################### -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. -############################################################################### - -find_package(PkgConfig REQUIRED QUIET) -pkg_check_modules(PC_IBVERBS QUIET libibverbs) - -find_path( - IBVERBS_INCLUDE_DIR infiniband/verbs.h - HINTS ${PC_IBVERBS_INCLUDEDIR} ${PC_IBVERBS_INCLUDE_DIRS} - PATH_SUFFIXES include -) - -find_library( - IBVERBS_LIBRARY - NAMES ibverbs libibverbs - HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS} - PATH_SUFFIXES lib lib64 -) - -find_library( - MLX5_LIBRARY - NAMES mlx5 libmlx5 - HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS} - PATH_SUFFIXES lib lib64 -) - -set( - IBVERBS_LIBRARIES - ${IBVERBS_LIBRARY} ${MLX5_LIBRARY} - CACHE INTERNAL "" -) - -set( - IBVERBS_INCLUDE_DIRS - ${IBVERBS_INCLUDE_DIR} - CACHE INTERNAL "" -) - -find_package_handle_standard_args( - Ibverbs DEFAULT_MSG IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR -) - -mark_as_advanced(IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR) diff --git a/cmake/rocshmem_config.h.in b/cmake/rocshmem_config.h.in index 8067651a01..d3964ef722 100644 --- a/cmake/rocshmem_config.h.in +++ b/cmake/rocshmem_config.h.in @@ -1,8 +1,6 @@ #cmakedefine DEBUG #cmakedefine PROFILE -#cmakedefine USE_GPU_IB #cmakedefine USE_RO -#cmakedefine USE_DC #cmakedefine USE_IPC #cmakedefine USE_THREADS #cmakedefine USE_SHARED_CTX diff --git a/scripts/build_configs/dc_multi b/scripts/build_configs/dc_multi deleted file mode 100755 index 4e806e7f85..0000000000 --- a/scripts/build_configs/dc_multi +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=ON \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=ON \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/dc_multi_debug b/scripts/build_configs/dc_multi_debug deleted file mode 100755 index e95ed54a6a..0000000000 --- a/scripts/build_configs/dc_multi_debug +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DDEBUG=ON \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=ON \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=ON \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/dc_multi_ipc b/scripts/build_configs/dc_multi_ipc deleted file mode 100755 index a154f34667..0000000000 --- a/scripts/build_configs/dc_multi_ipc +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=ON \ - -DUSE_IPC=ON \ - -DUSE_THREADS=ON \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/dc_multi_profile b/scripts/build_configs/dc_multi_profile deleted file mode 100755 index 37954c8342..0000000000 --- a/scripts/build_configs/dc_multi_profile +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=ON \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=ON \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=ON \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/dc_single b/scripts/build_configs/dc_single deleted file mode 100755 index 1cc14f71ef..0000000000 --- a/scripts/build_configs/dc_single +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=ON \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/ipc_single b/scripts/build_configs/ipc_single index dc6b0c522b..a0ff287adc 100755 --- a/scripts/build_configs/ipc_single +++ b/scripts/build_configs/ipc_single @@ -17,9 +17,7 @@ cmake \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DDEBUG=OFF \ -DPROFILE=OFF \ - -DUSE_GPU_IB=OFF \ -DUSE_RO=OFF \ - -DUSE_DC=OFF \ -DUSE_IPC=ON \ -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ diff --git a/scripts/build_configs/ipc_tests_only b/scripts/build_configs/ipc_tests_only index d725da97ef..f111cf1349 100755 --- a/scripts/build_configs/ipc_tests_only +++ b/scripts/build_configs/ipc_tests_only @@ -22,9 +22,7 @@ cmake \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DDEBUG=OFF \ -DPROFILE=OFF \ - -DUSE_GPU_IB=OFF \ -DUSE_RO=OFF \ - -DUSE_DC=OFF \ -DUSE_IPC=ON \ -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ diff --git a/scripts/build_configs/rc_multi b/scripts/build_configs/rc_multi deleted file mode 100755 index f38514dea7..0000000000 --- a/scripts/build_configs/rc_multi +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=ON \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_multi_debug b/scripts/build_configs/rc_multi_debug deleted file mode 100755 index 8937d426ea..0000000000 --- a/scripts/build_configs/rc_multi_debug +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=ON \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_multi_wf_coal b/scripts/build_configs/rc_multi_wf_coal deleted file mode 100755 index cfa1acaf90..0000000000 --- a/scripts/build_configs/rc_multi_wf_coal +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=ON \ - -DUSE_WF_COAL=ON \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_single b/scripts/build_configs/rc_single deleted file mode 100755 index ad378880e1..0000000000 --- a/scripts/build_configs/rc_single +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_COHERENT_HEAP=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_single_debug b/scripts/build_configs/rc_single_debug deleted file mode 100755 index cf75f64819..0000000000 --- a/scripts/build_configs/rc_single_debug +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DDEBUG=ON \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_single_managed b/scripts/build_configs/rc_single_managed deleted file mode 100755 index 963fe7260b..0000000000 --- a/scripts/build_configs/rc_single_managed +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - -DUSE_MANAGED_HEAP=ON \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_single_managed_debug b/scripts/build_configs/rc_single_managed_debug deleted file mode 100755 index fd74cf904e..0000000000 --- a/scripts/build_configs/rc_single_managed_debug +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DDEBUG=ON \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - -DUSE_MANAGED_HEAP=ON \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_single_profile b/scripts/build_configs/rc_single_profile deleted file mode 100755 index ff6c0194f6..0000000000 --- a/scripts/build_configs/rc_single_profile +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DDEBUG=OFF \ - -DPROFILE=ON \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_single_single_node b/scripts/build_configs/rc_single_single_node deleted file mode 100755 index 4f1c56f79a..0000000000 --- a/scripts/build_configs/rc_single_single_node +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=ON \ - -DUSE_COHERENT_HEAP=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - -DUSE_SINGLE_NODE=ON \ - -DUSE_HOST_SIDE_HDP_FLUSH=ON\ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/rc_single_single_node_debug b/scripts/build_configs/rc_single_single_node_debug deleted file mode 100755 index 60292ca594..0000000000 --- a/scripts/build_configs/rc_single_single_node_debug +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -set -e - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_GPU_IB=ON \ - -DUSE_DC=OFF \ - -DUSE_IPC=ON \ - -DUSE_COHERENT_HEAP=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - -DUSE_SINGLE_NODE=ON \ - -DUSE_HOST_SIDE_HDP_FLUSH=ON\ - $src_path -cmake --build . --parallel 8 -cmake --install . diff --git a/scripts/build_configs/ro_ipc b/scripts/build_configs/ro_ipc index cfe359eb57..cb68279158 100755 --- a/scripts/build_configs/ro_ipc +++ b/scripts/build_configs/ro_ipc @@ -17,8 +17,6 @@ cmake \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DDEBUG=OFF \ -DPROFILE=OFF \ - -DUSE_GPU_IB=OFF \ - -DUSE_DC=OFF \ -DUSE_IPC=ON \ -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ diff --git a/scripts/build_configs/ro_net b/scripts/build_configs/ro_net index 240e7735b5..a8de47c368 100755 --- a/scripts/build_configs/ro_net +++ b/scripts/build_configs/ro_net @@ -17,8 +17,6 @@ cmake \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DDEBUG=OFF \ -DPROFILE=OFF \ - -DUSE_GPU_IB=OFF \ - -DUSE_DC=OFF \ -DUSE_IPC=OFF \ -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ diff --git a/scripts/build_configs/ro_net_debug b/scripts/build_configs/ro_net_debug index 54e0309730..b9f1766629 100755 --- a/scripts/build_configs/ro_net_debug +++ b/scripts/build_configs/ro_net_debug @@ -17,8 +17,6 @@ cmake \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DDEBUG=OFF \ -DPROFILE=OFF \ - -DUSE_GPU_IB=OFF \ - -DUSE_DC=OFF \ -DUSE_IPC=OFF \ -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ diff --git a/scripts/functional_tests/driver.py b/scripts/functional_tests/driver.py index 8e2894a126..b6de00ff2a 100755 --- a/scripts/functional_tests/driver.py +++ b/scripts/functional_tests/driver.py @@ -145,12 +145,9 @@ def determine_algos_from_library_config_type(config): if config['algorithms']: return config - gpu_ib = re.match('^[rd]c_', config['library_build_config_type']) thread_single = re.match('.*single.*', config['library_build_config_type']) - if not gpu_ib: - config['algorithms'] = reverse_offload_algorithms - elif thread_single: + if thread_single: config['algorithms'] = single_thread_algorithms else: config['algorithms'] = multi_thread_algorithms diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3a02f783b5..8b928eef68 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -58,9 +58,7 @@ target_compile_options( ############################################################################### # ROCSHMEM TARGET FOR BACKENDS ############################################################################### -IF (USE_GPU_IB) -add_subdirectory(gpu_ib) -ELSEIF(USE_RO) +IF (USE_RO) add_subdirectory(reverse_offload) ELSE() add_subdirectory(ipc) diff --git a/src/backend_bc.cpp b/src/backend_bc.cpp index 23d085070a..d8c3ecaac1 100644 --- a/src/backend_bc.cpp +++ b/src/backend_bc.cpp @@ -25,9 +25,7 @@ #include "backend_type.hpp" #include "context_incl.hpp" -#ifdef USE_GPU_IB -#include "gpu_ib/backend_ib.hpp" -#elif defined(USE_RO) +#ifdef USE_RO #include "reverse_offload/backend_ro.hpp" #else #include "ipc/backend_ipc.hpp" @@ -203,9 +201,7 @@ void Backend::reset_stats() { } __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) { -#ifdef USE_GPU_IB - return static_cast(this)->create_ctx(option, ctx); -#elif defined(USE_RO) +#ifdef USE_RO return static_cast(this)->create_ctx(option, ctx); #else return static_cast(this)->create_ctx(option, ctx); @@ -213,9 +209,7 @@ __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) { } __device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) { -#ifdef USE_GPU_IB - static_cast(this)->destroy_ctx(ctx); -#elif defined(USE_RO) +#ifdef USE_RO static_cast(this)->destroy_ctx(ctx); #else static_cast(this)->destroy_ctx(ctx); diff --git a/src/backend_bc.hpp b/src/backend_bc.hpp index ccfd81c6f3..6b71293ce2 100644 --- a/src/backend_bc.hpp +++ b/src/backend_bc.hpp @@ -278,7 +278,7 @@ class Backend { * rely on the normal inheritance mechanism to tailor behavior for * derived backend types. */ - BackendType type{BackendType::GPU_IB_BACKEND}; + BackendType type{BackendType::RO_BACKEND}; /** * @brief Dumps derived class statistics. diff --git a/src/backend_type.hpp b/src/backend_type.hpp index 020b956f57..6b75d60cb7 100644 --- a/src/backend_type.hpp +++ b/src/backend_type.hpp @@ -44,7 +44,7 @@ namespace rocshmem { * @note Derived classes which use Backend as a base class must add * themselves to this enum class to support static polymorphism. */ -enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND }; +enum class BackendType { RO_BACKEND, IPC_BACKEND }; /** * @brief Helper macro for some dispatch calls @@ -54,10 +54,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND }; /** * @brief Device static dispatch method call. */ -#ifdef USE_GPU_IB -#define DISPATCH(Func) \ - static_cast(this)->Func; -#elif defined(USE_RO) +#ifdef USE_RO #define DISPATCH(Func) \ static_cast(this)->Func; #else @@ -68,11 +65,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND }; /** * @brief Device static dispatch method call with a return value. */ -#ifdef USE_GPU_IB -#define DISPATCH_RET(Func) \ - auto ret_val = static_cast(this)->Func; \ - return ret_val; -#elif defined(USE_RO) +#ifdef USE_RO #define DISPATCH_RET(Func) \ auto ret_val = static_cast(this)->Func; \ return ret_val; @@ -85,12 +78,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND }; /** * @brief Device static dispatch method call with a return type of pointer. */ -#ifdef USE_GPU_IB -#define DISPATCH_RET_PTR(Func) \ - void *ret_val{nullptr}; \ - ret_val = static_cast(this)->Func; \ - return ret_val; -#elif defined(USE_RO) +#ifdef USE_RO #define DISPATCH_RET_PTR(Func) \ void *ret_val{nullptr}; \ ret_val = static_cast(this)->Func; \ @@ -109,9 +97,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND }; * MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and * threading semantics of collectives in OpenSHMEM match those of MPI. */ -#ifdef USE_GPU_IB -#define HOST_DISPATCH(Func) static_cast(this)->Func; -#elif defined(USE_RO) +#ifdef USE_RO #define HOST_DISPATCH(Func) static_cast(this)->Func; #else #define HOST_DISPATCH(Func) static_cast(this)->Func; @@ -124,11 +110,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND }; * threading semantics of collectives in OpenSHMEM match those of MPI. */ -#ifdef USE_GPU_IB -#define HOST_DISPATCH_RET(Func) \ - auto ret_val = static_cast(this)->Func; \ - return ret_val; -#elif defined(USE_RO) +#ifdef USE_RO #define HOST_DISPATCH_RET(Func) \ auto ret_val = static_cast(this)->Func; \ return ret_val; diff --git a/src/context_incl.hpp b/src/context_incl.hpp index f86ca3dfe8..0794397670 100644 --- a/src/context_incl.hpp +++ b/src/context_incl.hpp @@ -26,10 +26,7 @@ #include "context.hpp" #include "context_tmpl_device.hpp" #include "context_tmpl_host.hpp" -#ifdef USE_GPU_IB -#include "gpu_ib/context_ib_device.hpp" -#include "gpu_ib/context_ib_host.hpp" -#elif defined (USE_RO) +#ifdef USE_RO #include "reverse_offload/context_ro_device.hpp" #include "reverse_offload/context_ro_host.hpp" #else diff --git a/src/context_tmpl_device.hpp b/src/context_tmpl_device.hpp index a9c1dda624..e57884d602 100644 --- a/src/context_tmpl_device.hpp +++ b/src/context_tmpl_device.hpp @@ -25,9 +25,7 @@ #include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_type.hpp" -#ifdef USE_GPU_IB -#include "gpu_ib/context_ib_device.hpp" -#elif defined(USE_RO) +#ifdef USE_RO #include "reverse_offload/context_ro_device.hpp" #else #include "ipc/context_ipc_device.hpp" diff --git a/src/context_tmpl_host.hpp b/src/context_tmpl_host.hpp index 8bc913f2fc..e24f34c19d 100644 --- a/src/context_tmpl_host.hpp +++ b/src/context_tmpl_host.hpp @@ -25,9 +25,7 @@ #include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_type.hpp" -#ifdef USE_GPU_IB -#include "gpu_ib/context_ib_host.hpp" -#elif defined(USE_RO) +#ifdef USE_RO #include "reverse_offload/context_ro_host.hpp" #else #include "ipc/context_ipc_host.hpp" diff --git a/src/gpu_ib/CMakeLists.txt b/src/gpu_ib/CMakeLists.txt deleted file mode 100644 index 34848b1ca8..0000000000 --- a/src/gpu_ib/CMakeLists.txt +++ /dev/null @@ -1,45 +0,0 @@ -############################################################################### -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. -############################################################################### - -############################################################################### -# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY -############################################################################### -target_sources( - ${PROJECT_NAME} - PRIVATE - backend_ib.cpp - connection.cpp - connection_policy.cpp - context_ib_device.cpp - context_ib_device_coll.cpp - context_ib_host.cpp - debug.cpp - dynamic_connection.cpp - endian.cpp - gpu_ib_team.cpp - network_policy.cpp - qe_dumper.cpp - queue_pair.cpp - reliable_connection.cpp - segment_builder.cpp - thread_policy.cpp -) diff --git a/src/gpu_ib/backend_ib.cpp b/src/gpu_ib/backend_ib.cpp deleted file mode 100644 index 7a757b67da..0000000000 --- a/src/gpu_ib/backend_ib.cpp +++ /dev/null @@ -1,493 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "backend_ib.hpp" - -#include -#include -#include - -#include -#include -#include // NOLINT(build/c++11) - -#include "rocshmem/rocshmem.hpp" -#include "../backend_type.hpp" -#include "../context_incl.hpp" -#include "gpu_ib_team.hpp" -#include "queue_pair.hpp" -#include "../host/host.hpp" - -namespace rocshmem { - -#define NET_CHECK(cmd) \ - { \ - if (cmd != MPI_SUCCESS) { \ - fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \ - abort(); \ - } \ - } - -extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; - -rocshmem_team_t get_external_team(GPUIBTeam *team) { - return reinterpret_cast(team); -} - -int get_ls_non_zero_bit(char *bitmask, int mask_length) { - int position = -1; - - for (int bit_i = 0; bit_i < mask_length; bit_i++) { - int byte_i = bit_i / CHAR_BIT; - if (bitmask[byte_i] & (1 << (bit_i % CHAR_BIT))) { - position = bit_i; - break; - } - } - - return position; -} - -GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() { - if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) { - std::stringstream sstream(maximum_num_contexts_str); - sstream >> maximum_num_contexts_; - } - num_blocks_ = maximum_num_contexts_; - - init_mpi_once(comm); - - type = BackendType::GPU_IB_BACKEND; - - NET_CHECK(MPI_Comm_dup(backend_comm, &gpu_ib_comm_world)); - NET_CHECK(MPI_Comm_size(gpu_ib_comm_world, &num_pes)); - NET_CHECK(MPI_Comm_rank(gpu_ib_comm_world, &my_pe)); - - /* Initialize the host interface */ - host_interface = - new HostInterface(hdp_proxy_.get(), gpu_ib_comm_world, &heap); - - /* - * Construct default host context independently of the - * default device context (done in the async thread) - * so that host operations can execute regardless of - * device operations. - */ - setup_default_host_ctx(); - - setup_team_world(); - - rocshmem_collective_init(); - - teams_init(); - - // MPI_Comm_dup(gpu_ib_comm_world, &thread_comm); - thread_comm = gpu_ib_comm_world; - - NET_CHECK(MPI_Barrier(gpu_ib_comm_world)); - - worker_thread_exit = false; - -#ifdef USE_HOST_SIDE_HDP_FLUSH - hdp_gpu_cpu_flush_flag_ = - static_cast(rocshmem_malloc(sizeof(unsigned int))); - hdp_policy->set_flush_polling_ptr(hdp_gpu_cpu_flush_flag_); - hdp_flush_worker_thread = std::thread(&GPUIBBackend::hdp_flush_poll, this); - - // We can now initialize and set the HDP window in the host interface - host_interface->create_hdp_window(); - -#endif - - // commenting out the async thread as there is some issues with ROCm - // this makes the CPU init blocking - // async_thread_ = thread_spawn(this); - thread_func_internal(this); -} - -__device__ bool GPUIBBackend::create_ctx(int64_t options, - rocshmem_ctx_t *ctx) { - GPUIBContext *ctx_; - - auto pop_result = ctx_free_list.get()->pop_front(); - if (!pop_result.success) { - return false; - } - ctx_ = pop_result.value; - ctx->ctx_opaque = ctx_; - return true; -} - -void GPUIBBackend::ctx_create(int64_t options, void **ctx) { - GPUIBHostContext *new_ctx = nullptr; - - new_ctx = new GPUIBHostContext(this, options); - - *ctx = new_ctx; -} - -GPUIBHostContext *get_internal_gpu_ib_ctx(Context *ctx) { - return reinterpret_cast(ctx); -} - -void GPUIBBackend::ctx_destroy(Context *ctx) { - GPUIBHostContext *gpu_ib_host_ctx = get_internal_gpu_ib_ctx(ctx); - delete gpu_ib_host_ctx; -} - -__device__ void GPUIBBackend::destroy_ctx(rocshmem_ctx_t *ctx) { - ctx_free_list.get()->push_back(static_cast(ctx->ctx_opaque)); -} - -GPUIBBackend::~GPUIBBackend() { - // need to get this back once ROCm is fixed - // async_thread_.join(); - - worker_thread_exit = true; - -#ifdef USE_HOST_SIDE_HDP_FLUSH - hdp_flush_worker_thread.join(); - hdp_policy->set_flush_polling_ptr(nullptr); - rocshmem_free(hdp_gpu_cpu_flush_flag_); -#endif - - /** - * Destroy teams infrastructure - * and team world - */ - teams_destroy(); - auto *team_world{team_tracker.get_team_world()}; - team_world->~Team(); - CHECK_HIP(hipFree(team_world)); - - delete default_host_ctx_; - - NET_CHECK(MPI_Comm_free(&gpu_ib_comm_world)); - - CHECK_HIP(hipFree(default_ctx_->device_qp_proxy)); - CHECK_HIP(hipFree(default_ctx_)); - default_ctx_ = nullptr; - - delete host_interface; - host_interface = nullptr; - - networkImpl.networkHostFinalize(); - - CHECK_HIP(hipFree(ctx_array)); -} - -__host__ void GPUIBBackend::global_exit(int status) { - MPI_Abort(gpu_ib_comm_world, status); -} - -void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team, - TeamInfo *team_info_wrt_parent, - TeamInfo *team_info_wrt_world, int num_pes, - int my_pe_in_new_team, MPI_Comm team_comm, - rocshmem_team_t *new_team) { - /** - * Read the bit mask and find out a common index into - * the pool of available work arrays. - */ - NET_CHECK(MPI_Allreduce(pool_bitmask_, reduced_bitmask_, bitmask_size_, - MPI_CHAR, MPI_BAND, team_comm)); - - /* Pick the least significant non-zero bit (logical layout) in the reduced - * bitmask */ - auto max_num_teams{team_tracker.get_max_num_teams()}; - int common_index = get_ls_non_zero_bit(reduced_bitmask_, max_num_teams); - if (common_index < 0) { - /* No team available */ - abort(); - } - - /* Mark the team as taken (by unsetting the bit in the pool bitmask) */ - int byte = common_index / CHAR_BIT; - pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT)); - - /** - * Allocate device-side memory for team_world and - * construct a GPU_IB team in it - */ - GPUIBTeam *new_team_obj; - CHECK_HIP(hipMalloc(&new_team_obj, sizeof(GPUIBTeam))); - new (new_team_obj) - GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes, - my_pe_in_new_team, team_comm, common_index); - - *new_team = get_external_team(new_team_obj); -} - -void GPUIBBackend::team_destroy(rocshmem_team_t team) { - GPUIBTeam *team_obj = get_internal_gpu_ib_team(team); - - /* Mark the pool as available */ - int bit = team_obj->pool_index_; - int byte_i = bit / CHAR_BIT; - pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT); - - team_obj->~GPUIBTeam(); - CHECK_HIP(hipFree(team_obj)); -} - -void GPUIBBackend::dump_backend_stats() { - networkImpl.dump_backend_stats(&globalStats); -} - -void GPUIBBackend::reset_backend_stats() { networkImpl.reset_backend_stats(); } - -void GPUIBBackend::initialize_ipc() { - ipcImpl.ipcHostInit(my_pe, heap.get_heap_bases(), thread_comm); -} - -void GPUIBBackend::initialize_network() { networkImpl.networkHostSetup(this); } - -void GPUIBBackend::setup_default_host_ctx() { - default_host_ctx_ = new GPUIBHostContext(this, 0); - ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_; -} - -void GPUIBBackend::setup_ctxs() { - /* - * Allocate device-side memory for all context and construct an - * InfiniBand context in it. - */ - CHECK_HIP( - hipMalloc(&ctx_array, sizeof(GPUIBContext) * maximum_num_contexts_)); - for (int i = 0; i < maximum_num_contexts_; i++) { - new (&ctx_array[i]) GPUIBContext(this, false, i); - ctx_free_list.get()->push_back(ctx_array + i); - } -} - -void GPUIBBackend::setup_default_ctx() { - /* - * Allocate device-side memory for default context and construct an - * InfiniBand context in it. - */ - CHECK_HIP(hipMalloc(&default_ctx_, sizeof(GPUIBContext))); - new (default_ctx_) GPUIBContext(this, true, 0); - - /* - * Set the ROCSHMEM_CTX_DEFAULT in constant memory. - */ - int *symbol_address; - CHECK_HIP(hipGetSymbolAddress(reinterpret_cast(&symbol_address), - HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT))); - - TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world; - rocshmem_ctx_t ctx_default_host{default_ctx_, tinfo}; - - hipStream_t stream; - CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - CHECK_HIP(hipMemcpyAsync(symbol_address, &ctx_default_host, - sizeof(rocshmem_ctx_t), hipMemcpyDefault, stream)); - CHECK_HIP(hipStreamSynchronize(stream)); - CHECK_HIP(hipStreamDestroy(stream)); -} - -void GPUIBBackend::setup_team_world() { - TeamInfo *team_info_wrt_parent, *team_info_wrt_world; - - /** - * Allocate device-side memory for team_world and construct a - * GPU_IB team in it. - */ - CHECK_HIP(hipMalloc(&team_info_wrt_parent, sizeof(TeamInfo))); - CHECK_HIP(hipMalloc(&team_info_wrt_world, sizeof(TeamInfo))); - - new (team_info_wrt_parent) TeamInfo(nullptr, 0, 1, num_pes); - new (team_info_wrt_world) TeamInfo(nullptr, 0, 1, num_pes); - - MPI_Comm team_world_comm; - NET_CHECK(MPI_Comm_dup(gpu_ib_comm_world, &team_world_comm)); - - GPUIBTeam *team_world{nullptr}; - CHECK_HIP(hipMalloc(&team_world, sizeof(GPUIBTeam))); - new (team_world) GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world, - num_pes, my_pe, team_world_comm, 0); - team_tracker.set_team_world(team_world); - - /** - * Copy the address to ROCSHMEM_TEAM_WORLD. - */ - ROCSHMEM_TEAM_WORLD = reinterpret_cast(team_world); -} - -void GPUIBBackend::init_mpi_once(MPI_Comm comm) { - static std::mutex init_mutex; - const std::lock_guard lock(init_mutex); - - int init_done = 0; - NET_CHECK(MPI_Initialized(&init_done)); - if (init_done == 0) { - int provided; - NET_CHECK( - MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided)); - } - if (comm == MPI_COMM_NULL) { - NET_CHECK(MPI_Comm_dup(MPI_COMM_WORLD, &backend_comm)); - } else { - NET_CHECK(MPI_Comm_dup(comm, &backend_comm)); - } -} - -std::thread GPUIBBackend::thread_spawn(GPUIBBackend *b) { - return std::thread(&GPUIBBackend::thread_func_internal, this, b); -} - -void GPUIBBackend::thread_func_internal(GPUIBBackend *b) { - CHECK_HIP(hipSetDevice(hip_dev_id)); - - b->initialize_ipc(); - b->initialize_network(); - b->setup_ctxs(); - b->setup_default_ctx(); - *(b->done_init) = 1; -} - -#ifdef USE_HOST_SIDE_HDP_FLUSH -void GPUIBBackend::hdp_flush_poll() { - while (!worker_thread_exit) { - if (hdp_policy->has_active_flush_request()) { - hdp_policy->hdp_flush(); - hdp_policy->clear_active_flush_flag(); - } - } -} -#endif - -void GPUIBBackend::teams_init() { - /** - * Allocate pools for the teams sync and work arrary from the SHEAP. - */ - auto max_num_teams{team_tracker.get_max_num_teams()}; - barrier_pSync_pool = reinterpret_cast(rocshmem_malloc( - sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams)); - reduce_pSync_pool = reinterpret_cast(rocshmem_malloc( - sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams)); - bcast_pSync_pool = reinterpret_cast(rocshmem_malloc( - sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams)); - alltoall_pSync_pool = reinterpret_cast(rocshmem_malloc( - sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE * max_num_teams)); - - /* Accommodating for largest possible data type for pWrk */ - pWrk_pool = rocshmem_malloc( - sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams); - pAta_pool = rocshmem_malloc(sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE * - max_num_teams); - - /** - * Initialize the sync arrays in the pool with default values. - */ - long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync; - for (int team_i = 0; team_i < max_num_teams; team_i++) { - barrier_pSync = reinterpret_cast( - &barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]); - reduce_pSync = reinterpret_cast( - &reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]); - bcast_pSync = reinterpret_cast( - &bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]); - alltoall_pSync = reinterpret_cast( - &alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]); - - for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) { - barrier_pSync[i] = ROCSHMEM_SYNC_VALUE; - } - for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) { - reduce_pSync[i] = ROCSHMEM_SYNC_VALUE; - } - for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) { - bcast_pSync[i] = ROCSHMEM_SYNC_VALUE; - } - for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) { - alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE; - } - } - - /** - * Initialize bit mask - * - * Logical: - * MSB..........................................................................LSB - * Physical: MSB...1st least significant 8 bits...LSB MSB...2nd least - * signifant 8 bits...LSB - * - * Description shows only a 2-byte long mask but idea extends to any - * arbitrary size. - */ - bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1) - : (max_num_teams / CHAR_BIT); - pool_bitmask_ = reinterpret_cast(malloc(bitmask_size_)); - reduced_bitmask_ = reinterpret_cast(malloc(bitmask_size_)); - - memset(pool_bitmask_, 0, bitmask_size_); - memset(reduced_bitmask_, 0, bitmask_size_); - /* Set all to available except the 0th one (reserved for TEAM_WORLD) */ - for (int bit_i = 1; bit_i < max_num_teams; bit_i++) { - int byte_i = bit_i / CHAR_BIT; - - pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT); - } - - /** - * Make sure that all processing elements have done this before - * continuing. - */ - NET_CHECK(MPI_Barrier(gpu_ib_comm_world)); -} - -void GPUIBBackend::teams_destroy() { - rocshmem_free(barrier_pSync_pool); - rocshmem_free(reduce_pSync_pool); - rocshmem_free(bcast_pSync_pool); - rocshmem_free(alltoall_pSync_pool); - rocshmem_free(pWrk_pool); - rocshmem_free(pAta_pool); - - free(pool_bitmask_); - free(reduced_bitmask_); -} - -void GPUIBBackend::rocshmem_collective_init() { - /* - * Allocate heap space for barrier_sync - */ - size_t one_sync_size_bytes{sizeof(*barrier_sync)}; - size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE}; - heap.malloc(reinterpret_cast(&barrier_sync), sync_size_bytes); - - /* - * Initialize the barrier synchronization array with default values. - */ - for (int i = 0; i < num_pes; i++) { - barrier_sync[i] = ROCSHMEM_SYNC_VALUE; - } - - /* - * Make sure that all processing elements have done this before - * continuing. - */ - NET_CHECK(MPI_Barrier(gpu_ib_comm_world)); -} - -} // namespace rocshmem diff --git a/src/gpu_ib/backend_ib.hpp b/src/gpu_ib/backend_ib.hpp deleted file mode 100644 index 87b5dc85f1..0000000000 --- a/src/gpu_ib/backend_ib.hpp +++ /dev/null @@ -1,351 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_ -#define LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_ - -#include "../backend_bc.hpp" -#include "../containers/free_list_impl.hpp" -#include "network_policy.hpp" -#include "../hdp_policy.hpp" -#include "../hdp_proxy.hpp" -#include "../memory/hip_allocator.hpp" - -namespace rocshmem { - -class HostInterface; - -/** - * @class GPUIBBackend backend.hpp - * @brief InfiniBand specific backend. - * - * The InfiniBand (GPUIB) backend enables the device to enqueue network - * requests to InfiniBand queues (with minimal host intervention). The setup - * requires some effort from the host, but the device is able to craft - * InfiniBand requests and send them on its own. - */ -class GPUIBBackend : public Backend { - public: - /** - * @copydoc Backend::Backend(unsigned) - */ - explicit GPUIBBackend(MPI_Comm comm); - - /** - * @copydoc Backend::~Backend() - */ - virtual ~GPUIBBackend(); - - /** - * @brief Abort the application. - * - * @param[in] status Exit code. - * - * @return void. - * - * @note This routine terminates the entire application. - */ - void global_exit(int status) override; - - /** - * @copydoc Backend::create_new_team - */ - void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent, - TeamInfo *team_info_wrt_world, int num_pes, - int my_pe_in_new_team, MPI_Comm team_comm, - rocshmem_team_t *new_team) override; - - /** - * @copydoc Backend::team_destroy(rocshmem_team_t) - */ - void team_destroy(rocshmem_team_t team) override; - - /** - * @copydoc Backend::ctx_create - */ - void ctx_create(int64_t options, void **ctx) override; - - __device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx); - - /** - * @copydoc Backend::ctx_destroy - */ - void ctx_destroy(Context *ctx) override; - - /** - * @copydoc Backend::ctx_destroy - */ - __device__ void destroy_ctx(rocshmem_ctx_t *ctx); - - protected: - /** - * @copydoc Backend::dump_backend_stats() - */ - void dump_backend_stats() override; - - /** - * @copydoc Backend::reset_backend_stats() - */ - void reset_backend_stats() override; - - /** - * @brief spawn a new thread to perform the rest of initialization - */ - std::thread thread_spawn(GPUIBBackend *b); - - /** - * @brief overheads for helper thread to run - * - * @param[in] the thread needs access to the class - * - * @return void - */ - void thread_func_internal(GPUIBBackend *b); - - /** - * @brief initialize MPI. - * - * GPUIB relies on MPI just to exchange the connection information. - * - * todo: remove the dependency on MPI and make it generic to PMI-X or just - * to OpenSHMEM to have support for both CPU and GPU - */ - void init_mpi_once(MPI_Comm comm); - - /** - * @brief init the network support - */ - void initialize_network(); - - /** - * @brief Invokes the IPC policy class initialization method. - * - * This method delegates Inter Process Communication (IPC) - * initialization to the appropriate policy class. The initialization - * needs to be exposed to the Backed due to initialization ordering - * constraints. (The symmetric heaps needs to be allocated and - * initialized before this method can be called.) - * - * The policy class encapsulates what the initialization process so - * refer to that class for more details. - */ - void initialize_ipc(); - - /** - * @brief Allocate and initialize the ROCSHMEM_CTX_DEFAULT variable. - * - * @todo The default_ctx member looks unused after it is copied into - * the ROCSHMEM_CTX_DEFAULT variable. - */ - void setup_default_ctx(); - void setup_ctxs(); - - /** - * @brief Allocate and initialize the default context for host - * operations. - */ - void setup_default_host_ctx(); - - /** - * @brief Allocate and initialize team world. - */ - void setup_team_world(); - - /** - * @brief Initialize the resources required to support teams - */ - void teams_init(); - - /** - * @brief Destruct the resources required to support teams - */ - void teams_destroy(); - - /** - * @brief Allocate and initialize barrier operation addresses on - * symmetric heap. - * - * When this method completes, the barrier_sync member will be available - * for use. - */ - void rocshmem_collective_init(); - -#ifdef USE_HOST_SIDE_HDP_FLUSH - /** - * @brief A service thread routine that flushes the hdp cache on behalf of the - * GPU. - */ - void hdp_flush_poll(); - - /** - * @brief Workers used to poll on the device hdp flush request. - */ - std::thread hdp_flush_worker_thread{}; -#endif - - /** - * @brief Signals to the worker threads to exist - */ - std::atomic worker_thread_exit{false}; - - public: - /** - * @brief The host-facing interface that will be used - * by all contexts of the GPUIBBackend - */ - HostInterface *host_interface{nullptr}; - - /** - * @brief Handle for raw memory for barrier sync - */ - long *barrier_pSync_pool{nullptr}; - - /** - * @brief Handle for raw memory for reduce sync - */ - long *reduce_pSync_pool{nullptr}; - - /** - * @brief Handle for raw memory for broadcast sync - */ - long *bcast_pSync_pool{nullptr}; - - /** - * @brief Handle for raw memory for alltoall sync - */ - long *alltoall_pSync_pool{nullptr}; - - /** - * @brief Handle for raw memory for work - */ - void *pWrk_pool{nullptr}; - - /** - * @brief Handle for raw memory for alltoall - */ - void *pAta_pool{nullptr}; - - /** - * @brief rocSHMEM's copy of MPI_COMM_WORLD (for interoperability - * with orthogonal MPI usage in an MPI+rocSHMEM program). - */ - MPI_Comm gpu_ib_comm_world{}; - MPI_Comm backend_comm{}; - - /** - * @brief Holds number of blocks used in library - */ - size_t num_blocks_{1}; - - private: - /** - * @brief Allocates cacheable, device memory for the hdp policy. - * - * @note Internal data ownership is managed by the proxy - */ - HdpProxy hdp_proxy_{}; - - public: - /** - * @brief Policy choice for two HDP implementations. - * - * @todo Combine HDP related stuff together into a class with a - * reasonable interface. The functionality does not need to exist in - * multiple pieces in the Backend and QueuePair classes. The hdp_rkey, - * hdp_addresses, and hdp_policy fields should all live in the class. - */ - HdpPolicy *hdp_policy{hdp_proxy_.get()}; - - /** - * @brief Scratchpad for the internal barrier algorithms. - */ - int64_t *barrier_sync{nullptr}; - - /** - * @brief Compile-time configuration policy for network (IB) - * - * - * The configuration option "USE_SINGLE_NODE" can be enabled to not build - * with network support. - */ - NetworkImpl networkImpl{}; - - private: - /** - * @brief An array of @ref ROContexts that backs the context FreeList. - */ - GPUIBContext *ctx_array{nullptr}; - - /** - * @brief A free-list containing contexts. - */ - FreeListProxy ctx_free_list{}; - - /** - * @brief Holds maximum number of contexts used in library - */ - size_t maximum_num_contexts_{1024}; - - /** - * @brief The bitmask representing the availability of teams in the pool - */ - char *pool_bitmask_{nullptr}; - - /** - * @brief Bitmask to store the reduced result of bitmasks on pariticipating - * PEs - * - * With no thread-safety for this bitmask, multithreaded creation of teams is - * not supported. - */ - char *reduced_bitmask_{nullptr}; - - /** - * @brief Size of the bitmask - */ - int bitmask_size_{-1}; - - /** - * @brief a helper thread to perform the initialization (non-blocking init) - */ - std::thread async_thread_{}; - - /** - * @brief Holds a copy of the default context (see OpenSHMEM - * specification). - * - * @todo Remove this member from the backend class. There is another - * copy stored in ROCSHMEM_CTX_DEFAULT. - */ - GPUIBContext *default_ctx_{nullptr}; - - /** - * @brief Holds a copy of the default context for host functions - */ - GPUIBHostContext *default_host_ctx_{nullptr}; - - unsigned int* hdp_gpu_cpu_flush_flag_; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_ diff --git a/src/gpu_ib/connect_tests.cpp b/src/gpu_ib/connect_tests.cpp deleted file mode 100644 index 0f2cc67686..0000000000 --- a/src/gpu_ib/connect_tests.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include -#include - -#include "dynamic_connection.hpp" -#include "reliable_connection.hpp" - -namespace rocshmem { - -// test with different use_ib_hca -// test with different heap size -// test with different sleep -// test with different sq_size - -TEST(DynamicConnect, ToNothing) { - DynamicConnection connect; - connect.construct_init(1); -} - -// test with different num_dcis -// test with different num_dcts - -TEST(ReliableConnect, ToNothing) { - ReliableConnection connect; - connect.construct_init(1); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - -} // namespace rocshmem diff --git a/src/gpu_ib/connection.cpp b/src/gpu_ib/connection.cpp deleted file mode 100644 index 31f8eda4d3..0000000000 --- a/src/gpu_ib/connection.cpp +++ /dev/null @@ -1,431 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "connection.hpp" - -#include - -#include // NOLINT(build/c++11) -#include - -#include "backend_ib.hpp" -#include "queue_pair.hpp" -#include "../util.hpp" - -namespace rocshmem { - -int Connection::use_gpu_mem = 0; -int Connection::coherent_cq = 0; - -Connection::Connection(GPUIBBackend* b, int k) : backend(b), key_offset(k) { - char* value = nullptr; - - if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) { - requested_dev = value; - } - - if ((value = getenv("ROCSHMEM_SQ_SIZE"))) { - sq_size = atoi(value); - } - - if ((value = getenv("ROCSHMEM_USE_CQ_GPU_MEM")) != nullptr) { - cq_use_gpu_mem = atoi(value); - } - - if ((value = getenv("ROCSHMEM_USE_SQ_GPU_MEM")) != nullptr) { - sq_use_gpu_mem = atoi(value); - } -} - -Connection::~Connection() { delete ib_state; } - -void Connection::reg_mr(void* ptr, size_t size, ibv_mr** mr, bool managed) { - int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | - IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; - if (managed) { - access |= IBV_ACCESS_ON_DEMAND; - } - - *mr = ibv_reg_mr(ib_state->pd, ptr, size, access); - - if (*mr == nullptr) { - abort(); - } -} - -unsigned Connection::total_number_connections() { - int connections; - get_remote_conn(&connections); - return backend->num_blocks_ * connections; -} - -void Connection::initialize(int num_block) { - allocate_dynamic_members(num_block); - - int ib_devices{0}; - dev_list = ibv_get_device_list(&ib_devices); - if (dev_list == nullptr) { - abort(); - } - - struct ibv_device* ib_dev = dev_list[0]; - if (requested_dev != nullptr) { - for (int i = 0; i < ib_devices; i++) { - const char* select_dev = ibv_get_device_name(dev_list[i]); - if (strstr(select_dev, requested_dev) != nullptr) { - ib_dev = dev_list[i]; - break; - } - } - } - - uint8_t port = 1; - ib_init(ib_dev, port); - - int hip_dev_id = 0; - CHECK_HIP(hipGetDevice(&hip_dev_id)); - - int ib_fork_err = ibv_fork_init(); - if (ib_fork_err != 0) printf("error: ibv_fork_init failed \n"); - - sq_post_dv = static_cast( - malloc(sizeof(sq_post_dv_t) * total_number_connections())); - - if (sq_post_dv == nullptr) { - abort(); - } - - create_qps(port, backend->my_pe, &ib_state->portinfo); - initialize_1(port, num_block); - - MPI_Barrier(backend->thread_comm); - free_dynamic_members(); -} - -void Connection::finalize() { - ibv_free_device_list(dev_list); - - int ret = ibv_dereg_mr(backend->networkImpl.heap_mr); - if (ret) { - abort(); - } - // comment until rocm 4.5 - // ibv_dereg_mr(backend->networkImpl.hdp_mr); - ibv_dereg_mr(backend->networkImpl.mr); -} - -void Connection::ib_init(struct ibv_device* ib_dev, uint8_t port) { - ib_state = new ib_state_t; - if (!ib_state) { - abort(); - } - - ib_state->context = ibv_open_device(ib_dev); - if (!ib_state->context) { - delete ib_state; - abort(); - } - - ib_state->pd = ibv_alloc_pd(ib_state->context); - if (!ib_state->pd) { - delete ib_state; - abort(); - } - - ibv_parent_domain_init_attr pattr; - init_parent_domain_attr(&pattr); - ib_state->pd = ibv_alloc_parent_domain(ib_state->context, &pattr); - - ibv_query_port(ib_state->context, port, &ib_state->portinfo); -} - -template -void Connection::try_to_modify_qp(ibv_qp* qp, StateType state) { - ibv_modify_qp(qp, &state.exp_qp_attr, state.exp_attr_mask); -} - -void Connection::init_qp_status(ibv_qp* qp, uint8_t port) { - try_to_modify_qp(qp, initqp(port)); -} - -/** - * rtr stands for 'ready to receive' - */ -void Connection::change_status_rtr(ibv_qp* qp, dest_info_t* dest, - uint8_t port) { - try_to_modify_qp(qp, rtr(dest, port)); -} - -/** - * rts stands for 'ready to send' - */ -void Connection::change_status_rts(ibv_qp* qp, dest_info_t* dest) { - try_to_modify_qp(qp, rts(dest)); -} - -void Connection::create_qps(uint8_t port, int my_rank, - ibv_port_attr* ib_port_att) { - create_qps_1(); - - ibv_qp_cap cap{}; - cap.max_send_wr = sq_size; - cap.max_send_sge = 1; - cap.max_inline_data = 4; - - QPInitAttr qp_init_attr = qpattr(cap); - - size_t qp_size = total_number_connections(); - cqs.resize(qp_size); - qps.resize(qp_size); - - int cqe = qp_init_attr.attr.cap.max_send_wr; - for (auto& entry : cqs) { - entry = create_cq(ib_state->context, ib_state->pd, cqe); - if (!entry) { - abort(); - } - } - - create_qps_2(port, my_rank, ib_port_att); - - for (int i = 0; i < qps.size(); i++) { - qps[i] = - create_qp(ib_state->pd, ib_state->context, &qp_init_attr.attr, cqs[i]); - if (!qps[i]) { - abort(); - } - - create_qps_3(port, qps[i], i, ib_port_att); - } -} - -void Connection::initialize_gpu_policy(ConnectionImpl** conn, - uint32_t* heap_rkey) { - CHECK_HIP(hipMalloc(reinterpret_cast(conn), sizeof(ConnectionImpl))); - new (*conn) ConnectionImpl(this, heap_rkey); -} - -/* - * Create and write the rdma segment to the SQ - */ -void Connection::set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address, - uint32_t rkey) { - rdma->raddr = htobe64(address); - rdma->rkey = htobe32(rkey); -} - -/* - * Retrieve the address of a SQ. - * We used this address to write the WQE directly to the SQ. - */ -uint64_t* Connection::get_address_sq(int i) { - mlx5dv_obj mlx_obj; - mlx5dv_qp qp_out; - - mlx_obj.qp.in = qps[i]; - mlx_obj.qp.out = &qp_out; - - mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP); - - return reinterpret_cast(qp_out.sq.buf); -} - -void* Connection::buf_alloc([[maybe_unused]] struct ibv_pd* pd, - [[maybe_unused]] void* pd_context, size_t size, - [[maybe_unused]] size_t alignment, - [[maybe_unused]] uint64_t resource_type) { - if (use_gpu_mem) { - void* dev_ptr; - if (coherent_cq == 1) { -#if defined USE_COHERENT_HEAP - CHECK_HIP(hipMalloc(reinterpret_cast(&dev_ptr), size)); -#else - #ifdef HIP_SUPPORTS_MALLOC_UNCACHED - CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast(&dev_ptr), size, - hipDeviceMallocUncached)); - #else - CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast(&dev_ptr), size, - hipDeviceMallocFinegrained)); - #endif -#endif - } else { -#ifdef HIP_SUPPORTS_MALLOC_UNCACHED - CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast(&dev_ptr), size, - hipDeviceMallocUncached)); -#else - CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast(&dev_ptr), size, - hipDeviceMallocFinegrained)); -#endif - - } - memset(dev_ptr, 0, size); - return dev_ptr; - } - return IBV_ALLOCATOR_USE_DEFAULT; -} - -void Connection::buf_release([[maybe_unused]] struct ibv_pd* pd, - [[maybe_unused]] void* pd_context, void* ptr, - [[maybe_unused]] uint64_t resource_type) { - if (use_gpu_mem) { - CHECK_HIP(hipFree(ptr)); - } else { - free(ptr); - } -} - -void Connection::init_parent_domain_attr(ibv_parent_domain_init_attr* attr1) { - attr1->pd = ib_state->pd; - attr1->td = nullptr; - attr1->comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS; - attr1->alloc = Connection::buf_alloc; - attr1->free = Connection::buf_release; - attr1->pd_context = nullptr; -} - -ibv_cq* Connection::create_cq(ibv_context* context, ibv_pd* pd, int cqe) { - use_gpu_mem = cq_use_gpu_mem; - - ibv_cq_init_attr_ex cq_attr; - memset(&cq_attr, 0, sizeof(ibv_cq_init_attr_ex)); - cq_attr.cqe = cqe; - cq_attr.cq_context = nullptr; - cq_attr.channel = nullptr; - cq_attr.comp_vector = 0; - cq_attr.flags = 0; // see ibv_exp_cq_create_flags - cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_PD; - cq_attr.parent_domain = pd; - - coherent_cq = 1; - ibv_cq_ex* cq = ibv_create_cq_ex(context, &cq_attr); - coherent_cq = 0; - if (!cq) { - printf("error in ibv_create_cq_ex: %d %s\n", errno, strerror(errno)); - return nullptr; - } - return ibv_cq_ex_to_cq(cq); -} - -void Connection::init_gpu_qp_from_connection(QueuePair* gpu_qp, - int conn_num) { - int hip_dev_id = 0; - CHECK_HIP(hipGetDevice(&hip_dev_id)); - use_gpu_mem = cq_use_gpu_mem; - - mlx5dv_cq cq_out; - mlx5dv_obj mlx_obj; - mlx_obj.cq.in = cqs[conn_num]; - mlx_obj.cq.out = &cq_out; - - mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_CQ); - gpu_qp->cq_log_size = log2(cq_out.cqe_cnt); - gpu_qp->cq_size = cq_out.cqe_cnt; - - void* gpu_ptr = nullptr; - if (use_gpu_mem) { - gpu_qp->current_cq_q = reinterpret_cast(cq_out.buf); - } else { - rocm_memory_lock_to_fine_grain(reinterpret_cast(cq_out.buf), - cq_out.cqe_cnt * 64, &gpu_ptr, hip_dev_id); - gpu_qp->current_cq_q = reinterpret_cast(gpu_ptr); - } - gpu_qp->current_cq_q_H = reinterpret_cast(cq_out.buf); - - rocm_memory_lock_to_fine_grain(reinterpret_cast(cq_out.dbrec), 64, - &gpu_ptr, hip_dev_id); - - gpu_qp->dbrec_cq = reinterpret_cast(gpu_ptr); - - use_gpu_mem = sq_use_gpu_mem; - - mlx5dv_qp qp_out; - mlx_obj.qp.in = qps[conn_num]; - mlx_obj.qp.out = &qp_out; - - mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP); - - gpu_qp->max_nwqe = (qp_out.sq.wqe_cnt); - - volatile uint32_t* dbrec_send = qp_out.dbrec + 1; - - if (use_gpu_mem) { - gpu_qp->current_sq = reinterpret_cast(qp_out.sq.buf); - gpu_qp->dbrec_send = reinterpret_cast(dbrec_send); - } else { - gpu_ptr = nullptr; - rocm_memory_lock_to_fine_grain(reinterpret_cast(qp_out.sq.buf), - qp_out.sq.wqe_cnt * 64, &gpu_ptr, - hip_dev_id); - - gpu_qp->current_sq = reinterpret_cast(gpu_ptr); - - rocm_memory_lock_to_fine_grain( - reinterpret_cast(const_cast(dbrec_send)), 32, - &gpu_ptr, hip_dev_id); - - gpu_qp->dbrec_send = reinterpret_cast(gpu_ptr); - } - - gpu_qp->current_sq_H = reinterpret_cast(qp_out.sq.buf); - - gpu_qp->setDBval(*(reinterpret_cast(qp_out.sq.buf))); - - rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, - hip_dev_id); - - gpu_qp->db.ptr = reinterpret_cast(gpu_ptr); - - uint32_t* sq = reinterpret_cast(qp_out.sq.buf); - uint32_t ctrl_qp_sq = (reinterpret_cast(sq))[1]; - gpu_qp->ctrl_qp_sq = ctrl_qp_sq & 0xFFFFFF; - gpu_qp->ctrl_sig = (reinterpret_cast(sq))[1]; - gpu_qp->rkey = (reinterpret_cast(sq))[6 + key_offset]; - gpu_qp->lkey = (reinterpret_cast(sq))[9 + key_offset]; -} - -ibv_qp* Connection::create_qp(ibv_pd* pd, ibv_context* context, - ibv_qp_init_attr_ex* qp_attr, ibv_cq* cq) { - use_gpu_mem = sq_use_gpu_mem; - - ibv_qp* qp = nullptr; - - assert(pd); - assert(context); - assert(qp_attr); - - qp_attr->send_cq = cq; - qp_attr->recv_cq = cq; - qp_attr->pd = pd; - - qp_attr->comp_mask = IBV_QP_INIT_ATTR_PD; - - qp = create_qp_0(context, qp_attr); - - if (!qp) { - printf("***** error ibv_create_qp failed %d m %m \n", errno, errno); - ibv_destroy_cq(cq); - } - - return qp; -} - -} // namespace rocshmem diff --git a/src/gpu_ib/connection.hpp b/src/gpu_ib/connection.hpp deleted file mode 100644 index bdcf9eaeba..0000000000 --- a/src/gpu_ib/connection.hpp +++ /dev/null @@ -1,259 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_HPP_ -#define LIBRARY_SRC_GPU_IB_CONNECTION_HPP_ - -#include - -extern "C" { -#include -} - -#include - -#include "rocshmem/rocshmem.hpp" -#include "connection_policy.hpp" - -namespace rocshmem { - -class GPUIBBackend; -class QueuePair; - -class Connection { - protected: - typedef struct ib_state { - struct ibv_context* context; - struct ibv_pd* pd; - struct ibv_mr* mr; - struct ibv_port_attr portinfo; - } ib_state_t; - - typedef struct dest_info { - int lid; - int qpn; - int psn; - union ibv_gid gid; - } dest_info_t; - - typedef struct heap_info { - void* base_heap; - uint32_t rkey; - } heap_info_t; - - struct sq_post_dv_t { - uint64_t segments[16]; - uint32_t current_sq; - uint16_t wqe_idx; - }; - - class State { - public: - ibv_qp_attr exp_qp_attr{}; - uint64_t exp_attr_mask{}; - }; - - class InitQPState : public State { - public: - InitQPState() { - exp_qp_attr.qp_state = IBV_QPS_INIT; - exp_qp_attr.qp_access_flags = - IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; - - exp_attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT; - } - }; - - class RtrState : public State { - public: - RtrState() { - exp_qp_attr.qp_state = IBV_QPS_RTR; - exp_qp_attr.path_mtu = IBV_MTU_4096; - exp_qp_attr.ah_attr.sl = 1; - exp_qp_attr.max_dest_rd_atomic = 1; - exp_qp_attr.min_rnr_timer = 12; - - exp_attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU; - } - }; - - class RtsState : public State { - public: - RtsState() { - exp_qp_attr.qp_state = IBV_QPS_RTS; - exp_qp_attr.timeout = 14; - exp_qp_attr.retry_cnt = 7; - exp_qp_attr.rnr_retry = 7; - exp_qp_attr.max_rd_atomic = 1; - - exp_attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | - IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC; - } - }; - - class QPInitAttr { - public: - explicit QPInitAttr(ibv_qp_cap cap) { - attr.cap = cap; - attr.sq_sig_all = 0; - } - ibv_qp_init_attr_ex attr{}; - }; - - public: - Connection(GPUIBBackend* backend, int key_offset); - - virtual ~Connection(); - - void initialize(int num_block); - - void finalize(); - - virtual void post_wqes() = 0; - - void reg_mr(void* ptr, size_t size, ibv_mr** mr, bool is_managed); - - virtual void get_remote_conn(int* remote_conn) = 0; - - unsigned total_number_connections(); - - virtual void initialize_rkey_handle(uint32_t** heap_rkey_handle, - ibv_mr* mr) = 0; - - virtual void free_rkey_handle(uint32_t* heap_rkey_handle) = 0; - - void initialize_gpu_policy(ConnectionImpl** conn, uint32_t* heap_rkey); - - /* - * Populate a QueuePair for use on the GPU from the internal IB state. - */ - void init_gpu_qp_from_connection(QueuePair* qp, int conn_num); - - protected: - Connection() = default; - - virtual InitQPState initqp(uint8_t port) = 0; - - virtual RtrState rtr(dest_info_t* dest, uint8_t port) = 0; - - virtual RtsState rts(dest_info_t* dest) = 0; - - virtual QPInitAttr qpattr(ibv_qp_cap cap) = 0; - - void init_qp_status(ibv_qp* qp, uint8_t port); - - void change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port); - - void change_status_rts(ibv_qp* qp, dest_info_t* dest); - - void create_qps(uint8_t port, int my_rank, ibv_port_attr* ib_port_att); - - template - void try_to_modify_qp(ibv_qp* qp, T state); - - virtual void create_qps_1() = 0; - - virtual void create_qps_2(int port, int my_rank, - ibv_port_attr* ib_port_att) = 0; - - virtual void create_qps_3(int port, ibv_qp* qp, int offset, - ibv_port_attr* ib_port_att) = 0; - - virtual ibv_qp* create_qp_0(ibv_context* context, - ibv_qp_init_attr_ex* qp_attr) = 0; - - virtual void allocate_dynamic_members(int num_block) = 0; - - virtual void free_dynamic_members() = 0; - - virtual void initialize_1(int port, int num_block) = 0; - - virtual void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, - int dc_key) = 0; - - virtual int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) = 0; - - void set_sq_dv(int num_block, int wg_idx, int pe_idx); - - /* - * ibv interface functions must be static. - */ - static void* buf_alloc(ibv_pd* pd, void* pd_context, size_t size, - size_t alignment, uint64_t resource_type); - - static void buf_release(ibv_pd* pd, void* pd_context, void* ptr, - uint64_t resource_type); - - void init_parent_domain_attr(ibv_parent_domain_init_attr* attr); - - void set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address, uint32_t rkey); - - uint64_t* get_address_sq(int i); - - ibv_cq* create_cq(ibv_context* context, ibv_pd* pd, int cqe); - - ibv_qp* create_qp(ibv_pd* pd, ibv_context* context, - ibv_qp_init_attr_ex* qp_attr, ibv_cq* rcq); - - /* - * TODO: Remove this eventually. Goal is to have backend delegate - * connection stuff to this class, while this class knows nothing about - * GPUs or backends. - */ - GPUIBBackend* backend{nullptr}; - - uint32_t sq_size{1024}; - - ib_state_t* ib_state{nullptr}; - - const int key_offset{0}; - - sq_post_dv_t* sq_post_dv{nullptr}; - - std::vector cqs; - - std::vector qps; - - uint64_t counter_wqe{0}; - - static int use_gpu_mem; - - static int coherent_cq; - - int cq_use_gpu_mem{1}; - - int sq_use_gpu_mem{1}; - - private: - void init_shmem_handle(); - - void ib_init(ibv_device* ib_dev, uint8_t port); - - char* requested_dev{nullptr}; - - ibv_device** dev_list{nullptr}; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_CONNECTION_HPP_ diff --git a/src/gpu_ib/connection_policy.cpp b/src/gpu_ib/connection_policy.cpp deleted file mode 100644 index 5ccf91a0ea..0000000000 --- a/src/gpu_ib/connection_policy.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "connection_policy.hpp" - -#include - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "dynamic_connection.hpp" -#include "queue_pair.hpp" - -#ifdef DEBUG -#define HIP_ENABLE_PRINTF 1 -#endif - -namespace rocshmem { - -RCConnectionImpl::RCConnectionImpl([[maybe_unused]] Connection* conn, - [[maybe_unused]] uint32_t* _vec_rkey) {} - -DCConnectionImpl::DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey) - : vec_dct_num(static_cast(conn)->get_vec_dct_num()), - vec_rkey(_vec_rkey), - vec_lids(static_cast(conn)->get_vec_lids()) {} - -__device__ uint32_t RCConnectionImpl::getNumWqesImpl([ - [maybe_unused]] uint8_t opcode) { - return 1; -} - -__device__ uint32_t DCConnectionImpl::getNumWqesImpl(uint8_t opcode) { - // FIXME: We assume all threads in wave are performing ATOMIC ops. - // While this might be common, we do not have such restriction - // so need to be fixed. - // Since OFED 5.2, a DC segments uses 48bytes - so with or without - // atomic we need 2 wqes. - // return 2; - return (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS) - ? 2 - : 1; -} - -__device__ bool RCConnectionImpl::updateConnectionSegmentImpl( - [[maybe_unused]] ib_mlx5_base_av_t* wqe, [[maybe_unused]] int pe) { - return false; -} - -__device__ bool DCConnectionImpl::updateConnectionSegmentImpl( - ib_mlx5_base_av_t* wqe, int pe) { - wqe->dqp_dct = vec_dct_num[pe]; - wqe->rlid = vec_lids[pe]; - return true; -} - -__device__ void RCConnectionImpl::setRkeyImpl([[maybe_unused]] uint32_t* rkey, - [[maybe_unused]] int pe) {} - -__device__ void DCConnectionImpl::setRkeyImpl(uint32_t* rkey, int pe) { - *rkey = vec_rkey[pe]; -} - -} // namespace rocshmem diff --git a/src/gpu_ib/connection_policy.hpp b/src/gpu_ib/connection_policy.hpp deleted file mode 100644 index 76bb5db1cf..0000000000 --- a/src/gpu_ib/connection_policy.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_ -#define LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_ - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "infiniband_structs.hpp" - -namespace rocshmem { - -/* - * CRTP base class for connection type - */ -template -class ConnectionBase { - public: - /* - * Control segment WQE offset imposed by this connection type. - */ - __device__ int wqeCntrlOffset() { - return static_cast(this)->wqeCntrlOffsetImpl(); - } - - /* - * Whether or not we need to force PE-level divergence when posting for - * this connection type. - */ - __device__ bool forcePostDivergence() { - return static_cast(this)->forcePostDivergenceImpl(); - } - - /* - * Number of WQEs produced by this connection type for the given opcode. - */ - __device__ uint32_t getNumWqes(uint8_t opcode) { - return static_cast(this)->getNumWqesImpl(opcode); - } - - /* - * Updates the connection-specific segment in the SQ. - */ - __device__ bool updateConnectionSegment(ib_mlx5_base_av_t* wqe, int pe) { - return static_cast(this)->updateConnectionSegmentImpl(wqe, pe); - } - - /* - * Set the rkey based on this connection type. - */ - __device__ void setRkey(uint32_t* rkey, int pe) { - static_cast(this)->setRkeyImpl(rkey, pe); - } -}; - -class Connection; - -/* - * Connection policy corresponding to an RC connection type. - */ -class RCConnectionImpl : public ConnectionBase { - public: - RCConnectionImpl(Connection* conn, uint32_t* _vec_rkey); - - __device__ int wqeCntrlOffsetImpl() { return 0; } - - __device__ bool forcePostDivergenceImpl() { return true; } - - __device__ uint32_t getNumWqesImpl(uint8_t opcode); - - __device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe); - - __device__ void setRkeyImpl(uint32_t* rkey, int pe); -}; - -/* - * Connection policy corresponding to a DC connection type. - */ -class DCConnectionImpl : public ConnectionBase { - public: - DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey); - - __device__ int wqeCntrlOffsetImpl() { return 1; } - - __device__ bool forcePostDivergenceImpl() { return false; } - - __device__ uint32_t getNumWqesImpl(uint8_t opcode); - - __device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe); - - __device__ void setRkeyImpl(uint32_t* rkey, int pe); - - private: - uint32_t* vec_dct_num{nullptr}; - - uint32_t* vec_rkey{nullptr}; - - uint16_t* vec_lids{nullptr}; -}; - -/* - * Select which one of our connection policies to use at compile time. - */ -#ifdef USE_DC -typedef DCConnectionImpl ConnectionImpl; -#else -typedef RCConnectionImpl ConnectionImpl; -#endif - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_ diff --git a/src/gpu_ib/context_ib_device.cpp b/src/gpu_ib/context_ib_device.cpp deleted file mode 100644 index c3e8146736..0000000000 --- a/src/gpu_ib/context_ib_device.cpp +++ /dev/null @@ -1,371 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "context_ib_device.hpp" - -#include - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "rocshmem/rocshmem.hpp" -#include "../backend_type.hpp" -#include "../context_incl.hpp" -#include "backend_ib.hpp" -#include "queue_pair.hpp" - -namespace rocshmem { - -__host__ GPUIBContext::GPUIBContext(Backend *backend, bool option, int idx) - : Context(backend, option) { - GPUIBBackend *b{static_cast(backend)}; - ctx_idx = idx; - networkImpl = b->networkImpl; - base_heap = b->heap.get_heap_bases().data(); - networkImpl.networkHostInit(this, idx); - - barrier_sync = b->barrier_sync; - ipcImpl_.ipc_bases = b->ipcImpl.ipc_bases; - ipcImpl_.shm_size = b->ipcImpl.shm_size; -} - -__device__ void GPUIBContext::ctx_create() { - /* Nothing to do in the GPU_IB backend */ - return; -} - -/* - * TODO(bpotter): these will go in a policy class based on DC/RC. - * I am not completely sure at this point what else is needed in said class, - * so just leave them up here for now. - */ -__device__ __host__ QueuePair *GPUIBContext::getQueuePair(int pe) { - return networkImpl.getQueuePair(device_qp_proxy, pe); -} - -__device__ __host__ int GPUIBContext::getNumQueuePairs() { - return networkImpl.getNumQueuePairs(); -} - -__device__ __host__ int GPUIBContext::getNumDest() { - return networkImpl.getNumDest(); -} - -__device__ void GPUIBContext::fence() { -#ifdef USE_SINGLE_NODE - threadfence_system(); -#else - - for (int k = 0; k < getNumDest(); k++) { - getQueuePair(k)->fence(k); - } - - fence_.flush(); -#endif -} - -__device__ void GPUIBContext::fence(int pe) { -#ifdef USE_SINGLE_NODE - threadfence_system(); -#else - getQueuePair(pe)->fence(pe); - fence_.flush(); -#endif -} - -__device__ void GPUIBContext::putmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { - uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe]; - - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset, - const_cast(source), nelems); - } else { - bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems); - if (!must_send_message) { - return; - } - - auto *qp = getQueuePair(pe); - qp->put_nbi(base_heap[pe] + L_offset, source, nelems, pe, true); - } -} - -__device__ void GPUIBContext::getmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { - const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = const_cast(src_typed) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems); - } else { - bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems); - if (!must_send_message) { - return; - } - - auto *qp = getQueuePair(pe); - qp->get_nbi(base_heap[pe] + L_offset, dest, nelems, pe, true); - } -} - -__device__ void GPUIBContext::quiet() { -#ifdef USE_SINGLE_NODE - threadfence_system(); - for (int pe = 0; pe < ipcImpl_.shm_size; pe++) { - if (pe != my_pe) { - ipcImpl_.zero_byte_read(pe); - } - } -#else - - for (int k = 0; k < getNumDest(); k++) { - getQueuePair(k)->quiet_single_heavy(k); - } - fence_.flush(); - -#endif -} - -__device__ void *GPUIBContext::shmem_ptr(const void *dest, int pe) { - void *ret = nullptr; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - void *dst = const_cast(dest); - uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; - int local_pe = pe % ipcImpl_.shm_size; - ret = ipcImpl_.ipc_bases[local_pe] + L_offset; - } - return ret; -} - -__device__ void GPUIBContext::threadfence_system() { - int thread_id = get_flat_block_id(); - - if (thread_id % WF_SIZE == lowerID()) { -#ifdef USE_SINGLE_NODE - // Flush current PE HDP - HdpPolicy::hdp_flush( - reinterpret_cast(networkImpl.hdp_address)); - - // Flush the rest of the HDPs - for (int pe = 0; pe < ipcImpl_.shm_size; pe++) { - auto target_address = networkImpl.hdp_address; - const int value = HdpPolicy::HDP_FLUSH_VAL; - if (pe != my_pe) { - const int value = HdpPolicy::HDP_FLUSH_VAL; - auto mapped_address = - shmem_ptr(reinterpret_cast(target_address), pe); - __hip_atomic_store(static_cast(mapped_address), value, - __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); - } - } -#else - getQueuePair(my_pe)->hdp_policy->flushCoherency(); -#endif - } - - __threadfence_system(); -} - -__device__ void GPUIBContext::getmem(void *dest, const void *source, - size_t nelems, int pe) { - const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = const_cast(src_typed) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems); - } else { - bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems); - if (!must_send_message) { - return; - } - auto *qp = getQueuePair(pe); - qp->get_nbi_cqe(base_heap[pe] + L_offset, dest, nelems, pe, true); - qp->quiet_single(); - } - fence_.flush(); -} - -__device__ void GPUIBContext::putmem(void *dest, const void *source, - size_t nelems, int pe) { - uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset, - const_cast(source), nelems); - - threadfence_system(); - ipcImpl_.zero_byte_read(pe); - } else { - bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems); - if (!must_send_message) { - return; - } - auto *qp = getQueuePair(pe); - qp->put_nbi_cqe(base_heap[pe] + L_offset, source, nelems, pe, true); - qp->quiet_single(); - } - fence_.flush(); -} - -/****************************************************************************** - ************************ WORKGROUP/WAVE-LEVEL RMA API ************************ - *****************************************************************************/ -__device__ void GPUIBContext::putmem_nbi_wg(void *dest, const void *source, - size_t nelems, int pe) { - uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset, - const_cast(source), nelems); - } else { - if (is_thread_zero_in_block()) { - auto *qp = getQueuePair(pe); - qp->put_nbi(base_heap[pe] + L_offset, source, nelems, pe, true); - } - } - __syncthreads(); -} - -__device__ void GPUIBContext::putmem_nbi_wave(void *dest, const void *source, - size_t nelems, int pe) { - uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset, - const_cast(source), nelems); - } else { - if (is_thread_zero_in_wave()) { - auto *qp = getQueuePair(pe); - qp->put_nbi(base_heap[pe] + L_offset, source, nelems, pe, true); - } - } -} - -__device__ void GPUIBContext::putmem_wg(void *dest, const void *source, - size_t nelems, int pe) { - uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset, - const_cast(source), nelems); - __syncthreads(); - threadfence_system(); - ipcImpl_.zero_byte_read(pe); - } else { - auto *qp = getQueuePair(pe); - if (is_thread_zero_in_block()) { - qp->put_nbi_cqe(base_heap[pe] + L_offset, source, nelems, pe, true); - } - qp->quiet_single(); - } - __syncthreads(); - fence_.flush(); -} - -__device__ void GPUIBContext::putmem_wave(void *dest, const void *source, - size_t nelems, int pe) { - uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe]; - auto *qp = getQueuePair(pe); - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset, - const_cast(source), nelems); - threadfence_system(); - ipcImpl_.zero_byte_read(pe); - } else { - if (is_thread_zero_in_wave()) { - qp->put_nbi_cqe(base_heap[pe] + L_offset, source, nelems, pe, true); - } - qp->quiet_single(); - } - fence_.flush(); -} - -__device__ void GPUIBContext::getmem_wg(void *dest, const void *source, - size_t nelems, int pe) { - const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = const_cast(src_typed) - base_heap[my_pe]; - auto *qp = getQueuePair(pe); - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems); - } else { - if (is_thread_zero_in_block()) { - qp->get_nbi_cqe(base_heap[pe] + L_offset, dest, nelems, pe, true); - } - qp->quiet_single(); - } - __syncthreads(); - fence_.flush(); -} - -__device__ void GPUIBContext::getmem_wave(void *dest, const void *source, - size_t nelems, int pe) { - const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = const_cast(src_typed) - base_heap[my_pe]; - auto *qp = getQueuePair(pe); - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, - nelems); - } else { - if (is_thread_zero_in_wave()) { - qp->get_nbi_cqe(base_heap[pe] + L_offset, dest, nelems, pe, true); - } - qp->quiet_single(); - } - fence_.flush(); -} - -__device__ void GPUIBContext::getmem_nbi_wg(void *dest, const void *source, - size_t nelems, int pe) { - const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = const_cast(src_typed) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems); - } else { - if (is_thread_zero_in_block()) { - auto *qp = getQueuePair(pe); - qp->get_nbi(base_heap[pe] + L_offset, dest, nelems, pe, true); - } - } - __syncthreads(); -} - -__device__ void GPUIBContext::getmem_nbi_wave(void *dest, const void *source, - size_t nelems, int pe) { - const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = const_cast(src_typed) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - int local_pe = pe % ipcImpl_.shm_size; - ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, - nelems); - } else { - if (is_thread_zero_in_wave()) { - auto *qp = getQueuePair(pe); - qp->get_nbi(base_heap[pe] + L_offset, dest, nelems, pe, true); - } - } -} - -} // namespace rocshmem diff --git a/src/gpu_ib/context_ib_device.hpp b/src/gpu_ib/context_ib_device.hpp deleted file mode 100644 index 9748b1dac3..0000000000 --- a/src/gpu_ib/context_ib_device.hpp +++ /dev/null @@ -1,304 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_ -#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_ - -#include "../context.hpp" -#include "memory_builder_policy.hpp" -#include "network_policy.hpp" - -namespace rocshmem { - -class QueuePair; - -class GPUIBContext : public Context { - public: - __host__ GPUIBContext(Backend *b, bool option, int idx); - - __device__ __host__ QueuePair *getQueuePair(int pe); - - __device__ __host__ int getNumQueuePairs(); - - __device__ __host__ int getNumDest(); - - __device__ __attribute__((noinline)) void threadfence_system(); - - __device__ void ctx_create(); - - __device__ void ctx_destroy(); - - __device__ void putmem(void *dest, const void *source, size_t nelems, int pe); - - __device__ void getmem(void *dest, const void *source, size_t nelems, int pe); - - __device__ void putmem_nbi(void *dest, const void *source, size_t nelems, - int pe); - - __device__ void getmem_nbi(void *dest, const void *source, size_t size, - int pe); - - __device__ void fence(); - - __device__ void fence(int pe); - - __device__ void quiet(); - - __device__ void *shmem_ptr(const void *dest, int pe); - - __device__ void barrier_all(); - - __device__ void sync_all(); - - __device__ void sync(rocshmem_team_t team); - - template - __device__ void amo_add(void *dst, T value, int pe); - - template - __device__ void amo_set(void *dst, T value, int pe); - - template - __device__ T amo_swap(void *dst, T value, int pe); - - template - __device__ T amo_fetch_and(void *dst, T value, int pe); - - template - __device__ void amo_and(void *dst, T value, int pe); - - template - __device__ T amo_fetch_or(void *dst, T value, int pe); - - template - __device__ void amo_or(void *dst, T value, int pe); - - template - __device__ T amo_fetch_xor(void *dst, T value, int pe); - - template - __device__ void amo_xor(void *dst, T value, int pe); - - template - __device__ void amo_cas(void *dst, T value, T cond, int pe); - - template - __device__ T amo_fetch_add(void *dst, T value, int pe); - - template - __device__ T amo_fetch_cas(void *dst, T value, T cond, int pe); - - template - __device__ void p(T *dest, T value, int pe); - - template - __device__ T g(const T *source, int pe); - - template - __device__ void to_all(T *dest, const T *source, int nreduce, int PE_start, - int logPE_stride, int PE_size, T *pWrk, - long *pSync); // NOLINT(runtime/int) - - template - __device__ void to_all(rocshmem_team_t team, T *dest, const T *source, - int nreduce); - - template - __device__ void put(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void put_nbi(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void get(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void broadcast(rocshmem_team_t team, T *dest, const T *source, - int nelems, int pe_root); - - template - __device__ void broadcast(T *dest, const T *source, int nelems, int pe_root, - int pe_start, int log_pe_stride, int pe_size, - long *p_sync); // NOLINT(runtime/int) - - template - __device__ void alltoall(rocshmem_team_t team, T *dest, const T *source, - int nelems); - - template - __device__ void alltoall_broadcast(rocshmem_team_t team, T *dest, - const T *source, int nelems); - - template - __device__ void alltoall_brucks(rocshmem_team_t team, T *dest, - const T *source, int nelems); - - template - __device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source, - int nelems); - - template - __device__ void alltoall_gcen2(rocshmem_team_t team, T *dest, - const T *source, int nelems); - - template - __device__ void fcollect(rocshmem_team_t team, T *dest, const T *source, - int nelems); - - template - __device__ void fcollect_broadcast(rocshmem_team_t team, T *dest, - const T *source, int nelems); - - template - __device__ void fcollect_brucks(rocshmem_team_t team, T *dest, - const T *source, int nelems); - - template - __device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source, - int nelems); - - template - __device__ void fcollect_gcen2(rocshmem_team_t team, T *dest, - const T *source, int nelems); - - __device__ void putmem_wg(void *dest, const void *source, size_t nelems, - int pe); - - __device__ void getmem_wg(void *dest, const void *source, size_t nelems, - int pe); - - __device__ void putmem_nbi_wg(void *dest, const void *source, size_t nelems, - int pe); - - __device__ void getmem_nbi_wg(void *dest, const void *source, size_t size, - int pe); - - __device__ void putmem_wave(void *dest, const void *source, size_t nelems, - int pe); - - __device__ void getmem_wave(void *dest, const void *source, size_t nelems, - int pe); - - __device__ void putmem_nbi_wave(void *dest, const void *source, size_t nelems, - int pe); - - __device__ void getmem_nbi_wave(void *dest, const void *source, size_t size, - int pe); - - template - __device__ void put_wg(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void put_nbi_wg(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void get_wg(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void get_nbi_wg(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void put_wave(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void put_nbi_wave(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void get_wave(T *dest, const T *source, size_t nelems, int pe); - - template - __device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe); - - private: - template - __device__ void internal_direct_allreduce( - T *dst, const T *src, int nelems, int PE_start, int logPE_stride, - int PE_size, T *pWrk, - long *pSync); // NOLINT(runtime/int) - - template - __device__ void internal_ring_allreduce(T *dst, const T *src, int nelems, - int PE_start, int logPE_stride, - int PE_size, T *pWrk, - long *pSync, // NOLINT(runtime/int) - int n_seg, int seg_size, - int chunk_size); - - template - __device__ void internal_put_broadcast(T *dst, const T *src, int nelems, - int pe_root, int PE_start, - int logPE_stride, int PE_size, - long *pSync); // NOLINT(runtime/int) - - template - __device__ void internal_get_broadcast(T *dst, const T *src, int nelems, - int pe_root, - long *pSync); // NOLINT(runtime/int) - - __device__ void internal_direct_barrier(int pe, int PE_start, int stride, - int n_pes, int64_t *pSync); - - __device__ void internal_atomic_barrier(int pe, int PE_start, int stride, - int n_pes, int64_t *pSync); - - __device__ void internal_sync(int pe, int PE_start, int stride, int PE_size, - int64_t *pSync); - - __device__ void quiet_single(int cq_num); - - public: - /* - * Collection of queue pairs that are currently checked out by this - * context from GPUIBBackend. - */ - // FIXME: keep it private and destroy in destructor for better - // encapsulation. - QueuePair *device_qp_proxy{nullptr}; - - /* - * Array of char * pointers corresponding to the heap base pointers VA for - * each PE that we can communicate with. - */ - char *const *base_heap{nullptr}; - - /* - * Buffer used to store the results of a *_g operation. These ops do not - * provide a destination buffer, so the runtime must manage one. - */ - char *g_ret{nullptr}; - - NetworkImpl networkImpl{}; - - /* - * Temporary scratchpad memory used by internal barrier algorithms. - */ - int64_t *barrier_sync{nullptr}; - - int ctx_idx; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_ diff --git a/src/gpu_ib/context_ib_device_coll.cpp b/src/gpu_ib/context_ib_device_coll.cpp deleted file mode 100644 index 61f1e97ca9..0000000000 --- a/src/gpu_ib/context_ib_device_coll.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "rocshmem/rocshmem.hpp" -#include "../context_incl.hpp" -#include "context_ib_tmpl_device.hpp" -#include "../util.hpp" - -namespace rocshmem { - -__device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start, - int stride, int n_pes, - int64_t *pSync) { - int64_t flag_val = 1; - if (pe == PE_start) { - // Go through all PE offsets (except current offset = 0) - // and wait until they all reach - for (size_t i = 1; i < n_pes; i++) { - wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val); - pSync[i] = ROCSHMEM_SYNC_VALUE; - } - threadfence_system(); - // Announce to other PEs that all have reached - for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) { - put_nbi(&pSync[0], &flag_val, 1, j); - } - - } else { - // Mark current PE offset as reached - size_t pe_offset = (pe - PE_start) / stride; - put_nbi(&pSync[pe_offset], &flag_val, 1, PE_start); - wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); - pSync[0] = ROCSHMEM_SYNC_VALUE; - threadfence_system(); - } -} - -__device__ void GPUIBContext::internal_atomic_barrier(int pe, int PE_start, - int stride, int n_pes, - int64_t *pSync) { - int64_t flag_val = 1; - if (pe == PE_start) { - wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1)); - pSync[0] = ROCSHMEM_SYNC_VALUE; - threadfence_system(); - for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) { - put_nbi(&pSync[0], &flag_val, 1, j); - } - } else { - amo_add(&pSync[0], flag_val, PE_start); - wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); - pSync[0] = ROCSHMEM_SYNC_VALUE; - threadfence_system(); - } -} - -// Uses PE values that are relative to world -__device__ void GPUIBContext::internal_sync(int pe, int PE_start, int stride, - int PE_size, int64_t *pSync) { - __syncthreads(); - if (is_thread_zero_in_block()) { - if (PE_size < 64) { - internal_direct_barrier(pe, PE_start, stride, PE_size, pSync); - } else { - internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync); - } - } - __threadfence(); - __syncthreads(); -} - -__device__ void GPUIBContext::sync(rocshmem_team_t team) { - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO: enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - - int pe = team_obj->my_pe_in_world; - int pe_start = team_obj->tinfo_wrt_world->pe_start; - int pe_stride = (1 << log_pe_stride); - int pe_size = team_obj->num_pes; - internal_sync(pe, pe_start, pe_stride, pe_size, barrier_sync); -} - -__device__ void GPUIBContext::sync_all() { - internal_sync(my_pe, 0, 1, num_pes, barrier_sync); -} - -__device__ void GPUIBContext::barrier_all() { - if (is_thread_zero_in_block()) { - quiet(); - } - sync_all(); - __syncthreads(); -} - -} // namespace rocshmem diff --git a/src/gpu_ib/context_ib_host.cpp b/src/gpu_ib/context_ib_host.cpp deleted file mode 100644 index e569d7e6ef..0000000000 --- a/src/gpu_ib/context_ib_host.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "context_ib_host.hpp" - -#include - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../backend_type.hpp" -#include "../context_incl.hpp" -#include "backend_ib.hpp" -#include "../host/host.hpp" - -namespace rocshmem { - -__host__ GPUIBHostContext::GPUIBHostContext(Backend *backend, - [[maybe_unused]] int64_t options) - : Context(backend, true) { - GPUIBBackend *b{static_cast(backend)}; - - host_interface = b->host_interface; - - context_window_info = host_interface->acquire_window_context(); -} - -__host__ GPUIBHostContext::~GPUIBHostContext() { - host_interface->release_window_context(context_window_info); -} - -__host__ void GPUIBHostContext::putmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { - host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info); -} - -__host__ void GPUIBHostContext::getmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { - host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info); -} - -__host__ void GPUIBHostContext::putmem(void *dest, const void *source, - size_t nelems, int pe) { - host_interface->putmem(dest, source, nelems, pe, context_window_info); -} - -__host__ void GPUIBHostContext::getmem(void *dest, const void *source, - size_t nelems, int pe) { - host_interface->getmem(dest, source, nelems, pe, context_window_info); -} - -__host__ void GPUIBHostContext::fence() { - host_interface->fence(context_window_info); -} - -__host__ void GPUIBHostContext::quiet() { - host_interface->quiet(context_window_info); -} - -__host__ void GPUIBHostContext::sync_all() { - host_interface->sync_all(context_window_info); -} - -__host__ void GPUIBHostContext::barrier_all() { - host_interface->barrier_all(context_window_info); -} - -} // namespace rocshmem diff --git a/src/gpu_ib/context_ib_host.hpp b/src/gpu_ib/context_ib_host.hpp deleted file mode 100644 index a38f18ce01..0000000000 --- a/src/gpu_ib/context_ib_host.hpp +++ /dev/null @@ -1,149 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_ -#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_ - -#include "../context.hpp" - -namespace rocshmem { - -class GPUIBHostContext : public Context { - public: - __host__ GPUIBHostContext(Backend *b, int64_t options); - - __host__ ~GPUIBHostContext(); - - template - __host__ void p(T *dest, T value, int pe); - - template - __host__ T g(const T *source, int pe); - - template - __host__ void put(T *dest, const T *source, size_t nelems, int pe); - - template - __host__ void get(T *dest, const T *source, size_t nelems, int pe); - - template - __host__ void put_nbi(T *dest, const T *source, size_t nelems, int pe); - - template - __host__ void get_nbi(T *dest, const T *source, size_t nelems, int pe); - - __host__ void putmem(void *dest, const void *source, size_t nelems, int pe); - - __host__ void getmem(void *dest, const void *source, size_t nelems, int pe); - - __host__ void putmem_nbi(void *dest, const void *source, size_t nelems, - int pe); - - __host__ void getmem_nbi(void *dest, const void *source, size_t size, int pe); - - template - __host__ void amo_add(void *dst, T value, int pe); - - template - __host__ void amo_cas(void *dst, T value, T cond, int pe); - - template - __host__ T amo_fetch_add(void *dst, T value, int pe); - - template - __host__ T amo_fetch_cas(void *dst, T value, T cond, int pe); - - __host__ void fence(); - - __host__ void quiet(); - - __host__ void barrier_all(); - - __host__ void sync_all(); - - template - __host__ void broadcast(T *dest, const T *source, int nelems, int pe_root, - int pe_start, int log_pe_stride, int pe_size, - long *p_sync); // NOLINT(runtime/int) - - template - __host__ void broadcast(rocshmem_team_t team, T *dest, const T *source, - int nelems, int pe_root); - - template - __host__ void to_all(T *dest, const T *source, int nreduce, int pe_start, - int log_pe_stride, int pe_size, T *p_wrk, - long *p_sync); // NOLINT(runtime/int) - - template - __host__ void to_all(rocshmem_team_t team, T *dest, const T *source, - int nreduce); - - template - __host__ void wait_until(T *ivars, int cmp, T val); - - template - __host__ size_t wait_until_any(T *ivars, size_t nelems, - const int *status, - int cmp, T val); - - template - __host__ void wait_until_all(T *ivars, size_t nelems, - const int *status, - int cmp, T val); - - template - __host__ size_t wait_until_some(T *ivars, size_t nelems, - size_t* indices, - const int *status, - int cmp, T val); - - template - __host__ void wait_until_all_vector(T *ivars, size_t nelems, - const int *status, - int cmp, T* vals); - - template - __host__ size_t wait_until_any_vector(T *ivars, size_t nelems, - const int *status, - int cmp, T* vals); - - template - __host__ size_t wait_until_some_vector(T *ivars, size_t nelems, - size_t* indices, - const int *status, - int cmp, T* vals); - - template - __host__ int test(T *ivars, int cmp, T val); - - public: - /* Pointer to the backend's host interface */ - HostInterface *host_interface{nullptr}; - - /* An MPI Window implements a context */ - WindowInfo *context_window_info{nullptr}; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_ diff --git a/src/gpu_ib/context_ib_tmpl_device.hpp b/src/gpu_ib/context_ib_tmpl_device.hpp deleted file mode 100644 index b29523c804..0000000000 --- a/src/gpu_ib/context_ib_tmpl_device.hpp +++ /dev/null @@ -1,1097 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_ -#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_ - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "rocshmem/rocshmem.hpp" -#include "context_ib_device.hpp" -#include "gpu_ib_team.hpp" -#include "queue_pair.hpp" -#include "../util.hpp" -#include "../rocshmem_calc.hpp" - -namespace rocshmem { - -template -__device__ void compute_reduce(T *src, T *dst, int size, int wg_id, - int wg_size) { - for (size_t i = wg_id; i < size; i += wg_size) { - OpWrap::Calc(src, dst, i); - } - __syncthreads(); -} - -template -__device__ void GPUIBContext::p(T *dest, T value, int pe) { - putmem_nbi(dest, &value, sizeof(T), pe); -} - -template -__device__ void GPUIBContext::internal_ring_allreduce( - T *dst, const T *src, int nelems, [[maybe_unused]] int PE_start, - [[maybe_unused]] int logPE_stride, [[maybe_unused]] int PE_size, T *pWrk, - long *pSync, // NOLINT(runtime/int) - int n_seg, int seg_size, int chunk_size) { - int off_seg, off_send, off_recv; - int send_pe = (my_pe + 1) % num_pes; - long wait_val; // NOLINT(runtime/int) - - int wg_size = get_flat_block_size(); - int wg_id = get_flat_block_id(); - - for (size_t i = wg_id; i < nelems; i += wg_size) { - dst[i] = src[i]; - } - __syncthreads(); - - for (size_t seg = 0; seg < n_seg; seg++) { - off_seg = seg * seg_size; - for (int round = 0; round < num_pes - 1; round++) { - off_send = (((my_pe + 1 - round + 2 * num_pes) % num_pes) * chunk_size); - off_recv = (((my_pe - round + 2 * num_pes) % num_pes) * chunk_size); - - putmem_nbi_wg(reinterpret_cast(&pWrk[off_send]), - reinterpret_cast(&dst[off_send + off_seg]), - chunk_size * sizeof(T), send_pe); - - if (is_thread_zero_in_block()) { - fence(); - - wait_val = seg + 100; - p(&pSync[round], wait_val, send_pe); - - wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val); - __threadfence(); - } - __syncthreads(); - compute_reduce(&pWrk[off_recv], &dst[off_seg + off_recv], - chunk_size, wg_id, wg_size); - } - for (size_t round = num_pes - 1; round < 2 * num_pes - 2; round++) { - int off_send2 = - (((my_pe + 1 - round + 2 * num_pes) % num_pes) * chunk_size); - putmem_nbi_wg(reinterpret_cast(&dst[off_send2 + off_seg]), - reinterpret_cast(&dst[off_send2 + off_seg]), - chunk_size * sizeof(T), send_pe); - - if (is_thread_zero_in_block()) { - fence(); - wait_val = seg + 100; - p(&pSync[round], wait_val, send_pe); - wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val); - } - __syncthreads(); - } - } - __syncthreads(); - for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) { - pSync[i] = ROCSHMEM_SYNC_VALUE; - } - __syncthreads(); -} - -template -__device__ void GPUIBContext::internal_direct_allreduce( - T *dst, const T *src, int nelems, int PE_start, int logPE_stride, - int PE_size, T *pWrk, - long *pSync) { // NOLINT(runtime/int) - int stride = 1 << logPE_stride; - int finish = PE_start + stride * PE_size; - int pe = my_pe; - - int wg_id = get_flat_block_id(); - int wg_size = get_flat_block_size(); - - for (int i = wg_id; i < nelems; i += wg_size) { - dst[i] = src[i]; - } - __syncthreads(); - - for (int i = PE_start; i < finish; i += stride) { - if (i != pe) { - putmem_nbi_wg(&pWrk[pe * nelems], reinterpret_cast(src), - nelems * sizeof(T), i); - - if (is_thread_zero_in_block()) { - fence(); - p(&pSync[pe], 1L, i); - } - __syncthreads(); - } - } - - // Do the compute and pSync reset in parallel. - - for (int i = PE_start; i < finish; i += stride) { - if (i != pe) { - // Wait for leader thread to see that the buffer is ready. - if (is_thread_zero_in_block()) { - wait_until(&pSync[i], ROCSHMEM_CMP_EQ, 1L); - } - __syncthreads(); - - T *ptr = &pWrk[i * nelems]; - compute_reduce(ptr, dst, nelems, wg_id, wg_size); - } - } - - __syncthreads(); - - for (int i = wg_id; i < num_pes; i += wg_size) { - pSync[i] = ROCSHMEM_SYNC_VALUE; - } - - __syncthreads(); -} - -template -__device__ void GPUIBContext::to_all(rocshmem_team_t team, T *dest, - const T *source, int nreduce) { - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-power-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - - int pe_start = team_obj->tinfo_wrt_world->pe_start; - int pe_size = team_obj->tinfo_wrt_world->size; - - long *p_sync = team_obj->reduce_pSync; - T *pWrk = reinterpret_cast(team_obj->pWrk); - - to_all(dest, source, nreduce, pe_start, log_pe_stride, pe_size, pWrk, - p_sync); -} - -template -__device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce, - int PE_start, int logPE_stride, - int PE_size, T *pWrk, - long *pSync) { // NOLINT(runtime/int) - size_t direct_pWrk = num_pes * nreduce; - size_t direct_pSync = num_pes; - - size_t ring_pSync = 2 * num_pes; - - size_t provided_pWrk = - max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); - size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE; - - // TODO(bpotter): - // We basically do a direct reduce if pWrk is big enough, else we - // give up. In the future we will want to design algorithms to work - // with nreduce/2 + 1 space, which would cover every case per the - // standard. - if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) { - internal_direct_allreduce(dest, source, nreduce, PE_start, - logPE_stride, PE_size, pWrk, pSync); - } else { - if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) { - int chunk_size = 1024; - size_t ring_pWrk = chunk_size * num_pes; - if (provided_pWrk < ring_pWrk) { - ring_pWrk = max(nreduce / 2, // NOLINT - ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); - chunk_size = ring_pWrk / num_pes; - } - int seg_size = ring_pWrk; - int n_seg = nreduce / seg_size; - if (n_seg == 0) { - n_seg = 1; - seg_size = nreduce; - chunk_size = seg_size / num_pes; - } - internal_ring_allreduce(dest, source, nreduce, PE_start, - logPE_stride, PE_size, pWrk, pSync, n_seg, - seg_size, chunk_size); - } else { - GPU_DPRINTF("Unsupported reduction size for gpu_ib.\n"); - } - } -} - -template -__device__ void GPUIBContext::put(T *dest, const T *source, size_t nelems, - int pe) { - putmem(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ T GPUIBContext::g(const T *source, int pe) { - T ret; - auto *src_const_cast = reinterpret_cast(source); - uint64_t L_offset = const_cast(src_const_cast) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - ipcImpl_.ipcCopy(&ret, ipcImpl_.ipc_bases[pe] + L_offset, sizeof(T)); - return ret; - } else { - int thread_id = get_flat_block_id(); - int block_size = get_flat_block_size(); - int offset = ctx_idx * block_size + thread_id; - - char *base_dest = g_ret; - char *dest = &base_dest[offset * sizeof(int64_t)]; - size_t nelems = sizeof(T); - - bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems); - if (!must_send_message) { - return ret; - } - getQueuePair(pe)->get_nbi(base_heap[pe] + L_offset, dest, nelems, - pe, true); - getQueuePair(pe)->quiet_single(); - getQueuePair(my_pe)->hdp_policy->hdp_flush(); - __threadfence(); - ret = *(reinterpret_cast(dest)); - return ret; - } - return ret; -} - -template -__device__ void GPUIBContext::put_nbi(T *dest, const T *source, size_t nelems, - int pe) { - putmem_nbi(dest, source, sizeof(T) * nelems, pe); -} - -template -__device__ void GPUIBContext::get(T *dest, const T *source, size_t nelems, - int pe) { - getmem(dest, source, sizeof(T) * nelems, pe); -} - -template -__device__ void GPUIBContext::get_nbi(T *dest, const T *source, size_t nelems, - int pe) { - getmem_nbi(dest, source, sizeof(T) * nelems, pe); -} - -template -__device__ T GPUIBContext::amo_fetch_add(void *dst, T value, int pe) { - uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - return ipcImpl_.ipcAMOFetchAdd( - reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), value); - } else { - auto *qp = getQueuePair(pe); - return qp->atomic_fetch(base_heap[pe] + L_offset, value, 0, pe, true, - MLX5_OPCODE_ATOMIC_FA); - } -} - -template -__device__ T GPUIBContext::amo_fetch_cas(void *dst, T value, T cond, int pe) { - uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - return ipcImpl_.ipcAMOFetchCas( - reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), cond, value); - } else { - auto *qp = getQueuePair(pe); - return qp->atomic_fetch(base_heap[pe] + L_offset, value, cond, pe, true, - MLX5_OPCODE_ATOMIC_CS); - } -} - -template -__device__ void GPUIBContext::amo_add(void *dst, T value, int pe) { - uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - ipcImpl_.ipcAMOAdd(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), - value); - } else { - auto *qp = getQueuePair(pe); - qp->atomic_nofetch(base_heap[pe] + L_offset, value, 0, pe, true, - MLX5_OPCODE_ATOMIC_FA); - } -} - -template -__device__ void GPUIBContext::amo_set(void *dst, T value, int pe) { - uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; - - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - ipcImpl_.ipcAMOSet(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), - value); - } else { - auto *qp = getQueuePair(pe); - - // Guess that the remote memory is zero by setting condition to zero. - // The compare-and-swap loop will execute at least twice if wrong. - // It may run additional times if contention on memory location. - T ret_val; - T cond = 0; - while ((ret_val = qp->atomic_fetch(base_heap[pe] + L_offset, value, cond, - pe, true, MLX5_OPCODE_ATOMIC_CS))) { - if (ret_val == cond) { - break; - } - cond = ret_val; - } - } -} - -template -__device__ T GPUIBContext::amo_swap(void *dst, T value, int pe) { - assert(false); - return 0; -} - -template -__device__ T GPUIBContext::amo_fetch_and(void *dst, T value, int pe) { - assert(false); - return 0; -} - -template -__device__ void GPUIBContext::amo_and(void *dst, T value, int pe) { - assert(false); -} - -template -__device__ T GPUIBContext::amo_fetch_or(void *dst, T value, int pe) { - assert(false); - return 0; -} - -template -__device__ void GPUIBContext::amo_or(void *dst, T value, int pe) { - assert(false); -} - -template -__device__ T GPUIBContext::amo_fetch_xor(void *dst, T value, int pe) { - assert(false); - return 0; -} - -template -__device__ void GPUIBContext::amo_xor(void *dst, T value, int pe) { - assert(false); -} - -template -__device__ void GPUIBContext::amo_cas(void *dst, T value, T cond, int pe) { - uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; - if (ipcImpl_.isIpcAvailable(my_pe, pe)) { - ipcImpl_.ipcAMOCas(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), - cond, value); - } else { - auto *qp = getQueuePair(pe); - qp->atomic_nofetch(base_heap[pe] + L_offset, value, cond, pe, true, - MLX5_OPCODE_ATOMIC_CS); - } -} - -template -__device__ void GPUIBContext::internal_put_broadcast( - T *dst, const T *src, int nelems, int pe_root, int pe_start, - int log_pe_stride, int pe_size, - [[maybe_unused]] long *p_sync) { // NOLINT(runtime/int) - if (my_pe == pe_root) { - int stride = 1 << log_pe_stride; - int finish = pe_start + stride * pe_size; - for (int i = pe_start; i < finish; i += stride) { - if (i != my_pe) { - put_nbi_wg(dst, src, nelems, i); - } - } - } -} - -template -__device__ void GPUIBContext::internal_get_broadcast( - T *dst, const T *src, int nelems, int pe_root, - [[maybe_unused]] long *pSync) { // NOLINT(runtime/int) - if (my_pe != pe_root) { - get_wg(dst, src, nelems, pe_root); - } -} - -template -__device__ void GPUIBContext::broadcast(rocshmem_team_t team, T *dst, - const T *src, int nelems, int pe_root) { - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - - int pe_start = team_obj->tinfo_wrt_world->pe_start; - int pe_size = team_obj->tinfo_wrt_world->size; - - long *p_sync = team_obj->bcast_pSync; - - // Passed pe_root is relative to team, convert to world root - int pe_root_world = team_obj->get_pe_in_world(pe_root); - - broadcast(dst, src, nelems, pe_root_world, pe_start, log_pe_stride, - pe_size, p_sync); -} - -template -__device__ void GPUIBContext::broadcast(T *dst, const T *src, int nelems, - int pe_root, int pe_start, - int log_pe_stride, int pe_size, - long *p_sync) { // NOLINT(runtime/int) - if (num_pes < 4) { - internal_put_broadcast(dst, src, nelems, pe_root, pe_start, log_pe_stride, - pe_size, p_sync); - } else { - internal_get_broadcast(dst, src, nelems, pe_root, p_sync); - } - // Synchronize on completion of broadcast - internal_sync(my_pe, pe_start, (1 << log_pe_stride), pe_size, p_sync); -} - -template -__device__ void GPUIBContext::alltoall(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // Currently broadcast implementation performs the best - alltoall_broadcast(team, dst, src, nelems); -} - -template -__device__ void GPUIBContext::alltoall_broadcast(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // Broadcast implementation of alltoall collective - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_start = team_obj->tinfo_wrt_world->pe_start; - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - int my_pe_in_team = team_obj->my_pe; - // Have each PE put their designated data to the other PEs - for (int j = 0; j < pe_size; j++) { - int dest_pe = team_obj->get_pe_in_world(j); - put_nbi_wg(&dst[my_pe_in_team * nelems], &src[j * nelems], nelems, dest_pe); - } - if (is_thread_zero_in_block()) { - quiet(); - } - // wait until everyone has obtained their designated data - internal_sync(my_pe, pe_start, stride, pe_size, pSync); -} - -template -__device__ void GPUIBContext::alltoall_brucks(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // Brucks implementation of alltoall collective - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_start = team_obj->tinfo_wrt_world->pe_start; - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - int my_pe_in_team = team_obj->my_pe; - int tid = get_flat_block_id(); - int blk_size = get_flat_block_size(); - - // Check if we have enough buffer space. If not, fail. - if (pe_size * nelems * 2 > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { - GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n"); - assert(false); - } - - T *pAta1 = reinterpret_cast(team_obj->pAta); - T *pAta2 = &pAta1[pe_size * nelems]; - - // Phase 1: Shift all data by (pe_size * nelems) elements - for (size_t i = tid; i < pe_size * nelems; i += blk_size) { - size_t index = (i + my_pe_in_team * nelems) % (pe_size * nelems); - pAta1[i] = src[index]; - } - __syncthreads(); - - // Phase 2: Perform packing and data transfers - for (int64_t shift = 0; ((int64_t)1 << shift) < pe_size; shift++) { - int64_t shift_decimal = ((int64_t)1 << shift); - // Step 1: Pack data to be sent - for (int64_t i = tid; i < pe_size * nelems; i += blk_size) { - int64_t pos = i / nelems; - int64_t offset = i % nelems; - // If bit is set in index, insert in data to be sent - if ((pos >> shift) & 1) { - int64_t index = - ((pos >> (shift + 1)) << shift) + (pos & (shift_decimal - 1)); - pAta2[index * nelems + offset] = pAta1[i]; - } - } - threadfence_system(); - __syncthreads(); - - // Calculate how much data to be sent - int64_t region_size = shift_decimal * 2; - int64_t data_size = nelems * (pe_size / region_size * shift_decimal); - if (pe_size % region_size > shift_decimal) - data_size += pe_size % region_size - shift_decimal; - - // Step 2: Send data - int dest_pe = - team_obj->get_pe_in_world((my_pe_in_team + shift_decimal) % pe_size); - put_wg(dst, pAta2, data_size, dest_pe); - if (is_thread_zero_in_block()) { - quiet(); - } - threadfence_system(); - // Need to synchronize with both receiver and sender. So just sync all. - internal_sync(my_pe, pe_start, stride, pe_size, pSync); - // Step 3: Unpack received data - for (int i = tid; i < pe_size * nelems; i += blk_size) { - int64_t pos = i / nelems; - int64_t offset = i % nelems; - // If bit is set in index, insert in data to be sent - if ((pos >> shift) & 1) { - int64_t index = - ((pos >> (shift + 1)) << shift) + (pos & (shift_decimal - 1)); - pAta1[i] = dst[index * nelems + offset]; - } - } - threadfence_system(); - __syncthreads(); - } - - // Phase 3: Inverse rotation, shift data by (pe_size * nelems) elements - for (size_t i = tid; i < pe_size * nelems; i += blk_size) { - size_t offset = i % nelems; - size_t index = ((pe_size + my_pe_in_team - i / nelems) % pe_size) * nelems; - dst[index + offset] = pAta1[i]; - } - - // wait until everyone has sent the data - internal_sync(my_pe, pe_start, stride, pe_size, pSync); -} - -template -__device__ void GPUIBContext::alltoall_gcen(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // GPU-centric implementation of alltoall collective - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE]; - int my_pe_in_team = team_obj->my_pe; - - // Check if we have enough buffer space. If not, fail. - T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { - GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n"); - assert(false); - } - - // Works when number of PEs divisible by root(PE_size) - int num_clust = sqrt(pe_size); - int clust_size = (pe_size + num_clust - 1) / num_clust; - // TODO(bpotter): Allow any size of cluster - assert(num_clust * clust_size == pe_size); - int clust_id = my_pe_in_team / clust_size; - - int64_t flag_val = 1; - // Step 1: Send data to PEs in cluster - for (int i = 0; i < pe_size; ++i) { - int src_pe = - team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size)); - int src_loc = (i / clust_size) * clust_size + (my_pe_in_team % clust_size); - get_nbi_wg(&pAta[i * nelems], &src[src_loc * nelems], nelems, src_pe); - } - if (is_thread_zero_in_block()) { - quiet(); - } - __syncthreads(); - // Step 2: Send final data to PEs outside cluster - for (int i = 0; i < num_clust; i++) { - int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) + - i * clust_size); - int j = clust_id; - put_nbi_wg(&dst[j * nelems * clust_size], &pAta[i * nelems * clust_size], - nelems * clust_size, dest_pe); - } - if (is_thread_zero_in_block()) { - quiet(); - - // Now sync PEs in cluster and ring. Ideally, we overlap this. - int dest_pe = team_obj->get_pe_in_world(clust_id * clust_size); - if (dest_pe != my_pe) amo_add(pSync2, flag_val, dest_pe); - - int dest_pe2 = team_obj->get_pe_in_world(my_pe_in_team % clust_size); - if (dest_pe2 != my_pe) amo_add(&pSync[0], flag_val, dest_pe2); - - if (my_pe == dest_pe) { - wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1)); - pSync2[0] = ROCSHMEM_SYNC_VALUE; - __threadfence_system(); - for (int i = 1; i < clust_size; ++i) - put_nbi(&pSync2[0], &flag_val, 1, - team_obj->get_pe_in_world(my_pe_in_team + i)); - } else { - wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val); - pSync2[0] = ROCSHMEM_SYNC_VALUE; - __threadfence_system(); - } - - if (my_pe == dest_pe2) { - wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(num_clust - 1)); - pSync[0] = ROCSHMEM_SYNC_VALUE; - threadfence_system(); - for (size_t i = 1, j = dest_pe2 + clust_size * stride; i < num_clust; - ++i, j += clust_size * stride) { - put_nbi(&pSync[0], &flag_val, 1, j); - } - } else { - wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); - pSync[0] = ROCSHMEM_SYNC_VALUE; - threadfence_system(); - } - } - __syncthreads(); -} - -template -__device__ void GPUIBContext::alltoall_gcen2(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // GPU-centric implementation of alltoall collective - // Uses in-place blocking sync - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE]; - int my_pe_in_team = team_obj->my_pe; - - // Check if we have enough buffer space. If not, fail. - T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { - GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n"); - assert(false); - } - - // Works when number of PEs divisible by root(PE_size) - int num_clust = sqrt(pe_size); - int clust_size = (pe_size + num_clust - 1) / num_clust; - // TODO(bpotter): Allow any size of cluster - assert(num_clust * clust_size == pe_size); - int clust_id = my_pe_in_team / clust_size; - - int64_t flag_val = 1; - // Step 1: Send data to PEs in cluster - for (int i = 0; i < pe_size; ++i) { - int src_pe = - team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size)); - int src_loc = (i / clust_size) * clust_size + (my_pe_in_team % clust_size); - get_nbi_wg(&pAta[i * nelems], &src[src_loc * nelems], nelems, src_pe); - } - - if (is_thread_zero_in_block()) { - int dest_pe = team_obj->get_pe_in_world(clust_id * clust_size); - if (dest_pe != my_pe) amo_add(pSync2, flag_val, dest_pe); - quiet(); - } - __syncthreads(); - - // Step 2: Send final data to PEs outside cluster - // Have each PE put their designated data to the other PEs - for (int i = 0; i < num_clust; i++) { - int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) + - i * clust_size); - int j = clust_id; - put_nbi_wg(&dst[j * nelems * clust_size], &pAta[i * nelems * clust_size], - nelems * clust_size, dest_pe); - } - - if (is_thread_zero_in_block()) { - quiet(); - if ((my_pe_in_team % clust_size) == 0) { - wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1)); - pSync2[0] = ROCSHMEM_SYNC_VALUE; - __threadfence_system(); - for (int i = 1; i < clust_size; ++i) - put_nbi(&pSync2[0], &flag_val, 1, - team_obj->get_pe_in_world(my_pe_in_team + i)); - } else { - wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val); - pSync2[0] = ROCSHMEM_SYNC_VALUE; - __threadfence_system(); - } - } - - // wait until everyone in ring has sent the data - internal_sync(my_pe, team_obj->get_pe_in_world(my_pe_in_team % clust_size), - clust_size * stride, num_clust, pSync); -} - -template -__device__ void GPUIBContext::fcollect(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // Main function for fcollect - // Broadcast version performs moderately well - // But there still seems to be scope for optimisation - fcollect_broadcast(team, dst, src, nelems); -} - -template -__device__ void GPUIBContext::fcollect_broadcast(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // Broadcast implementation of fcollect collective - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_start = team_obj->tinfo_wrt_world->pe_start; - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - int my_pe_in_team = team_obj->my_pe; - // Have each PE put their designated data to the other PEs - for (int j = 0; j < pe_size; j++) { - int dest_pe = team_obj->get_pe_in_world(j); - put_nbi_wg(&dst[my_pe_in_team * nelems], src, nelems, dest_pe); - } - - if (is_thread_zero_in_block()) { - quiet(); - } - // wait until everyone has obtained their designated data - internal_sync(my_pe, pe_start, stride, pe_size, pSync); -} - -template -__device__ void GPUIBContext::fcollect_brucks(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // Brucks implementation of fcollect collective - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_start = team_obj->tinfo_wrt_world->pe_start; - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - int my_pe_in_team = team_obj->my_pe; - int tid = get_flat_block_id(); - int blk_size = get_flat_block_size(); - - // Check if we have enough buffer space. If not, fail. - if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { - GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n"); - assert(false); - } - - T *pAta = reinterpret_cast(team_obj->pAta); - - // Initial src transfer - put_wg(pAta, src, nelems, team_obj->get_pe_in_world(my_pe_in_team)); - - // Phase 1: Perform data transfers - for (int64_t shift = 0; ((int64_t)1 << shift) < pe_size; shift++) { - int64_t shift_decimal = ((int64_t)1 << shift); - - // Calculate how much data to be sent - int64_t data_size = - min(shift_decimal, pe_size - shift_decimal) * nelems; // NOLINT - - // Send data - int dest_pe = - team_obj->get_pe_in_world((my_pe_in_team + shift_decimal) % pe_size); - put_wg(&pAta[shift_decimal * nelems], pAta, data_size, dest_pe); - - // Need to synchronize with both receiver and sender. So just sync all. - internal_sync(my_pe, pe_start, stride, pe_size, pSync); - } - - // Phase 2: Inverse rotation, shift data by (pe_size * nelems) elements - for (size_t i = tid; i < pe_size * nelems; i += blk_size) { - size_t offset = i % nelems; - size_t index = - ((pe_size + my_pe_in_team - i / nelems) % (pe_size)) * nelems; - dst[index + offset] = pAta[i]; - } - - // wait until everyone has sent the data - internal_sync(my_pe, pe_start, stride, pe_size, pSync); -} - -template -__device__ void GPUIBContext::fcollect_gcen(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // GPU-centric implementation of fcollect collective - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - long *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE]; - int my_pe_in_team = team_obj->my_pe; - - // Check if we have enough buffer space. If not, fail. - T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { - GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n"); - assert(false); - } - - // Works when number of PEs divisible by root(PE_size) - int num_clust = sqrt(pe_size); - int clust_size = (pe_size + num_clust - 1) / num_clust; - // TODO(bpotter): Allow any size of cluster - assert(num_clust * clust_size == pe_size); - int clust_id = my_pe_in_team / clust_size; - - int64_t flag_val = 1; - // Step 1: Send data to PEs in cluster - for (int i = 0; i < clust_size; ++i) { - int src_pe = - team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size)); - get_nbi_wg(&pAta[i * nelems], src, nelems, src_pe); - } - - if (is_thread_zero_in_block()) { - int dest_pe = team_obj->get_pe_in_world(clust_id * clust_size); - if (dest_pe != my_pe) amo_add(pSync2, flag_val, dest_pe); - quiet(); - } - __syncthreads(); - - // Step 2: Send final data to PEs outside cluster - // Have each PE put their designated data to the other PEs - for (int i = 0; i < num_clust; i++) { - int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) + - i * clust_size); - int j = clust_id; - put_nbi_wg(&dst[j * nelems * clust_size], pAta, nelems * clust_size, - dest_pe); - } - - if (is_thread_zero_in_block()) { - quiet(); - if ((my_pe_in_team % clust_size) == 0) { - wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1)); - pSync2[0] = ROCSHMEM_SYNC_VALUE; - threadfence_system(); - for (int i = 1; i < clust_size; ++i) - put_nbi(&pSync2[0], &flag_val, 1, - team_obj->get_pe_in_world(my_pe_in_team + i)); - } else { - wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val); - pSync2[0] = ROCSHMEM_SYNC_VALUE; - threadfence_system(); - } - } - - // wait until everyone in ring has sent the data - internal_sync(my_pe, team_obj->get_pe_in_world(my_pe_in_team % clust_size), - clust_size * stride, num_clust, pSync); -} - -template -__device__ void GPUIBContext::fcollect_gcen2(rocshmem_team_t team, T *dst, - const T *src, int nelems) { - // GPU-centric implementation of fcollect collective - // Uses in-place blocking sync - GPUIBTeam *team_obj = reinterpret_cast(team); - - double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride; - int log_pe_stride = static_cast(dbl_log_pe_stride); - /** - * Ensure that the stride is a multiple of 2 for GPU_IB. - * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides - * and remove this assert. - */ - assert((dbl_log_pe_stride - log_pe_stride) == 0); - int pe_size = team_obj->num_pes; - int stride = 1 << log_pe_stride; - - long *pSync = team_obj->alltoall_pSync; - int my_pe_in_team = team_obj->my_pe; - - // Check if we have enough buffer space. If not, fail. - T *pAta = reinterpret_cast(team_obj->pAta); - if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) { - GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n"); - assert(false); - } - - // Works when number of PEs divisible by root(PE_size) - int num_clust = sqrt(pe_size); - int clust_size = (pe_size + num_clust - 1) / num_clust; - // TODO(bpotter): Allow any size of cluster - assert(num_clust * clust_size == pe_size); - int clust_id = my_pe_in_team / clust_size; - - // Step 1: Send data to PEs in cluster - for (int i = 0; i < clust_size; ++i) { - int src_pe = - team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size)); - get_nbi_wg(&pAta[i * nelems], src, nelems, src_pe); - } - - if (is_thread_zero_in_block()) { - quiet(); - } - internal_sync(my_pe, team_obj->get_pe_in_world(clust_id * clust_size), stride, - clust_size, pSync); - - // Step 2: Send final data to PEs outside cluster - // Have each PE put their designated data to the other PEs - for (int i = 0; i < num_clust; i++) { - int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) + - i * clust_size); - int j = clust_id; - put_nbi_wg(&dst[j * nelems * clust_size], pAta, nelems * clust_size, - dest_pe); - } - - if (is_thread_zero_in_block()) quiet(); - - // wait until everyone in ring has sent the data - internal_sync(my_pe, team_obj->get_pe_in_world(my_pe_in_team % clust_size), - clust_size * stride, num_clust, pSync); -} - -/****************************************************************************** - ***************** SHMEM X API EXTENSION FOR BLOCK/WAVE LEVEL ***************** - *****************************************************************************/ - -template -__device__ void GPUIBContext::put_wg(T *dest, const T *source, size_t nelems, - int pe) { - putmem_wg(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ void GPUIBContext::put_wave(T *dest, const T *source, size_t nelems, - int pe) { - putmem_wave(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ void GPUIBContext::put_nbi_wg(T *dest, const T *source, - size_t nelems, int pe) { - putmem_nbi_wg(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ void GPUIBContext::put_nbi_wave(T *dest, const T *source, - size_t nelems, int pe) { - putmem_nbi_wave(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ void GPUIBContext::get_wg(T *dest, const T *source, size_t nelems, - int pe) { - getmem_wg(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ void GPUIBContext::get_wave(T *dest, const T *source, size_t nelems, - int pe) { - getmem_wave(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ void GPUIBContext::get_nbi_wg(T *dest, const T *source, - size_t nelems, int pe) { - getmem_nbi_wg(dest, source, nelems * sizeof(T), pe); -} - -template -__device__ void GPUIBContext::get_nbi_wave(T *dest, const T *source, - size_t nelems, int pe) { - getmem_nbi_wave(dest, source, nelems * sizeof(T), pe); -} - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_ diff --git a/src/gpu_ib/context_ib_tmpl_host.hpp b/src/gpu_ib/context_ib_tmpl_host.hpp deleted file mode 100644 index 259f158162..0000000000 --- a/src/gpu_ib/context_ib_tmpl_host.hpp +++ /dev/null @@ -1,173 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_ -#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_ - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../host/host_templates.hpp" - -namespace rocshmem { - -template -__host__ void GPUIBHostContext::p(T *dest, T value, int pe) { - host_interface->p(dest, value, pe, context_window_info); -} - -template -__host__ T GPUIBHostContext::g(const T *source, int pe) { - return host_interface->g(source, pe, context_window_info); -} - -template -__host__ void GPUIBHostContext::put(T *dest, const T *source, size_t nelems, - int pe) { - host_interface->put(dest, source, nelems, pe, context_window_info); -} - -template -__host__ void GPUIBHostContext::get(T *dest, const T *source, size_t nelems, - int pe) { - host_interface->get(dest, source, nelems, pe, context_window_info); -} - -template -__host__ void GPUIBHostContext::put_nbi(T *dest, const T *source, size_t nelems, - int pe) { - host_interface->put_nbi(dest, source, nelems, pe, context_window_info); -} - -template -__host__ void GPUIBHostContext::get_nbi(T *dest, const T *source, size_t nelems, - int pe) { - host_interface->get_nbi(dest, source, nelems, pe, context_window_info); -} - -template -__host__ void GPUIBHostContext::amo_add(void *dst, T value, int pe) { - host_interface->amo_add(dst, value, pe, context_window_info); -} - -template -__host__ void GPUIBHostContext::amo_cas(void *dst, T value, T cond, int pe) { - host_interface->amo_cas(dst, value, cond, pe, context_window_info); -} - -template -__host__ T GPUIBHostContext::amo_fetch_add(void *dst, T value, int pe) { - return host_interface->amo_fetch_add(dst, value, pe, context_window_info); -} - -template -__host__ T GPUIBHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) { - return host_interface->amo_fetch_cas(dst, value, cond, pe, - context_window_info); -} - -template -__host__ void GPUIBHostContext::broadcast( - T *dest, const T *source, int nelems, int pe_root, int pe_start, - int log_pe_stride, int pe_size, - long *p_sync) { // NOLINT(runtime/int) - host_interface->broadcast(dest, source, nelems, pe_root, pe_start, - log_pe_stride, pe_size, p_sync); -} - -template -__host__ void GPUIBHostContext::broadcast(rocshmem_team_t team, T *dest, - const T *source, int nelems, - int pe_root) { - host_interface->broadcast(team, dest, source, nelems, pe_root); -} - -template -__host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce, - int pe_start, int log_pe_stride, - int pe_size, T *p_wrk, - long *p_sync) { // NOLINT(runtime/int) - host_interface->to_all(dest, source, nreduce, pe_start, log_pe_stride, - pe_size, p_wrk, p_sync); -} - -template -__host__ void GPUIBHostContext::to_all(rocshmem_team_t team, T *dest, - const T *source, int nreduce) { - host_interface->to_all(team, dest, source, nreduce); -} - -template -__host__ void GPUIBHostContext::wait_until(T *ivars, int cmp, T val) { - host_interface->wait_until(ivars, cmp, val, context_window_info); -} - -template -__host__ void GPUIBHostContext::wait_until_all(T *ivars, size_t nelems, - const int* status, - int cmp, T val) { - host_interface->wait_until_all(ivars, nelems, status, cmp, val, context_window_info); -} - -template -__host__ size_t GPUIBHostContext::wait_until_any(T *ivars, size_t nelems, - const int* status, - int cmp, T val) { - return host_interface->wait_until_any(ivars, nelems, status, cmp, val, context_window_info); -} - -template -__host__ size_t GPUIBHostContext::wait_until_some(T *ivars, size_t nelems, - size_t* indices, - const int* status, - int cmp, T val) { - return host_interface->wait_until_some(ivars, nelems, indices, status, cmp, val, context_window_info); -} - -template -__host__ void GPUIBHostContext::wait_until_all_vector(T *ivars, size_t nelems, - const int* status, - int cmp, T* vals) { - host_interface->wait_until_all_vector(ivars, nelems, status, cmp, vals, context_window_info); -} - -template -__host__ size_t GPUIBHostContext::wait_until_any_vector(T *ivars, size_t nelems, - const int* status, - int cmp, T* vals) { - return host_interface->wait_until_any_vector(ivars, nelems, status, cmp, vals, context_window_info); -} - -template -__host__ size_t GPUIBHostContext::wait_until_some_vector(T *ivars, size_t nelems, - size_t* indices, - const int* status, - int cmp, T* vals) { - return host_interface->wait_until_some_vector(ivars, nelems, indices, status, cmp, vals, context_window_info); -} - -template -__host__ int GPUIBHostContext::test(T *ivars, int cmp, T val) { - return host_interface->test(ivars, cmp, val, context_window_info); -} - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_ diff --git a/src/gpu_ib/debug.cpp b/src/gpu_ib/debug.cpp deleted file mode 100644 index dd236e1cae..0000000000 --- a/src/gpu_ib/debug.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "rocshmem/rocshmem_debug.hpp" - -#include "qe_dumper.hpp" - -namespace rocshmem { - -void debug_print_cq(int dest_pe, int src_wg, int cqe_index) { - QeDumper dumper(dest_pe, src_wg, cqe_index); - dumper.dump_cq(); -} - -void debug_print_sq(int dest_pe, int src_wg, int wqe_index) { - QeDumper dumper(dest_pe, src_wg, wqe_index); - dumper.dump_sq(); -} - -} // namespace rocshmem diff --git a/src/gpu_ib/dynamic_connection.cpp b/src/gpu_ib/dynamic_connection.cpp deleted file mode 100644 index 7195601b24..0000000000 --- a/src/gpu_ib/dynamic_connection.cpp +++ /dev/null @@ -1,381 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "dynamic_connection.hpp" - -#include - -#include "backend_ib.hpp" - -namespace rocshmem { - -DynamicConnection::DynamicConnection(GPUIBBackend* b) : Connection(b, 4) { - char* value = nullptr; - - if ((value = getenv("ROCSHMEM_NUM_DCIs"))) { - num_dcis = atoi(value); - } - - if ((value = getenv("ROCSHMEM_NUM_DCT"))) { - num_dct = atoi(value); - } -} - -DynamicConnection::~DynamicConnection() { - CHECK_HIP(hipFree(vec_lids)); - CHECK_HIP(hipFree(vec_dct_num)); -} - -ibv_qp_init_attr_ex DynamicConnection::dct_qp_init_attr( - ibv_cq* cq, ibv_srq* srq, [[maybe_unused]] uint8_t port) const { - ibv_qp_init_attr_ex attr{}; - - attr.comp_mask = IBV_QP_INIT_ATTR_PD; - attr.pd = ib_state->pd; - attr.recv_cq = cq; - attr.send_cq = cq; - attr.srq = srq; - attr.qp_type = IBV_QPT_DRIVER; - - return attr; -} - -mlx5dv_qp_init_attr DynamicConnection::dct_dv_init_attr() { - mlx5dv_qp_init_attr dv_attr{}; - dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC; - dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCT; - dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY; - - return dv_attr; -} - -Connection::InitQPState DynamicConnection::initqp(uint8_t port) { - InitQPState initqp{}; - - initqp.exp_qp_attr.port_num = port; - initqp.exp_qp_attr.pkey_index = 0; - initqp.exp_qp_attr.qp_access_flags = 0; - - return initqp; -} - -Connection::RtrState DynamicConnection::rtr([[maybe_unused]] dest_info_t* dest, - uint8_t port) { - RtrState rtr{}; - - rtr.exp_qp_attr.ah_attr.is_global = 1; - rtr.exp_qp_attr.ah_attr.port_num = port; - - rtr.exp_qp_attr.max_dest_rd_atomic = 0; - rtr.exp_qp_attr.min_rnr_timer = 0; - - return rtr; -} - -Connection::RtsState DynamicConnection::rts([ - [maybe_unused]] dest_info_t* dest) { - RtsState rts{}; - rts.exp_attr_mask |= IBV_QP_SQ_PSN; - return rts; -} - -void DynamicConnection::connect_dci(ibv_qp* qp, uint8_t port) { - init_qp_status(qp, port); - change_status_rtr(qp, nullptr, port); - change_status_rts(qp, nullptr); -} - -/* - * create a DCT and get is to ready state - */ -void DynamicConnection::create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq, - uint8_t port) { - auto init_attr = dct_qp_init_attr(cq, srq, port); - auto dv_attr = dct_dv_init_attr(); - auto dct = mlx5dv_create_qp(ib_state->context, &init_attr, &dv_attr); - - if (dct == nullptr) { - printf("Failed to create dct \n"); - abort(); - } - - ibv_qp_attr qp_attr{}; - qp_attr.qp_state = IBV_QPS_INIT; - qp_attr.port_num = port; - qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; - - int attr_mask = - IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; - - int ret = ibv_modify_qp(dct, &qp_attr, attr_mask); - if (ret) { - abort(); - } - - qp_attr.qp_state = IBV_QPS_RTR; - qp_attr.path_mtu = IBV_MTU_4096; - qp_attr.min_rnr_timer = 7; - qp_attr.ah_attr.is_global = 1; - qp_attr.ah_attr.grh.hop_limit = 1; - qp_attr.ah_attr.grh.traffic_class = 0; - qp_attr.ah_attr.grh.sgid_index = 0; - qp_attr.ah_attr.port_num = port; - - attr_mask = IBV_QP_STATE | IBV_QP_MIN_RNR_TIMER | IBV_QP_AV | IBV_QP_PATH_MTU; - - ret = ibv_modify_qp(dct, &qp_attr, attr_mask); - if (ret) { - abort(); - } - - *dct_num = dct->qp_num; -} - -/* - * @brief create a qp (DCI qp) using DEVX - */ -ibv_qp* DynamicConnection::create_qp_0(ibv_context* context, - ibv_qp_init_attr_ex* qp_attr) { - ibv_qp* qp; - qp_attr->qp_type = IBV_QPT_DRIVER; - - mlx5dv_qp_init_attr dv_attr{}; - dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC; - dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI; - dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY; - - qp = mlx5dv_create_qp(context, qp_attr, &dv_attr); - - return qp; -} - -void DynamicConnection::create_qps_1() { - ibv_srq_init_attr srq_init_attr{}; - srq_init_attr.attr.max_wr = 1; - srq_init_attr.attr.max_sge = 1; - - srq = ibv_create_srq(ib_state->pd, &srq_init_attr); - if (!srq) { - abort(); - } - - dct_cq = ibv_create_cq(ib_state->context, 100, nullptr, nullptr, 0); - if (!dct_cq) { - abort(); - } -} - -void DynamicConnection::create_qps_2(int port, int my_rank, - ibv_port_attr* ib_port_att) { - for (int i = 0; i < num_dct; i++) { - int32_t dct_num; - create_dct(&dct_num, dct_cq, srq, port); - dct_num = htobe32(dct_num); - dcts_num[my_rank * num_dct + i] = dct_num; - } - lids[my_rank] = htobe16(ib_port_att->lid); -} - -void DynamicConnection::create_qps_3( - int port, ibv_qp* qp, [[maybe_unused]] int offset, - [[maybe_unused]] ibv_port_attr* ib_port_att) { - return connect_dci(qp, port); -} - -void DynamicConnection::get_remote_conn(int* remote_conn) { - *remote_conn = num_dcis; -} - -void DynamicConnection::allocate_dynamic_members([ - [maybe_unused]] int num_wg) { - size_t num_pes_size_bytes = sizeof(uint16_t) * backend->num_pes; - lids = reinterpret_cast(malloc(num_pes_size_bytes)); - if (lids == nullptr) { - abort(); - } - - size_t num_dcts = num_dct * backend->num_pes; - size_t num_dcts_size_bytes = sizeof(uint32_t) * num_dcts; - dcts_num = reinterpret_cast(malloc(num_dcts_size_bytes)); - if (dcts_num == nullptr) { - abort(); - } -} - -/* - * get the wqe_av information from tyhe ibv_ah - * rely on DEVX to extract the AV. We use the AV to create - * the DC segment - */ -void DynamicConnection::dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av) { - mlx5dv_obj dv; - mlx5dv_ah dah; - - dv.ah.in = ah; - dv.ah.out = &dah; - mlx5dv_init_obj(&dv, MLX5DV_OBJ_AH); - - memcpy(mlx5_av, dah.av, sizeof(mlx5_wqe_av)); -} - -void DynamicConnection::free_dynamic_members() { - free(lids); - free(dcts_num); -} - -void DynamicConnection::initialize_1(int port, [[maybe_unused]] int num_wg) { - MPI_Allgather(MPI_IN_PLACE, sizeof(int32_t) * num_dct, MPI_CHAR, dcts_num, - sizeof(int32_t) * num_dct, MPI_CHAR, backend->thread_comm); - - MPI_Allgather(MPI_IN_PLACE, sizeof(int16_t), MPI_CHAR, lids, sizeof(int16_t), - MPI_CHAR, backend->thread_comm); - - hipStream_t stream; - CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - CHECK_HIP(hipMalloc(reinterpret_cast(&vec_dct_num), - sizeof(int32_t) * num_dct * backend->num_pes)); - - CHECK_HIP(hipMemcpyAsync(vec_dct_num, dcts_num, - sizeof(int32_t) * num_dct * backend->num_pes, - hipMemcpyHostToDevice, stream)); - - CHECK_HIP(hipMalloc(reinterpret_cast(&vec_lids), - sizeof(int16_t) * backend->num_pes)); - - CHECK_HIP(hipMemcpyAsync(vec_lids, lids, sizeof(int16_t) * backend->num_pes, - hipMemcpyHostToDevice, stream)); - - struct ibv_ah_attr ah_attr; - memset(&ah_attr, 0, sizeof(ah_attr)); - - ah_attr.is_global = 1; - ah_attr.dlid = ib_state->portinfo.lid; - ah_attr.sl = 1; - ah_attr.src_path_bits = 0; - ah_attr.port_num = port; - - ah = ibv_create_ah(ib_state->pd, &ah_attr); - if (ah == nullptr) { - abort(); - } - - dc_get_av(ah, &mlx5_av); - - CHECK_HIP(hipStreamSynchronize(stream)); - CHECK_HIP(hipStreamDestroy(stream)); -} - -void DynamicConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle, - ibv_mr* mr) { - CHECK_HIP(hipMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes)); - (*heap_rkey_handle)[backend->my_pe] = htobe32(mr->rkey); -} - -void DynamicConnection::free_rkey_handle(uint32_t* heap_rkey_handle) { - CHECK_HIP(hipFree(heap_rkey_handle)); -} - -Connection::QPInitAttr DynamicConnection::qpattr(ibv_qp_cap cap) { - QPInitAttr qpattr(cap); - return qpattr; -} - -/* - * Create and write the DC segment to SQ. - * We get all the info needed from the mlx5_wqe_av that we extract from ibv_ah. - */ -void DynamicConnection::set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg, - uint64_t dc_key, uint32_t dct_num, - uint8_t ext, mlx5_wqe_av* mlx5_av) { - dc_seg->av.key.dc_key = htobe64(dc_key); - dc_seg->av.dqp_dct = htobe32(((uint32_t)ext << 31 | dct_num)); - dc_seg->av.stat_rate_sl = mlx5_av->stat_rate_sl; - dc_seg->av.fl_mlid = mlx5_av->fl_mlid; - dc_seg->av.rlid = mlx5_av->rlid; -} - -/* - * create a DC wqe and post it to the SQ - * we rely on mlx5dv functions to ceate the ctrl and data - * segments but we use our own function to write teh DC and rdma segments - */ -void DynamicConnection::post_dv_dc_wqe(int remote_conn) { - mlx5_wqe_ctrl_seg* ctrl; - mlx5_wqe_datagram_seg* dc_seg; - mlx5_wqe_raddr_seg* rdma; - mlx5_wqe_data_seg* data; - - for (int i = 0; i < remote_conn; i++) { - uint64_t* ptr = get_address_sq(i); - - const uint32_t nb_post = 4 * sq_size; - for (uint16_t index = 0; index < nb_post; index++) { - uint8_t op_mod = 0; - uint8_t op_code = 8; - uint32_t qp_num = qps[i]->qp_num; - uint8_t fm_ce_se = 0; - uint8_t ds = 4; - ctrl = reinterpret_cast(ptr); - mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds, 0, - 0); - ptr = ptr + 2; - - uint32_t dct_num = dcts_num[i]; - uint8_t ext = 1; - dc_seg = reinterpret_cast(ptr); - set_dgram_seg(dc_seg, (uint64_t)DC_IB_KEY, dct_num, ext, &mlx5_av); - ptr = ptr + 2; - - uint64_t address = 0; - uint32_t rkey = 0; - rdma = reinterpret_cast(ptr); - set_rdma_seg(rdma, address, rkey); - ptr = ptr + 2; - - uint32_t lkey = backend->networkImpl.heap_mr->lkey; - data = reinterpret_cast(ptr); - mlx5dv_set_data_seg(data, 1, lkey, 0); - ptr = ptr + 2; - } - } -} - -// TODO(bpotter): remove redundancies with the other derived class -void DynamicConnection::post_wqes() { - int remote_conn; - get_remote_conn(&remote_conn); - remote_conn *= backend->num_blocks_; - post_dv_dc_wqe(remote_conn); -} - -void DynamicConnection::initialize_wr_fields([[maybe_unused]] ibv_send_wr* wr, - [[maybe_unused]] ibv_ah* ah, - [[maybe_unused]] int dc_key) {} - -int DynamicConnection::get_sq_dv_offset([[maybe_unused]] int pe_idx, - [[maybe_unused]] int num_qps, - int wg_idx) { - return wg_idx; -} - -} // namespace rocshmem diff --git a/src/gpu_ib/dynamic_connection.hpp b/src/gpu_ib/dynamic_connection.hpp deleted file mode 100644 index ff85ef9305..0000000000 --- a/src/gpu_ib/dynamic_connection.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_ -#define LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_ - -#include "connection.hpp" - -namespace rocshmem { - -class DynamicConnection : public Connection { - public: - explicit DynamicConnection(GPUIBBackend* backend); - - ~DynamicConnection() override; - - void get_remote_conn(int* remote_conn) override; - - void post_wqes() override; - - void initialize_rkey_handle(uint32_t** heap_rkey_handle, - ibv_mr* mr) override; - - void free_rkey_handle(uint32_t* heap_rkey_handle) override; - - uint32_t* get_vec_dct_num() const { return vec_dct_num; } - - uint16_t* get_vec_lids() const { return vec_lids; } - - private: - InitQPState initqp(uint8_t port) override; - - RtrState rtr(dest_info_t* dest, uint8_t port) override; - - RtsState rts(dest_info_t* dest) override; - - QPInitAttr qpattr(ibv_qp_cap cap) override; - - void connect_dci(ibv_qp* qp, uint8_t port); - - void create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq, uint8_t port); - - ibv_qp_init_attr_ex dct_qp_init_attr(ibv_cq* cq, ibv_srq* srq, - uint8_t port) const; - - mlx5dv_qp_init_attr dct_dv_init_attr(); - - void dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av); - - void set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg, uint64_t dc_key, - uint32_t dct_num, uint8_t ext, mlx5_wqe_av* av); - - void set_data_seg(mlx5_wqe_data_seg* data_seg, uint32_t lkey); - - void post_dv_dc_wqe(int remote_conn); - - void create_qps_1() override; - - void create_qps_2(int port, int my_rank, - ibv_port_attr* ib_port_att) override; - - void create_qps_3(int port, ibv_qp* qp, int offset, - ibv_port_attr* ib_port_att) override; - - ibv_qp* create_qp_0(ibv_context* context, - ibv_qp_init_attr_ex* qp_attr) override; - - void allocate_dynamic_members(int num_wg) override; - - void free_dynamic_members() override; - - void initialize_1(int port, int num_wg) override; - - void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override; - - int get_sq_dv_offset(int pe_idx, int32_t num_qps, int wg_idx) override; - - int num_dcis{1}; - - int num_dct{1}; - - static constexpr int DC_IB_KEY{0x1ee7a330}; - - uint32_t* dcts_num{nullptr}; - - uint16_t* lids{nullptr}; - - mlx5_wqe_av mlx5_av{}; - - ibv_ah* ah{nullptr}; - - ibv_srq* srq{nullptr}; - - ibv_cq* dct_cq{nullptr}; - - uint32_t* vec_dct_num{nullptr}; - - uint16_t* vec_lids{nullptr}; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_ diff --git a/src/gpu_ib/endian.cpp b/src/gpu_ib/endian.cpp deleted file mode 100644 index 82c64e87b3..0000000000 --- a/src/gpu_ib/endian.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "endian.hpp" - -namespace rocshmem { - -template -__device__ void swap_endian_store(T *dst, const T val) { - typedef union U { - T val; - uint8_t bytes[sizeof(T)]; - } union_type; - union_type src; - union_type dst_tmp; - - src.val = val; - std::reverse_copy(src.bytes, src.bytes + sizeof(T), dst_tmp.bytes); - *dst = dst_tmp.val; -} - -template <> -__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) { - uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) | - ((val >> 8) & 0x00FF00FF00FF00FFULL); - - new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) | - ((new_val >> 16) & 0x0000FFFF0000FFFFULL); - - *dst = (new_val << 32) | (new_val >> 32); -} - -template <> -__device__ void swap_endian_store(int64_t *dst, const int64_t val) { - swap_endian_store(reinterpret_cast(dst), (const uint64_t)val); -} - -template <> -__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) { - uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); - - *dst = (new_val << 16) | (new_val >> 16); -} - -template <> -__device__ void swap_endian_store(int32_t *dst, const int32_t val) { - swap_endian_store(reinterpret_cast(dst), (const uint32_t)val); -} - -template <> -__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) { - *dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF); -} - -template <> -__device__ void swap_endian_store(int16_t *dst, const int16_t val) { - swap_endian_store(reinterpret_cast(dst), (const uint16_t)val); -} - -} // namespace rocshmem diff --git a/src/gpu_ib/endian.hpp b/src/gpu_ib/endian.hpp deleted file mode 100644 index 01f798cd5b..0000000000 --- a/src/gpu_ib/endian.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_ENDIAN_HPP_ -#define LIBRARY_SRC_GPU_IB_ENDIAN_HPP_ - -#include - -namespace rocshmem { - -template -__device__ void swap_endian_store(T *dst, const T val); - -template <> -__device__ void swap_endian_store(uint64_t *dst, const uint64_t val); - -template <> -__device__ void swap_endian_store(int64_t *dst, const int64_t val); - -template <> -__device__ void swap_endian_store(uint32_t *dst, const uint32_t val); - -template <> -__device__ void swap_endian_store(int32_t *dst, const int32_t val); - -template <> -__device__ void swap_endian_store(uint16_t *dst, const uint16_t val); - -template <> -__device__ void swap_endian_store(int16_t *dst, const int16_t val); - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_ENDIAN_HPP_ diff --git a/src/gpu_ib/gpu_ib_team.cpp b/src/gpu_ib/gpu_ib_team.cpp deleted file mode 100644 index 0aa04c8c83..0000000000 --- a/src/gpu_ib/gpu_ib_team.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "gpu_ib_team.hpp" - -#include "../backend_type.hpp" -#include "backend_ib.hpp" - -namespace rocshmem { - -GPUIBTeam::GPUIBTeam(Backend *backend, TeamInfo *team_info_parent, - TeamInfo *team_info_world, int num_pes, int my_pe, - MPI_Comm mpi_comm, int pool_index) - : Team(backend, team_info_parent, team_info_world, num_pes, my_pe, - mpi_comm) { - type = BackendType::GPU_IB_BACKEND; - const GPUIBBackend *b = static_cast(backend); - - pool_index_ = pool_index; - - barrier_pSync = - &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]); - reduce_pSync = - &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]); - bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]); - alltoall_pSync = - &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]); - - pWrk = reinterpret_cast(b->pWrk_pool) + - ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; - pAta = reinterpret_cast(b->pAta_pool) + - ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; -} - -GPUIBTeam::~GPUIBTeam() {} - -} // namespace rocshmem diff --git a/src/gpu_ib/gpu_ib_team.hpp b/src/gpu_ib/gpu_ib_team.hpp deleted file mode 100644 index 3d7b18d1a7..0000000000 --- a/src/gpu_ib/gpu_ib_team.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_ -#define LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_ - -#include "../team.hpp" - -namespace rocshmem { - -class GPUIBTeam : public Team { - public: - GPUIBTeam(Backend* handle, TeamInfo* team_info_wrt_parent, - TeamInfo* team_info_wrt_world, int num_pes, int my_pe, - MPI_Comm team_comm, int pool_index); - - virtual ~GPUIBTeam(); - - long* barrier_pSync{nullptr}; - long* reduce_pSync{nullptr}; - long* bcast_pSync{nullptr}; - long* alltoall_pSync{nullptr}; - void* pWrk{nullptr}; - void* pAta{nullptr}; - - int pool_index_{-1}; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_ diff --git a/src/gpu_ib/infiniband_structs.hpp b/src/gpu_ib/infiniband_structs.hpp deleted file mode 100644 index 3726be23f8..0000000000 --- a/src/gpu_ib/infiniband_structs.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_ -#define LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_ - -#include - -namespace rocshmem { - -typedef struct ib_mlx5_base_av { - uint64_t dc_key; - uint32_t dqp_dct; - uint8_t stat_rate_sl; - uint8_t fl_mlid; - uint16_t rlid; -} ib_mlx5_base_av_t; - -union mlx5_segment { - mlx5_wqe_ctrl_seg ctrl_seg; - mlx5_wqe_raddr_seg raddr_seg; - mlx5_wqe_atomic_seg atomic_seg; - mlx5_wqe_data_seg data_seg; - mlx5_wqe_inl_data_seg inl_data_seg; - ib_mlx5_base_av_t base_av; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_ diff --git a/src/gpu_ib/memory_builder_policy.hpp b/src/gpu_ib/memory_builder_policy.hpp deleted file mode 100644 index 1c6f211c95..0000000000 --- a/src/gpu_ib/memory_builder_policy.hpp +++ /dev/null @@ -1,78 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_ -#define LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_ - -#include - -#include - -namespace rocshmem { - -class GPUIBContext; - -class MemoryBuilderPolicyWrapper { - public: - __device__ MemoryBuilderPolicyWrapper() = default; - - __device__ ~MemoryBuilderPolicyWrapper() { - if (wrapped_policy_) { - delete wrapped_policy_; - } - } - - template - __device__ MemoryBuilderPolicyWrapper(T&& policy) - : wrapped_policy_(new Wrapper(std::forward(policy))) {} - - __device__ void operator()(GPUIBContext* context) { - return (*wrapped_policy_)(context); - } - - private: - class PolicyBase { - public: - __device__ virtual void operator()(GPUIBContext* context) = 0; - - __device__ virtual ~PolicyBase() {} - }; - - template - class Wrapper : public PolicyBase { - public: - __device__ Wrapper(const T& t) : wrapped_policy_(t) {} - - __device__ void operator()(GPUIBContext* context) override { - return wrapped_policy_(context); - } - - private: - T wrapped_policy_; - }; - - PolicyBase* wrapped_policy_; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_ diff --git a/src/gpu_ib/network_policy.cpp b/src/gpu_ib/network_policy.cpp deleted file mode 100644 index b251dd2ad2..0000000000 --- a/src/gpu_ib/network_policy.cpp +++ /dev/null @@ -1,500 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "network_policy.hpp" - -#include - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../atomic_return.hpp" -#include "../context_incl.hpp" -#include "backend_ib.hpp" -#include "connection.hpp" -#include "dynamic_connection.hpp" -#include "queue_pair.hpp" -#include "reliable_connection.hpp" - -namespace rocshmem { - -void NetworkOnImpl::dump_backend_stats(ROCStats *globalStats) { - /* - * TODO(bpotter): Refactor this into the Stats class to remove the ifdef - */ -#ifdef PROFILE - int statblocks = connection->total_number_connections(); - - uint64_t cycles_ring_sq_db = 0; - uint64_t cycles_update_wqe = 0; - uint64_t cycles_poll_cq = 0; - uint64_t cycles_next_cq = 0; - uint64_t cycles_init = gpu_qps[statblocks - 1].profiler.getStat(INIT); - uint64_t cycles_finalize = gpu_qps[statblocks - 1].profiler.getStat(FINALIZE); - - uint64_t total_quiet_count = 0; - uint64_t total_db_count = 0; - uint64_t total_wqe_count = 0; - - for (int i = 0; i < statblocks; i++) { - cycles_ring_sq_db += gpu_qps[i].profiler.getStat(RING_SQ_DB); - cycles_update_wqe += gpu_qps[i].profiler.getStat(UPDATE_WQE); - cycles_poll_cq += gpu_qps[i].profiler.getStat(POLL_CQ); - cycles_next_cq += gpu_qps[i].profiler.getStat(NEXT_CQ); - total_quiet_count += gpu_qps[i].profiler.getStat(QUIET_COUNT); - total_db_count += gpu_qps[i].profiler.getStat(DB_COUNT); - total_wqe_count += gpu_qps[i].profiler.getStat(WQE_COUNT); - } - - double us_ring_sq_db = cycles_ring_sq_db / gpu_clock_freq_mhz; - double us_update_wqe = cycles_update_wqe / gpu_clock_freq_mhz; - double us_poll_cq = cycles_poll_cq / gpu_clock_freq_mhz; - double us_next_cq = cycles_next_cq / gpu_clock_freq_mhz; - double us_init = cycles_init / gpu_clock_freq_mhz; - double us_finalize = cycles_finalize / gpu_clock_freq_mhz; - - const int FIELD_WIDTH = 20; - const int FLOAT_PRECISION = 2; - - printf("Counts: Internal Quiets %lu DB Rings %lu WQE Posts %lu\n", - total_quiet_count, total_db_count, total_wqe_count); - - printf("\n%*s%*s%*s%*s%*s%*s\n", FIELD_WIDTH + 1, "Init (us)", - FIELD_WIDTH + 1, "Finalize (us)", FIELD_WIDTH + 1, "Ring SQ DB (us)", - FIELD_WIDTH + 1, "Update WQE (us)", FIELD_WIDTH + 1, "Poll CQ (us)", - FIELD_WIDTH + 1, "Next CQ (us)"); - - uint64_t totalFinalize = globalStats->getStat(NUM_FINALIZE); - printf("%*.*f %*.*f %*.*f %*.*f %*.*f %*.*f\n", FIELD_WIDTH, FLOAT_PRECISION, - us_init / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION, - us_finalize / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION, - us_ring_sq_db / total_db_count, FIELD_WIDTH, FLOAT_PRECISION, - us_update_wqe / total_wqe_count, FIELD_WIDTH, FLOAT_PRECISION, - us_poll_cq / total_quiet_count, FIELD_WIDTH, FLOAT_PRECISION, - us_next_cq / total_quiet_count); -#endif -} - -void NetworkOnImpl::reset_backend_stats() { - int statblocks = connection->total_number_connections(); - - for (size_t i = 0; i < statblocks; i++) { - gpu_qps[i].profiler.resetStats(); - } -} - -void NetworkOnImpl::exchange_hdp_info(HdpPolicy *hdp_policy, - MPI_Comm thread_comm) { - /* - * Using Connection class, register the host-side hdp flush address - * with the InfiniBand network. - */ - connection->reg_mr(hdp_policy->get_hdp_flush_ptr(), 32, &hdp_mr, false); - - /* - * Allocate device-side memory for the remote HDP keys. - */ - CHECK_HIP(hipMalloc(reinterpret_cast(&hdp_rkey), - num_pes * sizeof(uint32_t))); - - /* - * Allocate device-side memory for the remote HDP addresses. - */ - CHECK_HIP(hipMalloc(reinterpret_cast(&hdp_address), - num_pes * sizeof(uintptr_t))); - - /* - * Allocate host-side memory to exchange hdp keys using MPI_Allgather. - */ - uint32_t *host_hdp_cpy = - reinterpret_cast(malloc(num_pes * sizeof(uint32_t))); - if (host_hdp_cpy == nullptr) { - abort(); - } - - /* - * Allocate host-side memory to exchange hdp addresses using - * MPI_Allgather. - */ - uint32_t **host_hdp_address_cpy = - reinterpret_cast(malloc(num_pes * sizeof(uint32_t *))); - if (host_hdp_address_cpy == nullptr) { - free(host_hdp_cpy); - abort(); - } - - /* - * This processing element writes its personal HDP key and HDP address - * into the host-side arrays which were just allocated. - */ - int my_rank = my_pe; - host_hdp_cpy[my_rank] = htobe32(hdp_mr->rkey); - host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr(); - - /* - * Do all-to-all exchange of our HDP key with other processing elements. - */ - MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_hdp_cpy, - sizeof(uint32_t), MPI_CHAR, thread_comm); - - /* - * Do all-to-all exchange of our HDP address with other processing - * elements. - */ - MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy, - sizeof(uint32_t *), MPI_CHAR, thread_comm); - - /* - * Copy the recently exchanged HDP keys to device memory. - */ - hipStream_t stream; - CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - CHECK_HIP(hipMemcpyAsync(hdp_rkey, host_hdp_cpy, num_pes * sizeof(uint32_t), - hipMemcpyHostToDevice, stream)); - - /* - * Copy the recently exchanged HDP addresses to device memory. - */ - CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy, - num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice, - stream)); - CHECK_HIP(hipStreamSynchronize(stream)); - CHECK_HIP(hipStreamDestroy(stream)); - - /* - * Free the host-side resources used to exchange HDP resources - * between processing elements. - */ - free(host_hdp_cpy); - free(host_hdp_address_cpy); -} - -void NetworkOnImpl::setup_atomic_region() { - /* - * Allocate fine-grained device-side memory for the atomic return - * region. - */ - allocate_atomic_region(&atomic_ret, num_blocks); - - /* - * Register the atomic return region on the InfiniBand network. - */ - connection->reg_mr(atomic_ret->atomic_base_ptr, - sizeof(uint64_t) * max_nb_atomic * num_blocks, &mr, false); - - /* - * Set member variable from class. - */ - atomic_ret->atomic_lkey = htobe32(mr->lkey); -} - -void NetworkOnImpl::heap_memory_rkey(char *local_heap_base, size_t heap_size, - MPI_Comm thread_comm, bool is_managed) { - /* - * Allocate host-side memory to hold remote keys for all processing - * elements. - */ - const size_t rkeys_size = sizeof(uint32_t) * num_pes; - uint32_t *host_rkey_cpy = reinterpret_cast(malloc(rkeys_size)); - if (host_rkey_cpy == nullptr) { - abort(); - } - - /* - * Using the Connection class, register the symmetric heap with the - * InfiniBand network. - */ - void *base_heap = local_heap_base; - connection->reg_mr(base_heap, heap_size, &heap_mr, is_managed); - - /* - * Using the memory region from the prior heap memory registration, - * allocate and initialize some device-side memory to hold the remote - * keys for the symmetric heap base. - * - * Only the device-side memory entry for this processing element will be - * updated with the key for the heap memory region. - */ - connection->initialize_rkey_handle(&heap_rkey, heap_mr); - - /* - * Copy the device-side heap base remote key array to the host-side - * heap base remote key array. - */ - hipStream_t stream; - CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - CHECK_HIP(hipMemcpyAsync(host_rkey_cpy, heap_rkey, rkeys_size, - hipMemcpyDeviceToHost, stream)); - CHECK_HIP(hipStreamSynchronize(stream)); - - /* - * Do all-to-all exchange of symmetric heap base remote key between the - * processing elements. - */ - MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_rkey_cpy, - sizeof(uint32_t), MPI_CHAR, thread_comm); - - /* - * Copy the recently updated host-side heap base remote key array back - * to the device-side memory. - */ - CHECK_HIP(hipMemcpyAsync(heap_rkey, host_rkey_cpy, rkeys_size, - hipMemcpyHostToDevice, stream)); - CHECK_HIP(hipStreamSynchronize(stream)); - CHECK_HIP(hipStreamDestroy(stream)); - - /* - * Free the host-side resources used to do the processing element - * exchange of keys and addresses for the symmetric heap base. - */ - free(host_rkey_cpy); - - /* - * Initialize this member variable to hold the InfiniBand memory - * region's local key. - */ - lkey = heap_mr->lkey; -} - -void NetworkOnImpl::setup_gpu_qps(GPUIBBackend *B) { - /* - * Determine how many connections are needed. - * The number of connections depends on the connection type and the - * number of workgroups. - */ - int connections; - connection->get_remote_conn(&connections); - connections *= num_blocks; - - /* - * Allocate device-side memory for the queue pairs. - */ - CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * connections)); - - /* - * For every connection, initialize the QueuePair. - */ - for (int i = 0; i < connections; i++) { - new (&gpu_qps[i]) QueuePair(B); - connection->init_gpu_qp_from_connection(&gpu_qps[i], i); - } -} - -void NetworkOnImpl::rocshmem_g_init(SymmetricHeap *heap_handle, - MPI_Comm thread_comm) { - init_g_ret(heap_handle, thread_comm, num_blocks, &g_ret); -} - -__host__ void NetworkOnImpl::networkHostSetup(GPUIBBackend *B) { - num_pes = B->num_pes; - my_pe = B->my_pe; - num_blocks = B->num_blocks_; - -#ifdef USE_DC - connection = new DynamicConnection(B); -#else - connection = new ReliableConnection(B); -#endif - - connection->initialize(B->num_blocks_); - exchange_hdp_info(B->hdp_policy, B->thread_comm); - - const auto &heap_bases{B->heap.get_heap_bases()}; - heap_memory_rkey(heap_bases[my_pe], B->heap.get_size(), B->thread_comm, - B->heap.is_managed()); - // The earliest we can allow the main thread to launch a kernel to - // avoid potential deadlock - network_init_done = true; - - setup_atomic_region(); - - connection->initialize_gpu_policy(&connection_policy, heap_rkey); - - rocshmem_g_init(&B->heap, B->thread_comm); - - connection->post_wqes(); - - setup_gpu_qps(B); -} - -__host__ void NetworkOnImpl::networkHostFinalize() { - CHECK_HIP(hipFree(hdp_rkey)); - hdp_rkey = nullptr; - - CHECK_HIP(hipFree(hdp_address)); - hdp_address = nullptr; - - CHECK_HIP(hipFree(atomic_ret)); - atomic_ret = nullptr; - - CHECK_HIP(hipFree(gpu_qps)); - gpu_qps = nullptr; - - CHECK_HIP(hipFree(connection_policy)); - connection_policy = nullptr; - - connection->free_rkey_handle(heap_rkey); - - connection->finalize(); - delete connection; - connection = nullptr; -} - -__host__ void NetworkOnImpl::networkHostInit(GPUIBContext *ctx, int buffer_id) { - int remote_conn = getNumQueuePairs(); - - CHECK_HIP(hipMalloc(&ctx->device_qp_proxy, remote_conn * sizeof(QueuePair))); - - for (int i = 0; i < getNumQueuePairs(); i++) { - /* - * RC gpu_qp is actually [NUM_PE][NUM_BLOCK] qps but is flattened. - * Each num_pe entry contains num_block QPs connected to that PE. - * For RC, we need to iterate gpu_qp[i][buffer_id] to collect a - * single QP for each connected PE in order to build context. - * For DC, NUM_PE = 1 so can just use buffer_id directly. - */ - int offset = num_blocks * i + buffer_id; - new (ctx->getQueuePair(i)) QueuePair(gpu_qps[offset]); - - auto *qp = ctx->getQueuePair(i); - qp->global_qp = &gpu_qps[offset]; - qp->num_cqs = getNumQueuePairs(); - qp->atomic_ret.atomic_base_ptr = - &atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id]; - qp->base_heap = ctx->base_heap; - } - ctx->g_ret = g_ret; -} - -__device__ void NetworkOnImpl::networkGpuInit(GPUIBContext *ctx, - int buffer_id) { - for (int i = 0; i < getNumQueuePairs(); i++) { - int offset = num_blocks * i + buffer_id; - - auto *qp = ctx->getQueuePair(i); - new (qp) QueuePair(gpu_qps[offset]); - - qp->global_qp = &gpu_qps[offset]; - qp->num_cqs = getNumQueuePairs(); - qp->atomic_ret.atomic_base_ptr = - &atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id]; - qp->base_heap = ctx->base_heap; - } - ctx->g_ret = g_ret; -} - -__device__ __host__ QueuePair *NetworkOnImpl::getQueuePair(QueuePair *qp_handle, - int pe) { -#ifdef USE_DC - return qp_handle; -#else - return &qp_handle[pe]; -#endif -} - -__device__ __host__ int NetworkOnImpl::getNumQueuePairs() { -#ifdef USE_DC - return 1; -#else - return num_pes; -#endif -} - -void NetworkOffImpl::networkHostSetup(GPUIBBackend *B) { - num_pes = B->num_pes; - my_pe = B->my_pe; - num_blocks = B->num_blocks_; - - exchange_hdp_info(B->hdp_policy, B->thread_comm); -} -void NetworkOffImpl::exchange_hdp_info(HdpPolicy *hdp_policy, - MPI_Comm thread_comm) { -#ifdef USE_SINGLE_NODE - // We are using the symmetric heap for the HDP flush ptr - hdp_address = reinterpret_cast(hdp_policy->get_hdp_flush_ptr()); -#else - /* - * Allocate device-side memory for the remote HDP addresses. - */ - CHECK_HIP(hipMalloc(reinterpret_cast(&hdp_address), - num_pes * sizeof(uintptr_t))); - - /* - * Allocate host-side memory to exchange hdp keys using MPI_Allgather. - */ - uint32_t *host_hdp_cpy = - reinterpret_cast(malloc(num_pes * sizeof(uint32_t))); - if (host_hdp_cpy == nullptr) { - abort(); - } - - /* - * Allocate host-side memory to exchange hdp addresses using - * MPI_Allgather. - */ - uint32_t **host_hdp_address_cpy = - reinterpret_cast(malloc(num_pes * sizeof(uint32_t *))); - if (host_hdp_address_cpy == nullptr) { - free(host_hdp_cpy); - abort(); - } - - /* - * This processing element writes its personal HDP address - * into the host-side array which were just allocated. - */ - int my_rank = my_pe; - host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr(); - - /* - * Do all-to-all exchange of our HDP address with other processing - * elements. - */ - MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy, - sizeof(uint32_t *), MPI_CHAR, thread_comm); - - /* - * Copy the recently exchanged HDP addresses to device memory. - */ - hipStream_t stream; - CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy, - num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice, - stream)); - CHECK_HIP(hipStreamSynchronize(stream)); - CHECK_HIP(hipStreamDestroy(stream)); - - /* - * Free the host-side resources used to exchange HDP resources - * between processing elements. - */ - free(host_hdp_cpy); - free(host_hdp_address_cpy); -#endif -} - -void NetworkOffImpl::networkHostFinalize() { -#ifndef USE_SINGLE_NODE - CHECK_HIP(hipFree(hdp_address)); -#endif - hdp_address = nullptr; -} - -} // namespace rocshmem diff --git a/src/gpu_ib/network_policy.hpp b/src/gpu_ib/network_policy.hpp deleted file mode 100644 index ab3c75a589..0000000000 --- a/src/gpu_ib/network_policy.hpp +++ /dev/null @@ -1,357 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_ -#define LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_ - -#include -#include - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "rocshmem/rocshmem.hpp" -#include "connection_policy.hpp" -#include "queue_pair.hpp" -#include "../hdp_policy.hpp" -#include "../memory/symmetric_heap.hpp" -#include "../stats.hpp" -#include "../util.hpp" - -struct ibv_mr; -struct hdp_reg_t; - -namespace rocshmem { - -struct atomic_ret_t; -class GPUIBBackend; -class GPUIBContext; -class GPUIBHostContext; -class Connection; - -class NetworkOnImpl { - public: - void dump_backend_stats(ROCStats *globalStats); - - void reset_backend_stats(); - - /** - * @brief setup the network resources and initialization for the - * GPUIBBackend - */ - __host__ void networkHostSetup(GPUIBBackend *B); - - /** - * @brief deallocate and close the network resources - */ - __host__ void networkHostFinalize(); - - /** - * @brief initialize the network resources for each context - */ - __host__ void networkHostInit(GPUIBContext *ctx, int buffer_id); - - /** - * @brief initialize the network resources for each context on GPU side - */ - __device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id); - - /** - * @brief returns the QP for the targeted pe - */ - __device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe); - - /** - * @brief returns the numbers of QPs used per the calling PE - */ - __device__ __host__ int getNumQueuePairs(); - - /** - * @brief returns the number of PEs accessible via network - */ - __device__ __host__ int getNumDest() { return num_pes; } - - static uint32_t externSharedBytes(int num_pes) { - int remote_conn{1}; -#ifndef USE_DC - remote_conn = num_pes; -#endif - return remote_conn * sizeof(QueuePair); - } - - protected: - /** - * @brief flag to indicated that the helper thread reach this milestone - */ - volatile bool network_init_done{false}; - - void heap_memory_rkey(char *local_heap_base, size_t heap_size, - MPI_Comm thread_comm, bool is_managed); - - /** - * @brief Exchange HDP information between all processing elements. - * - * Each device has a Host Data Path (HDP) associated with it must be - * manually controlled when using fine-grained memory accesses. (The - * symmetric heap is allocated with fine-grained memory to support both - * host memory accesses and device memory accesses.) The HDP can be - * cleared by accessing an address on the device. These addresses must be - * shared across the network (to support updates on remote accesses). - * - * These HDPs are visible to the network by registering them as - * InfiniBand memory regions. Every memory region has a remote key - * which needs to be shared across the network (to access the memory - * region). - * - * This method is responsible to allocating and initializing the - * library's HDP device-side memory and running the all-to-all exchange - * to share both the keys and addresses. - * - * @todo Implement HDP policy class methods to hide most of this - * method. The guts should be encapsulated in the policy class and - * not exposed here in the backend. Within the policy class methods, - * create helper function to improve code reuse regarding the many - * data transfers. - */ - void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm); - - /** - * @brief Allocate and initialize the atomic region. - * - * The atomic region is used by the atomic operations which have return - * values. The library user does not need to provide an address for the - * return value so we are forced to do it on their behalf. - * - * The atomic_ret member is initialized upon completion of this method. - */ - void setup_atomic_region(); - - /** - * @brief Allocate and initialize device-side queue pair objects. - * - * Upon completion, the gpu_qps member will be initialized. - */ - void setup_gpu_qps(GPUIBBackend *B); - - /** - * @brief Allocate and initialize device-side memory that will be used for - * the return of g shmem ops (eg: shmem_int_g) - */ - void rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm); - - /** - * @brief The backend delegates some InfiniBand connection setup to - * the Connection class. - */ - Connection *connection{nullptr}; - - public: - /** - * @brief Number of PEs. Get directly from the GPUIBBackend. - */ - int num_pes{0}; - - /** - * @brief This PE's rank. - */ - int my_pe{-1}; - - /** - * @brief Number of WG that will be performing communication - */ - int num_blocks{0}; - - /** - * @brief Holds InfiniBand remote keys for HDP memory regions. - * - * The member holds a C-array allocation for remote keys (from - * InfiniBand memory registrations) for remote HDP registers. The C-array - * has one entry for each processing element (indexed by processing - * element ID). - * - * @todo Remove duplication between the backend class and the QueuePair - * class. QueuePair stores a copy of this member too. The backend - * class does not do much besides initialize this data structure and - * hold it until the QueuePair can consume it. - */ - uint32_t *hdp_rkey{nullptr}; - - /** - * @brief Holds HDP register addresses for each processing element. - * - * The Host Data Path (HDP) addresses are used to clear a buffer - * which interferes with memory visibility of accesses to fine-grained - * allocations. - * - * The member holds a C-array allocation for the register addresses. - * The C-array has one entry for each processing element (indexed by - * processing element ID). - * - * @todo Remove duplication between the backend class and the QueuePair - * class. QueuePair stores a copy of this member too. The backend - * class does not do much besides initialize this data structure and - * hold it until the QueuePair can consume it. - */ - uintptr_t *hdp_address{nullptr}; - - /** - * @brief Handle for the HDP memory region. - */ - ibv_mr *hdp_mr{nullptr}; - - /** - * @brief Set of QueuePairs used by device to do networking. - * - * The member is used during Context creation. - * - * @todo What we really need here is a collection of Contexts that can - * either be copied into LDS or used directly by the GPU depending on - * what type of context it is (shareable, serialized, or private). - * No need to pool up QueuePairs, they can just be managed by their - * owning Context. Should then consider pushing into base class since - * it's not gpu-ib specific. - */ - QueuePair *gpu_qps{nullptr}; - - /** - * @brief C-array of symmetric heap base pointers. - * - * A C-array of char* pointers corresponding to the heap base pointers - * virtual address for each processing element that we can communicate - * with. - */ - uint32_t *heap_rkey{nullptr}; - - /** - * @brief Handle for the symmetric heap memory region. - */ - ibv_mr *heap_mr{nullptr}; - - /** - * @brief Local key for the symmetric heap memory region. - */ - uint32_t lkey{0}; - - /** - * @brief Control struct for atomic memory region. - * - * The atomic region is used by the atomic operations which have return - * values. The library user does not need to provide an address for the - * return value so we are forced to do it on their behalf. - */ - atomic_ret_t *atomic_ret{nullptr}; - - /** - * @brief Handle for the atomic memory region. - * - * @todo Provide more descriptive variable name. - */ - ibv_mr *mr{nullptr}; - - /** - * @brief Buffer used to store the results of a *_g operation. - * - * These operations do not provide a destination buffer so the runtime - * must manage one. - */ - char *g_ret{nullptr}; - - /** - * @brief Compile-time configuration policy for InfiniBand connections. - * - * The configuration option "USE_DC" can be enabled to create - * Dynamic connection types. By default, Reliable connections are - * created. - */ - ConnectionImpl *connection_policy{nullptr}; -}; - -// clang-format off -NOWARN(-Wunused-parameter, -class NetworkOffImpl { - public: - void dump_backend_stats(ROCStats *globalStats) { } - - void reset_backend_stats() { } - - __host__ void networkHostSetup(GPUIBBackend *B); - - __host__ void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm); - - __host__ void networkHostFinalize(); - - __host__ void networkHostInit(GPUIBContext *ctx, int buffer_id) {} - - __device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id) {} - - __device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe) { - return nullptr; - } - - __device__ __host__ int getNumQueuePairs() { return 0; } - - __device__ __host__ int getNumDest() { return 0; } - - static uint32_t externSharedBytes(int num_pes) { return 0; } - - public: - int num_pes{0}; - - int my_pe{-1}; - - int num_blocks{0}; - - uint32_t *hdp_rkey{nullptr}; - - uintptr_t *hdp_address{nullptr}; - - ibv_mr *hdp_mr{nullptr}; - - QueuePair *gpu_qps{nullptr}; - - uint32_t *heap_rkey{nullptr}; - - ibv_mr *heap_mr{nullptr}; - - uint32_t lkey{0}; - - atomic_ret_t *atomic_ret{nullptr}; - - ibv_mr *mr{nullptr}; - - char *g_ret{nullptr}; - - ConnectionImpl *connection_policy{nullptr}; -}; -) -// clang-format on - -/* - * Select which one of our IPC policies to use at compile time. - */ -#ifdef USE_SINGLE_NODE -typedef NetworkOffImpl NetworkImpl; -#else -typedef NetworkOnImpl NetworkImpl; -#endif - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_ diff --git a/src/gpu_ib/qe_dumper.cpp b/src/gpu_ib/qe_dumper.cpp deleted file mode 100644 index e2360bd482..0000000000 --- a/src/gpu_ib/qe_dumper.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "qe_dumper.hpp" - -namespace rocshmem { - -QeDumper::QeDumper(int dest_pe, int src_wg, int index) - : dest_pe_(dest_pe), src_wg_(src_wg), index_(index) { - void* temp = malloc(sizeof(GPUIBBackend*)); - gpu_backend_ = static_cast(temp); - - GPUIBBackend* device_backend_proxy_address; - CHECK_HIP(hipGetSymbolAddress( - reinterpret_cast(&device_backend_proxy_address), - HIP_SYMBOL(device_backend_proxy))); - - CHECK_HIP(hipMemcpy(&gpu_backend_, device_backend_proxy_address, - sizeof(GPUIBBackend*), hipMemcpyDeviceToHost)); - - int qp_offset = gpu_backend_->num_blocks_ * dest_pe_ + src_wg_; - - qp_ = &(gpu_backend_->networkImpl.gpu_qps[qp_offset]); -} - -QeDumper::~QeDumper() { - /*if (gpu_backend_) { - free(gpu_backend_); - }*/ -} - -void QeDumper::dump_cq() { - type_ = "CQ"; - - auto* raw_cqe = &(qp_->current_cq_q_H[index_]); - raw_u64_ = reinterpret_cast(raw_cqe); - - dump_uint64_(8); -} - -void QeDumper::dump_sq() { - type_ = "SQ"; - - auto* raw_sqe = &(qp_->current_sq_H[index_ * 8]); - raw_u64_ = reinterpret_cast(raw_sqe); - - dump_uint64_(8); -} - -void QeDumper::dump_uint64_(size_t num_elems) const { - printf("%s(%d, %d, %d) *** = ", type_.c_str(), dest_pe_, src_wg_, index_); - - for (size_t i = 0; i < num_elems; i++) { - printf(" %lx ", raw_u64_[i]); - } - - printf("done %s\n", type_.c_str()); -} - -} // namespace rocshmem diff --git a/src/gpu_ib/qe_dumper.hpp b/src/gpu_ib/qe_dumper.hpp deleted file mode 100644 index c613a190c0..0000000000 --- a/src/gpu_ib/qe_dumper.hpp +++ /dev/null @@ -1,66 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_ -#define LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_ - -#include -#include - -#include - -#include "backend_ib.hpp" -#include "queue_pair.hpp" - -namespace rocshmem { - -class QeDumper { - public: - QeDumper(int dest_pe, int src_wg, int index); - - ~QeDumper(); - - void dump_cq(); - - void dump_sq(); - - private: - void dump_uint64_(size_t num_elems) const; - - int dest_pe_{-1}; - - int src_wg_{-1}; - - int index_{-1}; - - GPUIBBackend* gpu_backend_{nullptr}; - - std::string type_{}; - - QueuePair* qp_{nullptr}; - - uint64_t* raw_u64_{nullptr}; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_ diff --git a/src/gpu_ib/queue_pair.cpp b/src/gpu_ib/queue_pair.cpp deleted file mode 100644 index 49357ff66c..0000000000 --- a/src/gpu_ib/queue_pair.cpp +++ /dev/null @@ -1,437 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "queue_pair.hpp" - -#include - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "backend_ib.hpp" -#include "endian.hpp" -#include "segment_builder.hpp" -#include "../util.hpp" - -namespace rocshmem { - -QueuePair::QueuePair(GPUIBBackend *backend) - : hdp_policy(backend->hdp_policy), - connection_policy(*backend->networkImpl.connection_policy) { - hdp_rkey = backend->networkImpl.hdp_rkey; - hdp_address = backend->networkImpl.hdp_address; - - atomic_ret.atomic_lkey = backend->networkImpl.atomic_ret->atomic_lkey; - atomic_ret.atomic_counter = 0; -} - -__device__ QueuePair::~QueuePair() { - uint64_t start = profiler.startTimer(); - - global_qp->sq_counter = sq_counter; - global_qp->local_sq_cnt = local_sq_cnt; - global_qp->cq_consumer_counter = cq_consumer_counter; - global_qp->current_sq = current_sq; - global_qp->current_cq_q = current_cq_q; - global_qp->sq_overflow = sq_overflow; - global_qp->quiet_counter = quiet_counter; - profiler.endTimer(start, FINALIZE); - - global_qp->profiler.accumulateStats(profiler); - - __syncthreads(); -} - -__device__ uint8_t QueuePair::get_cq_error_syndrome(mlx5_cqe64 *cqe_entry) { - mlx5_err_cqe *cqe_err = reinterpret_cast(cqe_entry); - return cqe_err->syndrome; -} - -__device__ void QueuePair::ring_doorbell(uint64_t db_val) { - swap_endian_store(const_cast(dbrec_send), - reinterpret_cast(sq_counter)); - STORE(db.ptr, db_val); - db.uint ^= 256; -} - -__device__ void QueuePair::set_completion_flag_on_wqe(int num_wqes) { - uint64_t *wqe = ¤t_sq[8 * ((sq_counter - num_wqes) % max_nwqe)]; - uint8_t *wqe_ce = reinterpret_cast(wqe) + 11; - *wqe_ce = 8; -} - -template <> -__device__ void QueuePair::update_wqe_ce_single(int num_wqes) { - if (sq_counter % max_nwqe == (max_nwqe - 2)) { - set_completion_flag_on_wqe(num_wqes); - quiet_counter++; - } -} - -template <> -__device__ void QueuePair::update_wqe_ce_single(int num_wqes) { - set_completion_flag_on_wqe(num_wqes); - quiet_counter++; -} - -template <> -__device__ void QueuePair::update_wqe_ce_thread(int num_wqes) {} - -template <> -__device__ void QueuePair::update_wqe_ce_thread(int num_wqes) { - set_completion_flag_on_wqe(num_wqes); - atomicAdd(&quiet_counter, 1); -} - -__device__ void QueuePair::compute_db_val_opcode(uint64_t *db_val, - uint16_t dbrec_val, - uint8_t opcode) { - uint64_t opcode64 = opcode; - opcode64 = opcode64 << 24 & 0x000000FFFF000000; - - uint64_t dbrec = dbrec_val << 8; - dbrec = dbrec & 0x0000000000FFFF00; - - uint64_t val = *db_val; - val = val & 0xFFFFFFFFFF0000FF; - - *db_val = val | dbrec | opcode64; -} - -template -__device__ void QueuePair::quiet_internal() { - /* - * If there are nothing to quiet, just return early. - */ - uint32_t quiet_val = quiet_counter; - if (!quiet_val) { - return; - } - - profiler.incStat(QUIET_COUNT); - uint64_t start = profiler.startTimer(); - - /* - * Generate a pointer to the completion queue entry. - */ - cq_consumer_counter = cq_consumer_counter + quiet_val - 1; - uint32_t indx = (cq_consumer_counter % cq_size); - mlx5_cqe64 *cqe_entry = ¤t_cq_q[indx]; - - /* - * Access the op_own value in the completion queue entry. - */ - int val_ld = uncached_load_ubyte(&(cqe_entry->op_own)); - uint8_t val_op_own = val_ld; - - /* - * If the completion queue entry is not valid, wait for it to become so. - */ - while (!((val_op_own & 0x1) == ((cq_consumer_counter >> cq_log_size) & 1)) || - ((val_op_own) >> 4) == 0xF) { - val_ld = uncached_load_ubyte(&(cqe_entry->op_own)); - val_op_own = val_ld; - } - - /* - * Grab the opcode from the op_own field and report if it is an error. - */ - uint8_t opcode = val_op_own >> 4; - if (opcode != 0) { - uint8_t syndrome = get_cq_error_syndrome(cqe_entry); - mlx5_err_cqe *cqe_err = reinterpret_cast(cqe_entry); - GPU_DPRINTF("QUIET ERROR: signature %d opcode_qpn %llx wqe_cnt %llx \n", - syndrome, cqe_err->s_wqe_opcode_qpn, cqe_err->wqe_counter); - } - - /* - * Decrement the quiet count by the amount determined at the beginning - * of this method. - * - * bpotter - There are two areas of concern in this method for me. - * 1) In multithreaded builds, we may need to make this method a critical - * section to prevent data races on these variables. - * - * 2) Is there a data race in the API if a one remote process calls quiet - * while another process continues adding events? Is it ever possible for - * a quiet to complete, but the quiet_counter decrement here is not set - * to zero? - */ - level L; - L.decQuietCounter(&quiet_counter, quiet_val); - - profiler.endTimer(start, POLL_CQ); - start = profiler.startTimer(); - - /* - * Increment the trailing index counter which tracks our spot in the - * completion queue. - */ - cq_consumer_counter++; - swap_endian_store(const_cast(dbrec_cq), cq_consumer_counter); - - profiler.endTimer(start, NEXT_CQ); -} - -template -__device__ void QueuePair::quiet_single() { - level L; - L.quiet(this); -} - -template -__device__ void QueuePair::quiet_single_heavy(int pe) { - level L; - L.quiet_heavy(this, pe); -} - -template -__device__ void QueuePair::update_posted_wqe_generic( - int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, - int64_t atomic_data, int64_t atomic_cmp, bool ring_db, - uint64_t atomic_ret_pos, bool zero_byte_rd) { - uint64_t start = profiler.startTimer(); - - level L; - L.postLock(this, pe); - uint32_t num_wqes = connection_policy.getNumWqes(opcode); - - // Get the index for my thread's put in the SQ. - uint64_t my_sq_counter = L.threadAtomicAdd(&sq_counter, num_wqes); - uint64_t my_sq_index = my_sq_counter % max_nwqe; - - // 16-bit little endian version of the SQ index needed to build the cntrl - // segment in the WQE. - uint16_t le_sq_counter; - uint16_t sq_counter_u16 = my_sq_counter; - swap_endian_store(&le_sq_counter, sq_counter_u16); - - bool flag = sq_overflow; - uint32_t lkey_in_stack_frame = lkey; - uint32_t rkey_in_stack_frame = rkey; - uint32_t ctrl_qp_sq_in_stack_frame = ctrl_qp_sq; - uint64_t ctrl_sig_in_stack_frame = ctrl_sig; - - connection_policy.setRkey(&rkey_in_stack_frame, pe); - - if (opcode == MLX5_OPCODE_RDMA_WRITE && !size) { - rkey_in_stack_frame = hdp_rkey[pe]; - size = 4; - } - - /* - * Build out all the segments required for my WQE(s) based on the - * operation, starting at my_sq_index into the SQ. SegmentBuilder will - * keep track of placing the segments in the correct location. - */ - SegmentBuilder seg_build(my_sq_index, current_sq); - seg_build.update_cntrl_seg(opcode, le_sq_counter, ctrl_qp_sq_in_stack_frame, - ctrl_sig_in_stack_frame, &connection_policy, - zero_byte_rd); - seg_build.update_connection_seg(pe, &connection_policy); - seg_build.update_rdma_seg(raddr, rkey_in_stack_frame); - - if (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS) { - seg_build.update_atomic_data_seg(atomic_data, atomic_cmp); - size = 8; - lkey_in_stack_frame = atomic_ret.atomic_lkey; - laddr = &atomic_ret.atomic_base_ptr[atomic_ret_pos]; - } - - if (size <= inline_threshold && opcode == MLX5_OPCODE_RDMA_WRITE) { - seg_build.update_inl_data_seg(laddr, size); - } else { - seg_build.update_data_seg(laddr, size, lkey_in_stack_frame); - } - - profiler.incStat(WQE_COUNT); - profiler.endTimer(start, UPDATE_WQE); - start = profiler.startTimer(); - - L.template finishPost(this, ring_db, num_wqes, pe, le_sq_counter, - opcode); - - profiler.incStat(DB_COUNT); - profiler.endTimer(start, RING_SQ_DB); -} - -/****************************************************************************** - ****************************** SHMEM INTERFACE ******************************* - *****************************************************************************/ -template -__device__ void QueuePair::put_nbi(void *dest, const void *source, - size_t nelems, int pe, bool db_ring) { - uintptr_t *src = reinterpret_cast(const_cast(source)); - uintptr_t *dst = reinterpret_cast(dest); - - update_posted_wqe_generic( - pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0); -} - -template -__device__ void QueuePair::put_nbi_cqe(void *dest, const void *source, - size_t nelems, int pe, bool db_ring) { - uintptr_t *src = reinterpret_cast(const_cast(source)); - uintptr_t *dst = reinterpret_cast(dest); - - update_posted_wqe_generic( - pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0); -} - -template -__device__ void QueuePair::get_nbi(void *dest, const void *source, - size_t nelems, int pe, bool db_ring) { - uintptr_t *src = reinterpret_cast(const_cast(source)); - uintptr_t *dst = reinterpret_cast(dest); - - update_posted_wqe_generic( - pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0); -} - -template -__device__ void QueuePair::get_nbi_cqe(void *dest, const void *source, - size_t nelems, int pe, bool db_ring) { - uintptr_t *src = reinterpret_cast(const_cast(source)); - uintptr_t *dst = reinterpret_cast(dest); - - update_posted_wqe_generic( - pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0); -} - -template -__device__ void QueuePair::zero_b_rd(int pe) { - uintptr_t *dst = reinterpret_cast(base_heap[pe]); - - update_posted_wqe_generic(pe, 0, nullptr, dst, - MLX5_OPCODE_RDMA_READ, 0, 0, true, 0, - true); // enable 0_byte read op -} - -__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t value, - int64_t cond, int pe, bool db_ring, - uint8_t atomic_op) { - THREAD TH; - uint64_t pos = TH.threadAtomicAdd( - reinterpret_cast(/* NOLINT(runtime/int) */ - &atomic_ret.atomic_counter)); - - pos = pos % max_nb_atomic; - - int64_t *atomic_base_ptr = - reinterpret_cast(atomic_ret.atomic_base_ptr); - - int64_t *load_address = &atomic_base_ptr[pos]; - - *load_address = -100; - - uintptr_t *dst = reinterpret_cast(dest); - - update_posted_wqe_generic(pe, sizeof(int64_t), nullptr, dst, - atomic_op, value, cond, db_ring, pos); - quiet_single(); - - while (uncached_load(load_address) == -100) { - } - - int64_t ret = *load_address; - - __threadfence(); - - return ret; -} - -__device__ void QueuePair::atomic_nofetch(void *dest, int64_t value, - int64_t cond, int pe, bool db_ring, - uint8_t atomic_op) { - THREAD TH; - uint64_t pos = TH.threadAtomicAdd( - reinterpret_cast(/* NOLINT(runtime/int) */ - &atomic_ret.atomic_counter)); - pos = pos % max_nb_atomic; - uintptr_t *dst = reinterpret_cast(dest); - - update_posted_wqe_generic(pe, sizeof(int64_t), nullptr, dst, - atomic_op, value, cond, db_ring, pos); - - quiet_single(); -} - -__device__ void QueuePair::fence(int pe) { - // TODO(khamidou): should this be replaced by a zero_byte_rd? - // FIXME: the relaxed ordering requires an intervening read to order - // prior operations. - auto remote_hdp_uncast = hdp_address[pe]; - uintptr_t *remote_hdp = reinterpret_cast(remote_hdp_uncast); - update_posted_wqe_generic( - pe, 0, nullptr, remote_hdp, MLX5_OPCODE_RDMA_WRITE, 0, 0, true, 0); -} - -__device__ void QueuePair::waitCQSpace(int num_msgs) { - // We cannot post more outstanding requests than the completion queue - // size. Force a quiet if we are out of space. - if ((quiet_counter + num_msgs) >= cq_size) { - GPU_DPRINTF( - "*** inside post_cq forcing flush: outstanding %d " - "adding %d cq_size %d\n", - quiet_counter, num_msgs, cq_size); - - // TODO(khamidou): More targeted flush would be better here. - quiet_single(); - } -} - -__device__ void QueuePair::waitSQSpace(int num_msgs) { - // We cannot post more outstanding requests than the Send queue - // size. Force a quiet if we are out of space. - local_sq_cnt += num_msgs; - int div = local_sq_cnt / max_nwqe; - - if (div > 0) { - GPU_DPRINTF( - "*** inside waitSQSpace forcing flush to overrun the SQ" - " sq_counter %d adding %d quiet_conter %d \n", - sq_counter, num_msgs, max_nwqe, quiet_counter); - - quiet_single(); - local_sq_cnt = local_sq_cnt % max_nwqe; - } -} - -void QueuePair::setDBval(uint64_t val) { db_val = val; } - -#define THREAD_LEVEL_GEN(T) \ - template __device__ void QueuePair::put_nbi( \ - void *dest, const void *source, size_t nelems, int pe, bool db_ring); \ - template __device__ void QueuePair::put_nbi_cqe( \ - void *dest, const void *source, size_t nelems, int pe, bool db_ring); \ - template __device__ void QueuePair::get_nbi( \ - void *dest, const void *source, size_t nelems, int pe, bool db_ring); \ - template __device__ void QueuePair::get_nbi_cqe( \ - void *dest, const void *source, size_t nelems, int pe, bool db_ring); \ - template __device__ void QueuePair::zero_b_rd(int pe); \ - template __device__ void QueuePair::quiet_single(); \ - template __device__ void QueuePair::quiet_single_heavy(int pe); \ - template __device__ void QueuePair::quiet_internal(); - -THREAD_LEVEL_GEN(THREAD) -THREAD_LEVEL_GEN(WG) -THREAD_LEVEL_GEN(WAVE) - -} // namespace rocshmem diff --git a/src/gpu_ib/queue_pair.hpp b/src/gpu_ib/queue_pair.hpp deleted file mode 100644 index f17c51b3fe..0000000000 --- a/src/gpu_ib/queue_pair.hpp +++ /dev/null @@ -1,431 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_ -#define LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_ - -/** - * @file queue_pair.hpp - * - * @section DESCRIPTION - * An IB QueuePair (SQ and CQ) that the device can use to perform network - * operations. Most important rocSHMEM operations are performed by this - * class. - */ - -#include - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../atomic_return.hpp" -#include "connection_policy.hpp" -#include "thread_policy.hpp" -#include "../hdp_policy.hpp" -#include "../stats.hpp" - -namespace rocshmem { - -class GPUIBBackend; - -enum gpu_ib_stats { - RING_SQ_DB = 0, - UPDATE_WQE, - POLL_CQ, - NEXT_CQ, - QUIET_COUNT, - DB_COUNT, - WQE_COUNT, - MEM_WAIT, - INIT, - FINALIZE, - GPU_IB_NUM_STATS -}; - -typedef union db_reg { - uint64_t *ptr; - uintptr_t uint; -} db_reg_t; - -class QueuePair { - public: - /** - * @brief Constructor. - * - * @param[in] backend Backend needed for member access. - */ - explicit QueuePair(GPUIBBackend *backend); - - /** - * @brief Destructor. - */ - __device__ ~QueuePair(); - - /** - * @brief Inspect completion queue and possibly wait for free space. - * - * @param[in] num_msgs Number of entries needing space in completion queue. - */ - __device__ void waitCQSpace(int num_msgs); - - /** - * @brief Inspect send queue and possibly wait for free space. - * - * @param[in] num_msgs Number of entries needing space in send queue. - */ - __device__ void waitSQSpace(int num_msgs); - - /** - * @brief Create and enqueue a non-blocking put work queue entry (wqe). - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * - * @param[in] dest Destination address for data transmission. - * @param[in] source Source address for data transmission. - * @param[in] nelems Size in bytes of data transmission. - * @param[in] pe Destination processing element of data transmission. - * @param[in] db_ring Denotes whether send queue door bell should be rung. - */ - template - __device__ void put_nbi(void *dest, const void *source, size_t nelems, int pe, - bool db_ring); - - /** - * @brief Create and enqueue a non-blocking put work queue entry (wqe). - * - * @note This variant differs from put_nbi by requesting that a completion - * queue entry is generated in the completion queue. - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * - * @param[in] dest Destination address for data transmission. - * @param[in] source Source address for data transmission. - * @param[in] nelems Size in bytes of data transmission. - * @param[in] pe Destination processing element of data transmission. - * @param[in] db_ring Denotes whether send queue door bell should be rung. - */ - template - __device__ void put_nbi_cqe(void *dest, const void *source, size_t nelems, - int pe, bool db_ring); - - /** - * @brief Consume a completion queue entry from this queue pair's - * completion queue. - * - * @tparam level Implements specific behaviors for thread, warp, block access. - */ - template - __device__ void quiet_single(); - - /** - * @brief Send a zero-byte read to enforce ordering and then consume - * a completion queue entry from this queue pair's completion queue. - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * - * @param[in] pe Processing element id to send the zero_b_rd. - */ - template - __device__ void quiet_single_heavy(int pe); - - /** - * @brief Create and enqueue a HDP flush work queue entry on the remote PE. - * - * @param[in] pe Processing element id to send the HDP flush operation. - * - * TODO(@khamidou): does this require a zero_b_rd to enforce write ordering - * The HDP flush is itself a write. Could this write be reordered with - * respect to other write on the network and arrive out-of-order? - */ - __device__ void fence(int pe); - - /** - * @brief Create and enqueue a non-blocking get work queue entry (wqe). - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * - * @param[in] dest Destination address for data transmission. - * @param[in] source Source address for data transmission. - * @param[in] nelems Size in bytes of data transmission. - * @param[in] pe Destination processing element of data transmission. - * @param[in] db_ring Denotes whether send queue door bell should be rung. - */ - template - __device__ void get_nbi(void *dest, const void *source, size_t nelems, int pe, - bool db_ring); - - /** - * @brief Create and enqueue a non-blocking get work queue entry (wqe). - * - * @note This variant differs from get_nbi by requesting that a completion - * queue entry is generated in the completion queue. - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * - * @param[in] dest Destination address for data transmission. - * @param[in] source Source address for data transmission. - * @param[in] nelems Size in bytes of data transmission. - * @param[in] pe Destination processing element of data transmission. - * @param[in] db_ring Denotes whether send queue door bell should be rung. - */ - template - __device__ void get_nbi_cqe(void *dest, const void *source, size_t nelems, - int pe, bool db_ring); - - /** - * @brief Create and enqueue a zero-byte read to enforce write ordering. - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * - * @param[in] pe Processing element id to send the zero_b_rd. - */ - template - __device__ void zero_b_rd(int pe); - - /** - * @brief Create and enqueue an atomic fetch work queue entry (wqe). - * - * @param[in] dest Destination address for data transmission. - * @param[in] value Data value for the atomic operation. - * @param[in] cond Used in atomic comparisons. - * @param[in] pe Destination processing element of data transmission. - * @param[in] db_ring Denotes whether send queue door bell should be rung. - * @param[in] atomic_op The atomic operation to perform. - * - * @return An atomic value - */ - __device__ int64_t atomic_fetch(void *dest, int64_t value, int64_t cond, - int pe, bool db_ring, uint8_t atomic_op); - - /** - * @brief Create and enqueue an atomic fetch work queue entry (wqe). - * - * @param[in] dest Destination address for data transmission. - * @param[in] value Data value for the atomic operation. - * @param[in] cond Used in atomic comparisons. - * @param[in] pe Destination processing element of data transmission. - * @param[in] db_ring Denotes whether send queue door bell should be rung. - * @param[in] atomic_op The atomic operation to perform. - */ - __device__ void atomic_nofetch(void *dest, int64_t value, int64_t cond, - int pe, bool db_ring, uint8_t atomic_op); - - /** - * @brief Helper method to set the doorbell's value. - * - * @param[in] val Desired value for the doorbell. - */ - void setDBval(uint64_t val); - - protected: - /** - * @brief Helper method to build work requests for the send queue. - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * @tparam cqe Flag to optionally generate cqes. - * - * @param[in] pe Destination processing element of data transmission. - * @param[in] size Size in bytes of data transmission. - * @param[in] laddr Local address. - * @param[in] raddr Remote address. - * @param[in] opcode Operation to be performed. - * @param[in] atomic_data An atomic data value to be used. - * @param[in] atomic_cmp An atomic comparison operation to be performed. - * @param[in] ring_db Boolean denoting if doorbell should be rung. - * @param[in] atomic_ret_pos Index into atomic return structure. - * @param[in] zero_byte_rd Boolean if zero byte read should be used. - */ - template - __device__ __attribute__((noinline)) void update_posted_wqe_generic( - int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, - int64_t atomic_data, int64_t atomic_cmp, bool ring_db, - uint64_t atomic_ret_pos, bool zero_byte_rd = false); - - /** - * @brief Helper method to drain completion queue entries. - * - * @tparam level Implements specific behaviors for thread, warp, block access. - * - */ - template - __device__ __attribute__((noinline)) void quiet_internal(); - - /** - * @brief Helper method to compute doorbell value opcode which is used to - * ring the doorbell. - * - * @param[in,out] db_val - * @param[in] dbrec_val - * @param[in] opcode - */ - __device__ void compute_db_val_opcode(uint64_t *db_val, uint16_t dbrec_val, - uint8_t opcode); - - /** - * @brief Helper method that sets the field in a work queue entry to - * generate a completion entry in the completion queue. - * - * @param num_wqes Number of work entries this completion entry represents. - */ - __device__ void set_completion_flag_on_wqe(int num_wqes); - - /** - * @brief Helper method to update fields for the work queue entry. - * - * @tparam cqe Flag to optionally generate cqes. - * - * @note Single variant is meant to be callable by a block leader. - */ - template - __device__ void update_wqe_ce_single(int num_wqes); - - /** - * @brief Helper method to update fields for the work queue entry. - * - * @tparam cqe Flag to optionally generate cqes. - * - * @note Thread variant is meant to be callable by multiple threads. - */ - template - __device__ void update_wqe_ce_thread(int num_wqes); - - /** - * @brief Helper method to ring the doorbell - * - * @param[in] db_val Doorbell value is written by method. - */ - __device__ void ring_doorbell(uint64_t db_val); - - /** - * @brief Helper method to extract syndrome field from cqe. - * - * @param[in] cq_entry Completion queue entry. - */ - __device__ uint8_t get_cq_error_syndrome(mlx5_cqe64 *cq_entry); - - private: - const int inline_threshold{8}; - - /* TODO(bpotter): Most of these should be private/protected */ - public: -#ifdef PROFILE - typedef Stats GPUIBStats; -#else - typedef NullStats GPUIBStats; -#endif - - /* - * Pointer to the hardware doorbell register for the QP. - */ - db_reg_t db{}; - - /* - * Base pointer of this QP's SQ - * TODO(bpotter): Use the correct struct type for this. - */ - uint64_t *current_sq{nullptr}; - uint64_t *current_sq_H{nullptr}; - - /* - * Base pointer of this QP's CQ - */ - mlx5_cqe64 *current_cq_q{nullptr}; - mlx5_cqe64 *current_cq_q_H{nullptr}; - - /* - * Pointer to the doorbell record for this SQ. - */ - volatile uint32_t *dbrec_send{nullptr}; - - /* - * Pointer to the doorbell record for the CQ. - */ - volatile uint32_t *dbrec_cq{nullptr}; - - uint32_t *hdp_rkey{nullptr}; - - uintptr_t *hdp_address{nullptr}; - - HdpPolicy *hdp_policy{}; - - atomic_ret_t atomic_ret{}; - - ThreadImpl threadImpl{}; - - ConnectionImpl connection_policy; - - char *const *base_heap{nullptr}; - /* - * Current index into the SQ (non-modulo size). - */ - uint32_t sq_counter{0}; - uint32_t local_sq_cnt{0}; - - /* - * Number of outstanding messages on this QP that need to be completed - * during a quiet operation. - */ - uint32_t quiet_counter{0}; - - int num_cqs{0}; - - /* - * Current index into the SQ (non-module size). - */ - uint32_t cq_consumer_counter{0}; - - uint16_t cq_log_size{0}; - - uint16_t cq_size{0}; - - uint32_t ctrl_qp_sq{0}; - - uint64_t ctrl_sig{0}; - - uint32_t rkey{0}; - - uint32_t lkey{0}; - - GPUIBStats profiler{}; - - uint16_t max_nwqe{0}; - - bool sq_overflow{0}; - - uint64_t db_val{}; - /* - * Pointer to the QP in global memory that this QP is copied from. When - * this QP is destroyed, the dynamic (indicies, stats, etc) in the - * global_qp are updated. - */ - QueuePair *global_qp{nullptr}; - - friend SingleThreadImpl; - friend MultiThreadImpl; - friend THREAD; - friend WG; - friend WAVE; - friend RCConnectionImpl; - friend DCConnectionImpl; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_ diff --git a/src/gpu_ib/reliable_connection.cpp b/src/gpu_ib/reliable_connection.cpp deleted file mode 100644 index 7988b6ad00..0000000000 --- a/src/gpu_ib/reliable_connection.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "reliable_connection.hpp" - -#include - -#include "backend_ib.hpp" - -namespace rocshmem { - -ReliableConnection::ReliableConnection(GPUIBBackend* b) : Connection(b, 0) {} - -ReliableConnection::~ReliableConnection() {} - -Connection::InitQPState ReliableConnection::initqp(uint8_t port) { - InitQPState init{}; - - init.exp_qp_attr.qp_access_flags = - IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; - init.exp_qp_attr.port_num = port; - - init.exp_attr_mask |= IBV_QP_ACCESS_FLAGS; - - return init; -} - -Connection::RtrState ReliableConnection::rtr(dest_info_t* dest, uint8_t port) { - RtrState rtr{}; - - rtr.exp_qp_attr.dest_qp_num = dest->qpn; - rtr.exp_qp_attr.rq_psn = dest->psn; - rtr.exp_qp_attr.ah_attr.port_num = port; - if (ib_state->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { - rtr.exp_qp_attr.ah_attr.dlid = dest->lid; - } else { - rtr.exp_qp_attr.ah_attr.is_global = 1; - rtr.exp_qp_attr.ah_attr.grh.dgid = dest->gid; - rtr.exp_qp_attr.ah_attr.grh.sgid_index = 0; - rtr.exp_qp_attr.ah_attr.grh.hop_limit = 1; - } - - rtr.exp_attr_mask |= IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; - - return rtr; -} - -Connection::RtsState ReliableConnection::rts(dest_info_t* dest) { - RtsState rts{}; - - rts.exp_qp_attr.sq_psn = dest->psn; - - rts.exp_attr_mask |= IBV_QP_SQ_PSN; - - return rts; -} - -ibv_qp* ReliableConnection::create_qp_0(ibv_context* context, - ibv_qp_init_attr_ex* qp_attr) { - return ibv_create_qp_ex(context, qp_attr); -} - -void ReliableConnection::create_qps_1() { } - -void ReliableConnection::create_qps_2(int port, int my_rank, - ibv_port_attr* ib_port_att) { } - -void ReliableConnection::create_qps_3(int port, ibv_qp* qp, int offset, - ibv_port_attr* ib_port_att) { - init_qp_status(qp, port); - - all_qp[offset].lid = ib_port_att->lid; - all_qp[offset].qpn = qp->qp_num; - all_qp[offset].psn = 0; - union ibv_gid gid; - ibv_query_gid(ib_state->context, port, 0, &gid); - all_qp[offset].gid = gid; -} - -void ReliableConnection::get_remote_conn(int* remote_conn) { - *remote_conn = backend->num_pes; -} - -void ReliableConnection::allocate_dynamic_members(int num_blocks) { - all_qp.resize(backend->num_pes * num_blocks); -} - -void ReliableConnection::free_dynamic_members() { -} - -void ReliableConnection::initialize_1(int port, int num_blocks) { - MPI_Alltoall(MPI_IN_PLACE, sizeof(dest_info_t) * num_blocks, MPI_CHAR, - all_qp.data(), sizeof(dest_info_t) * num_blocks, MPI_CHAR, - backend->thread_comm); - - for (int i = 0; i < qps.size(); i++) { - change_status_rtr(qps[i], &all_qp[i], port); - } - - MPI_Barrier(backend->thread_comm); - - for (int i = 0; i < qps.size(); i++) { - change_status_rts(qps[i], &all_qp[i]); - } -} - -void ReliableConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle, - ibv_mr* mr) { - CHECK_HIP( - hipHostMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes)); - (*heap_rkey_handle)[backend->my_pe] = mr->rkey; -} - -void ReliableConnection::free_rkey_handle(uint32_t* heap_rkey_handle) { - CHECK_HIP(hipHostFree(heap_rkey_handle)); -} - -Connection::QPInitAttr ReliableConnection::qpattr(ibv_qp_cap cap) { - QPInitAttr qpattr(cap); - qpattr.attr.qp_type = IBV_QPT_RC; - return qpattr; -} - -void ReliableConnection::post_dv_rc_wqe(int remote_conn) { - mlx5_wqe_ctrl_seg* ctrl; - mlx5_wqe_raddr_seg* rdma; - mlx5_wqe_data_seg* data; - - for (int i = 0; i < remote_conn; i++) { - int num_blocks = backend->num_blocks_; - for (int j = 0; j < num_blocks; j++) { - int qp_index = i * num_blocks + j; - uint64_t* ptr = get_address_sq(qp_index); - - const uint16_t nb_post = 1; // 4 * sq_size; - for (uint16_t index = 0; index < nb_post; index++) { - uint8_t op_mod = 0; - uint8_t op_code = 8; - uint32_t qp_num = qps[qp_index]->qp_num; - uint8_t fm_ce_se = 0; - uint8_t ds = 3; - ctrl = reinterpret_cast(ptr); - mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds, - 0, 0); - ptr = ptr + 2; - - rdma = reinterpret_cast(ptr); - const auto& heap_bases = backend->heap.get_heap_bases(); - auto temp = heap_bases[(backend->my_pe + 1) % 2]; - uint64_t r_address = reinterpret_cast(temp); - uint32_t rkey = backend->networkImpl.heap_rkey[i]; - set_rdma_seg(rdma, r_address, rkey); - ptr = ptr + 2; - - data = reinterpret_cast(ptr); - uint32_t lkey = backend->networkImpl.heap_mr->lkey; - temp = heap_bases[backend->my_pe]; - uint64_t address = reinterpret_cast(temp); - mlx5dv_set_data_seg(data, 1, lkey, address); - ptr = ptr + 4; - } - } - } -} - -// TODO(bpotter): remove redundancies with the other derived class -void ReliableConnection::post_wqes() { - int remote_conn; - get_remote_conn(&remote_conn); - post_dv_rc_wqe(remote_conn); -} - -void ReliableConnection::initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, - int dc_key) {} - -int ReliableConnection::get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) { - return pe_idx * num_qps + wg_idx; -} - -} // namespace rocshmem diff --git a/src/gpu_ib/reliable_connection.hpp b/src/gpu_ib/reliable_connection.hpp deleted file mode 100644 index e642ab7990..0000000000 --- a/src/gpu_ib/reliable_connection.hpp +++ /dev/null @@ -1,84 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_ -#define LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_ - -#include - -#include "connection.hpp" - -namespace rocshmem { - -class ReliableConnection : public Connection { - public: - explicit ReliableConnection(GPUIBBackend* backend); - - ~ReliableConnection() override; - - void get_remote_conn(int* remote_conn) override; - - void post_wqes() override; - - void initialize_rkey_handle(uint32_t** heap_rkey_handle, - ibv_mr* mr) override; - - void free_rkey_handle(uint32_t* heap_rkey_handle) override; - - private: - InitQPState initqp(uint8_t port) override; - - RtrState rtr(dest_info_t* dest, uint8_t port) override; - - RtsState rts(dest_info_t* dest) override; - - QPInitAttr qpattr(ibv_qp_cap cap) override; - - void create_qps_1() override; - - void create_qps_2(int port, int my_rank, - ibv_port_attr* ib_port_att) override; - - void create_qps_3(int port, ibv_qp* qp, int offset, - ibv_port_attr* ib_port_att) override; - - ibv_qp* create_qp_0(ibv_context* context, - ibv_qp_init_attr_ex* qp_attr) override; - - void allocate_dynamic_members(int num_wg) override; - - void free_dynamic_members() override; - - void initialize_1(int port, int num_wg) override; - - void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override; - - int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) override; - - std::vector all_qp; - - void post_dv_rc_wqe(int remote_conn); -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_ diff --git a/src/gpu_ib/segment_builder.cpp b/src/gpu_ib/segment_builder.cpp deleted file mode 100644 index 6f2a77a23e..0000000000 --- a/src/gpu_ib/segment_builder.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "segment_builder.hpp" - -#include "../util.hpp" -#include "endian.hpp" - -namespace rocshmem { - -__device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) { - mlx5_segment *base_ptr = static_cast(base); - size_t segment_offset = SEGMENTS_PER_WQE * wqe_idx; - seg_ptr = &base_ptr[segment_offset]; -} - -__device__ void SegmentBuilder::update_cntrl_seg( - uint8_t opcode, uint16_t wqe_idx, uint32_t ctrl_qp_sq, uint64_t ctrl_sig, - ConnectionImpl *connection_policy, bool zero_byte_rd) { - mlx5_wqe_ctrl_seg ctrl_seg; - - ctrl_seg.opmod_idx_opcode = (opcode << 24) | (wqe_idx << 8); - - uint32_t DS = 2; - if (zero_byte_rd == false) { - DS = (opcode == MLX5_OPCODE_RDMA_WRITE || opcode == MLX5_OPCODE_RDMA_READ) - ? 3 - : 4; - } - - DS += connection_policy->wqeCntrlOffset(); - - ctrl_seg.qpn_ds = (DS << 24) | ctrl_qp_sq; - - ctrl_seg.signature = ctrl_sig; - - ctrl_seg.fm_ce_se = ctrl_sig >> 24; - - ctrl_seg.imm = ctrl_sig >> 32; - - memcpy(&seg_ptr->ctrl_seg, &ctrl_seg, sizeof(mlx5_wqe_ctrl_seg)); - - seg_ptr++; -} - -__device__ void SegmentBuilder::update_atomic_data_seg(uint64_t atomic_data, - uint64_t atomic_cmp) { - mlx5_wqe_atomic_seg atomic_seg; - - swap_endian_store(reinterpret_cast(&atomic_seg.swap_add), - atomic_data); - - swap_endian_store(reinterpret_cast(&atomic_seg.compare), - atomic_cmp); - - memcpy(&seg_ptr->atomic_seg, &atomic_seg, sizeof(mlx5_wqe_atomic_seg)); - seg_ptr++; -} - -__device__ void SegmentBuilder::update_rdma_seg(uintptr_t *raddr, - uint32_t rkey) { - mlx5_wqe_raddr_seg raddr_seg; - - raddr_seg.rkey = rkey; - - swap_endian_store(reinterpret_cast(&raddr_seg.raddr), - reinterpret_cast(raddr)); - - memcpy(&seg_ptr->raddr_seg, &raddr_seg, sizeof(mlx5_wqe_raddr_seg)); - seg_ptr++; -} - -__device__ void SegmentBuilder::update_data_seg(uintptr_t *laddr, int32_t size, - uint32_t lkey) { - if (laddr == nullptr) { - return; - } - - mlx5_wqe_data_seg data_seg; - data_seg.lkey = lkey; - - swap_endian_store(&data_seg.byte_count, size & 0x7FFFFFFFU); - swap_endian_store(reinterpret_cast(&data_seg.addr), - reinterpret_cast(laddr)); - - memcpy(&seg_ptr->data_seg, &data_seg, sizeof(mlx5_wqe_data_seg)); - seg_ptr++; -} - -__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr, - int32_t size) { - mlx5_wqe_inl_data_seg inl_data_seg; - - swap_endian_store(&inl_data_seg.byte_count, (size & 0x3FF) | 0x80000000); - - // Assume fence HDP flush - // TODO(khamidou): Rework fence interface to avoid this - size_t field_size{sizeof(mlx5_wqe_inl_data_seg)}; - if (!laddr) { - uint8_t flush_val = 1; - memcpy(&inl_data_seg + 1, &flush_val, sizeof(flush_val)); - field_size += sizeof(flush_val); - } else { - memcpy(&inl_data_seg + 1, laddr, size); - field_size += size; - } - - memcpy(&seg_ptr->inl_data_seg, &inl_data_seg, field_size); - seg_ptr++; -} - -__device__ void SegmentBuilder::update_connection_seg( - int pe, ConnectionImpl *conn_policy) { - if (conn_policy->updateConnectionSegmentImpl(&seg_ptr->base_av, pe)) { - seg_ptr++; - } -} - -} // namespace rocshmem diff --git a/src/gpu_ib/segment_builder.hpp b/src/gpu_ib/segment_builder.hpp deleted file mode 100644 index 866ff2c75b..0000000000 --- a/src/gpu_ib/segment_builder.hpp +++ /dev/null @@ -1,64 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_ -#define LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_ - -#include - -#include "connection_policy.hpp" -#include "infiniband_structs.hpp" -#include "../util.hpp" - -namespace rocshmem { - -class SegmentBuilder { - public: - __device__ SegmentBuilder(uint64_t wqe_idx, void *base); - - __device__ void update_cntrl_seg(uint8_t opcode, uint16_t wqe_idx, - uint32_t ctrl_qp_sq, uint64_t ctrl_sig, - ConnectionImpl *connection_policy, - bool zero_byte_rd); - - __device__ void update_connection_seg(int pe, - ConnectionImpl *connection_policy); - - __device__ void update_atomic_data_seg(uint64_t atomic_data, - uint64_t atomic_cmp); - - __device__ void update_rdma_seg(uintptr_t *raddr, uint32_t rkey); - - __device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size); - - __device__ void update_data_seg(uintptr_t *laddr, int32_t size, - uint32_t lkey); - - private: - const int SEGMENTS_PER_WQE = 4; - - mlx5_segment *seg_ptr; -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_ diff --git a/src/gpu_ib/thread_policy.cpp b/src/gpu_ib/thread_policy.cpp deleted file mode 100644 index 80dbcf5482..0000000000 --- a/src/gpu_ib/thread_policy.cpp +++ /dev/null @@ -1,358 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "thread_policy.hpp" - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "queue_pair.hpp" - -namespace rocshmem { - -__device__ void SingleThreadImpl::quiet(QueuePair *handle) { - handle->quiet_internal(); -} - -__device__ void SingleThreadImpl::quiet_heavy(QueuePair *handle, int pe) { - handle->zero_b_rd(pe); - handle->quiet_internal(); -} - -__device__ void MultiThreadImpl::quiet(QueuePair *handle) { - int thread_id = get_flat_block_id(); - /* - * Each WF selects one thread to perform the quiet. Only one thread - * per WG is allowed to do a quiet at once to avoid races with the CQ. - */ - if (thread_id % WF_SIZE == lowerID()) { - while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) { - } - handle->quiet_internal(); - __threadfence(); - handle->threadImpl.cq_lock = 0; - } -} - -__device__ void MultiThreadImpl::quiet_heavy(QueuePair *handle, int pe) { - int thread_id = get_flat_block_id(); - /* - * Each WF selects one thread to perform the quiet. Only one thread - * per WG is allowed to do a quiet at once to avoid races with the CQ. - */ - if (thread_id % WF_SIZE == lowerID()) { - // zero_byte read - handle->zero_b_rd(pe); - - while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) { - } - handle->quiet_internal(); - __threadfence(); - handle->threadImpl.cq_lock = 0; - } -} - -__device__ void WG::quiet(QueuePair *handle) { handle->quiet_internal(); } - -__device__ void WG::quiet_heavy(QueuePair *handle, int pe) { - handle->zero_b_rd(pe); - handle->quiet_internal(); -} - -__device__ void WAVE::quiet(QueuePair *handle) { - int thread_id = get_flat_block_id(); - /* - * Each WF selects one thread to perform the quiet. Only one thread - * per WG is allowed to do a quiet at once to avoid races with the CQ. - */ - - if (thread_id % WF_SIZE == 0) { - while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) { - } - handle->quiet_internal(); - __threadfence(); - handle->threadImpl.cq_lock = 0; - } -} - -__device__ void WAVE::quiet_heavy(QueuePair *handle, int pe) { - int thread_id = get_flat_block_id(); - /* - * Each WF selects one thread to perform the quiet. Only one thread - * per WG is allowed to do a quiet at once to avoid races with the CQ. - */ - if (thread_id % WF_SIZE == 0) { - // post a zero-byte read - handle->zero_b_rd(pe); - while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) { - } - handle->quiet_internal(); - __threadfence(); - handle->threadImpl.cq_lock = 0; - } -} - -__device__ void SingleThreadImpl::decQuietCounter(uint32_t *quiet_counter, - int num) { - *quiet_counter -= num; -} - -__device__ void MultiThreadImpl::decQuietCounter(uint32_t *quiet_counter, - int num) { - atomicSub(quiet_counter, num); -} - -__device__ void WG::decQuietCounter(uint32_t *quiet_counter, int num) { - *quiet_counter -= num; -} - -__device__ void WAVE::decQuietCounter(uint32_t *quiet_counter, int num) { - *quiet_counter -= num; -} - -template -__device__ void SingleThreadImpl::finishPost(QueuePair *handle, bool ring_db, - int num_wqes, int pe, - uint16_t le_sq_counter, - uint8_t opcode) { - if (ring_db) { - uint64_t db_val = handle->db_val; - handle->compute_db_val_opcode(&db_val, le_sq_counter, opcode); - handle->update_wqe_ce_single(num_wqes); - handle->ring_doorbell(db_val); - } -} - -template -__device__ void MultiThreadImpl::finishPost(QueuePair *handle, bool ring_db, - int num_wqes, int pe, - uint16_t le_sq_counter, - uint8_t opcode) { - /* - * For RC, we can't allow a wave to have different PEs in it, else the - * doorbell ringing logic will not work. This little for loop forces - * control flow divergence based on the PE. It works well for small - * numbers of PEs, but we might want a different solution for large - * numbers. - */ - if (handle->connection_policy.forcePostDivergence()) { - for (int i = 0; i < handle->num_cqs; i++) { - if (i != pe) { - continue; - } - - finishPost_internal(handle, ring_db, num_wqes, pe, le_sq_counter, - opcode); - } - } else { - finishPost_internal(handle, ring_db, num_wqes, pe, le_sq_counter, - opcode); - } -} - -template -__device__ void MultiThreadImpl::finishPost_internal(QueuePair *handle, - bool ring_db, int num_wqes, - int pe, - uint16_t le_sq_counter, - uint8_t opcode) { - /* - * Assuming here that postLock locks out all wavefronts in this WG but - * one, and that this will select a single thread in the wavefront. - */ - __threadfence(); - if (get_flat_block_id() % WF_SIZE == lowerID()) { - if (ring_db) { - uint64_t db_val = - handle->current_sq[8 * ((handle->sq_counter - num_wqes) % - handle->max_nwqe)]; - handle->update_wqe_ce_thread(num_wqes); - handle->ring_doorbell(db_val); - } - - handle->threadImpl.sq_lock = 0; - } -} - -template -__device__ void WG::finishPost(QueuePair *handle, bool ring_db, int num_wqes, - int pe, uint16_t le_sq_counter, uint8_t opcode) { - if (ring_db) { - uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) % - handle->max_nwqe)]; - handle->update_wqe_ce_single(num_wqes); - handle->ring_doorbell(db_val); - } -} - -template -__device__ void WAVE::finishPost(QueuePair *handle, bool ring_db, int num_wqes, - int pe, uint16_t le_sq_counter, - uint8_t opcode) { - if (ring_db) { - uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) % - handle->max_nwqe)]; - handle->update_wqe_ce_thread(num_wqes); - handle->ring_doorbell(db_val); - } - handle->threadImpl.sq_lock = 0; -} - -__device__ void SingleThreadImpl::postLock(QueuePair *handle, int pe) { - handle->hdp_policy->hdp_flush(); - // handle->waitCQSpace(1); - handle->waitSQSpace(1); -} - -__device__ void MultiThreadImpl::postLock_internal(QueuePair *handle) { - int thread_id = get_flat_block_id(); - int active_threads = wave_SZ(); - - if (thread_id % WF_SIZE == lowerID()) { - handle->hdp_policy->hdp_flush(); - /* - * Don't let more than one wave in this WG go any further or a - * horrible variety of impossible to debug race conditions can occur. - */ - while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) { - } - - /* - * This is a tiny bit over-aggressive as it assumes that all of the - * active_threads are going to the same PE when calculating whether - * we are full. - */ - // handle->waitCQSpace(active_threads); - handle->waitSQSpace(active_threads); - } - - /* - * Double check we've got the same exec mask (assuming divergence after - * the previous if. - */ - if (active_threads != wave_SZ()) { - __builtin_trap(); - } -} - -__device__ void MultiThreadImpl::postLock(QueuePair *handle, int pe) { - /* - * For RC, we can't allow a wave to have different PEs in it, else the - * doorbell ringing logic will not work. This little for loop forces - * control flow divergence based on the PE. It works well for small - * numbers of PEs, but we might want a different solution for large - * numbers. - */ - if (handle->connection_policy.forcePostDivergence()) { - for (int i = 0; i < handle->num_cqs; i++) { - if (i != pe) { - continue; - } - - postLock_internal(handle); - } - } else { - postLock_internal(handle); - } -} - -__device__ void WG::postLock(QueuePair *handle, int pe) { - handle->hdp_policy->hdp_flush(); - // handle->waitCQSpace(1); - handle->waitSQSpace(1); -} - -__device__ void WAVE::postLock(QueuePair *handle, int pe) { - handle->hdp_policy->hdp_flush(); - - /* - * Don't let more than one wave in this WG go any further or a horrible - * variety of impossible to debug race conditions can occur. - */ - while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) { - } - - /* - * This is a tiny bit over-aggressive as it assumes that all of the - * active_threads are going to the same PE when calculating whether - * we are full. - */ - // handle->waitCQSpace(1); - handle->waitSQSpace(1); -} - -template -__device__ T SingleThreadImpl::threadAtomicAdd(T *val, T value) { - T old_val = *val; - *val += value; - return old_val; -} - -template -__device__ T MultiThreadImpl::threadAtomicAdd(T *val, T value) { - return atomicAdd(val, value); -} - -template -__device__ T WG::threadAtomicAdd(T *val, T value) { - T old_val = *val; - *val += value; - return old_val; -} - -template -__device__ T WAVE::threadAtomicAdd(T *val, T value) { - return atomicAdd(val, value); -} - -#define TYPE_GEN(T) \ - template __device__ T SingleThreadImpl::threadAtomicAdd(T * val, \ - T value); \ - template __device__ T MultiThreadImpl::threadAtomicAdd(T * val, T value); \ - template __device__ T WG::threadAtomicAdd(T * val, T value); \ - template __device__ T WAVE::threadAtomicAdd(T * val, T value); - -TYPE_GEN(float) -TYPE_GEN(double) -TYPE_GEN(int) -TYPE_GEN(unsigned int) -TYPE_GEN(unsigned long long) // NOLINT(runtime/int) - -#define TYPE_BOOL(T) \ - template __device__ void SingleThreadImpl::finishPost( \ - QueuePair * handle, bool ring_db, int num_wqes, int pe, \ - uint16_t le_sq_counter, uint8_t opcode); \ - template __device__ void MultiThreadImpl::finishPost( \ - QueuePair * handle, bool ring_db, int num_wqes, int pe, \ - uint16_t le_sq_counter, uint8_t opcode); \ - template __device__ void WG::finishPost( \ - QueuePair * handle, bool ring_db, int num_wqes, int pe, \ - uint16_t le_sq_counter, uint8_t opcode); \ - template __device__ void WAVE::finishPost( \ - QueuePair * handle, bool ring_db, int num_wqes, int pe, \ - uint16_t le_sq_counter, uint8_t opcode); \ - template __device__ void MultiThreadImpl::finishPost_internal( \ - QueuePair * handle, bool ring_db, int num_wqes, int pe, \ - uint16_t le_sq_counter, uint8_t opcode); - -TYPE_BOOL(true) -TYPE_BOOL(false) - -} // namespace rocshmem diff --git a/src/gpu_ib/thread_policy.hpp b/src/gpu_ib/thread_policy.hpp deleted file mode 100644 index be79cb350e..0000000000 --- a/src/gpu_ib/thread_policy.hpp +++ /dev/null @@ -1,176 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_ -#define LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_ - -#include "rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../util.hpp" - -namespace rocshmem { - -class QueuePair; - -/* - * GPU single-thread policy class. Only a single work-item per work-group - * is allowed to call into a rocSHMEM function (unless it is specifically - * called out as a collective API. This thread policy is the fastest but - * is not as flexible. - */ -class SingleThreadImpl { - public: - uint32_t cq_lock = 0; - uint32_t sq_lock = 0; - - __device__ void quiet(QueuePair *handle); - - __device__ void quiet_heavy(QueuePair *handle, int pe); - - __device__ void decQuietCounter(uint32_t *quiet_counter, int num); - - template - __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes, - int pe, uint16_t le_sq_counter, uint8_t opcode); - - __device__ void postLock(QueuePair *handle, int pe); - - template - __device__ T threadAtomicAdd(T *val, T value = 1); -}; - -/* - * GPU multi-thread policy class. Multiple work-items per work-group are - * allowed to call into a rocSHMEM function. A bit slower than its - * single-thread counterpart but it enables a much more flexible user-facing - * API. - */ -class MultiThreadImpl { - /* - * Per-wg locks for the CQ and the SQ, respectively. - */ - template - __device__ void finishPost_internal(QueuePair *handle, bool ring_db, - int num_wqes, int pe, - uint16_t le_sq_counter, uint8_t opcode); - - __device__ void postLock_internal(QueuePair *handle); - - public: - uint32_t cq_lock = 0; - uint32_t sq_lock = 0; - - __device__ void quiet(QueuePair *handle); - - __device__ void quiet_heavy(QueuePair *handle, int pe); - - __device__ void decQuietCounter(uint32_t *quiet_counter, int num); - - template - __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes, - int pe, uint16_t le_sq_counter, uint8_t opcode); - - __device__ void postLock(QueuePair *handle, int pe); - - template - __device__ T threadAtomicAdd(T *val, T value = 1); -}; - -/* - * Select which one of our thread policies to use at compile time. - */ -#ifdef USE_THREADS -typedef MultiThreadImpl ThreadImpl; -#else -typedef SingleThreadImpl ThreadImpl; -#endif - -class THREAD { - public: - ThreadImpl threadImpl; - - __device__ void quiet(QueuePair *handle) { threadImpl.quiet(handle); } - - __device__ void quiet_heavy(QueuePair *handle, int pe) { - threadImpl.quiet_heavy(handle, pe); - } - - __device__ void decQuietCounter(uint32_t *quiet_counter, int num) { - threadImpl.decQuietCounter(quiet_counter, num); - } - - template - __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes, - int pe, uint16_t le_sq_counter, uint8_t opcode) { - threadImpl.finishPost(handle, ring_db, num_wqes, pe, le_sq_counter, - opcode); - } - - __device__ void postLock(QueuePair *handle, int pe) { - threadImpl.postLock(handle, pe); - } - - template - __device__ T threadAtomicAdd(T *val, T value = 1) { - T tmp = threadImpl.threadAtomicAdd(val, value); - return tmp; - } -}; - -class WAVE { - public: - __device__ void quiet(QueuePair *handle); - - __device__ void quiet_heavy(QueuePair *handle, int pe); - - __device__ void decQuietCounter(uint32_t *quiet_counter, int num); - - template - __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes, - int pe, uint16_t le_sq_counter, uint8_t opcode); - - __device__ void postLock(QueuePair *handle, int pe); - - template - __device__ T threadAtomicAdd(T *val, T value = 1); -}; - -class WG { - public: - __device__ void quiet(QueuePair *handle); - - __device__ void quiet_heavy(QueuePair *handle, int pe); - - __device__ void decQuietCounter(uint32_t *quiet_counter, int num); - - template - __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes, - int pe, uint16_t le_sq_counter, uint8_t opcode); - - __device__ void postLock(QueuePair *handle, int pe); - - template - __device__ T threadAtomicAdd(T *val, T value = 1); -}; - -} // namespace rocshmem - -#endif // LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_ diff --git a/src/ipc/backend_ipc.cpp b/src/ipc/backend_ipc.cpp index 9cd99f9329..172472b474 100644 --- a/src/ipc/backend_ipc.cpp +++ b/src/ipc/backend_ipc.cpp @@ -35,7 +35,7 @@ namespace rocshmem { extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; -rocshmem_team_t get_external_team(GPUIBTeam *team) { +rocshmem_team_t get_external_team(IPCTeam *team) { return reinterpret_cast(team); } @@ -223,7 +223,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team, * Allocate device-side memory for team_world and * construct a IPC team in it */ - GPUIBTeam *new_team_obj; + IPCTeam *new_team_obj; CHECK_HIP(hipMalloc(&new_team_obj, sizeof(IPCTeam))); new (new_team_obj) IPCTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes, diff --git a/src/ipc/context_ipc_device.hpp b/src/ipc/context_ipc_device.hpp index a2ab824fc5..d2e0f49565 100644 --- a/src/ipc/context_ipc_device.hpp +++ b/src/ipc/context_ipc_device.hpp @@ -298,4 +298,4 @@ class IPCContext : public Context { } // namespace rocshmem -#endif // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_ +#endif // LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_ diff --git a/src/memory/symmetric_heap.hpp b/src/memory/symmetric_heap.hpp index ac36108273..4c6a39396c 100644 --- a/src/memory/symmetric_heap.hpp +++ b/src/memory/symmetric_heap.hpp @@ -35,7 +35,7 @@ * both host access and device access to the memory space. * * The symmetric heaps are visible to network by registering them as - * InfiniBand memory regions. Every memory region has a remote key + * memory regions. Every memory region has a remote key * which needs to be shared across the network (to access the memory * region). */ diff --git a/src/reverse_offload/context_ro_tmpl_host.hpp b/src/reverse_offload/context_ro_tmpl_host.hpp index fb9a319d18..e989f2c5a9 100644 --- a/src/reverse_offload/context_ro_tmpl_host.hpp +++ b/src/reverse_offload/context_ro_tmpl_host.hpp @@ -29,14 +29,14 @@ namespace rocshmem { template __host__ void ROHostContext::p(T *dest, T value, int pe) { - DPRINTF("Function: gpu_ib_host_p\n"); + DPRINTF("Function: ro_host_p\n"); host_interface->p(dest, value, pe, context_window_info); } template __host__ T ROHostContext::g(const T *source, int pe) { - DPRINTF("Function: gpu_ib_host_g\n"); + DPRINTF("Function: ro_host_g\n"); return host_interface->g(source, pe, context_window_info); } @@ -44,7 +44,7 @@ __host__ T ROHostContext::g(const T *source, int pe) { template __host__ void ROHostContext::put(T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Function: gpu_ib_host_put\n"); + DPRINTF("Function: ro_host_put\n"); host_interface->put(dest, source, nelems, pe, context_window_info); } @@ -52,7 +52,7 @@ __host__ void ROHostContext::put(T *dest, const T *source, size_t nelems, template __host__ void ROHostContext::get(T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Function: gpu_ib_host_get\n"); + DPRINTF("Function: ro_host_get\n"); host_interface->get(dest, source, nelems, pe, context_window_info); } @@ -60,7 +60,7 @@ __host__ void ROHostContext::get(T *dest, const T *source, size_t nelems, template __host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Function: gpu_ib_host_put_nbi\n"); + DPRINTF("Function: ro_host_put_nbi\n"); host_interface->put_nbi(dest, source, nelems, pe, context_window_info); } @@ -68,7 +68,7 @@ __host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems, template __host__ void ROHostContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) { - DPRINTF("Function: gpu_ib_host_get_nbi\n"); + DPRINTF("Function: ro_host_get_nbi\n"); host_interface->get_nbi(dest, source, nelems, pe, context_window_info); } @@ -107,7 +107,7 @@ __host__ void ROHostContext::broadcast(T *dest, const T *source, int nelems, int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync) { - DPRINTF("Function: gpu_ib_host_broadcast\n"); + DPRINTF("Function: ro_host_broadcast\n"); host_interface->broadcast(dest, source, nelems, pe_root, pe_start, log_pe_stride, pe_size, p_sync); diff --git a/src/rocshmem.cpp b/src/rocshmem.cpp index e9106c8fb4..931e4c83a0 100644 --- a/src/rocshmem.cpp +++ b/src/rocshmem.cpp @@ -36,10 +36,7 @@ #include "backend_bc.hpp" #include "context_incl.hpp" -#ifdef USE_GPU_IB -#include "gpu_ib/backend_ib.hpp" -#include "gpu_ib/context_ib_tmpl_host.hpp" -#elif defined(USE_RO) +#ifdef USE_RO #include "reverse_offload/backend_ro.hpp" #include "reverse_offload/context_ro_tmpl_host.hpp" #else @@ -86,10 +83,7 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; rocshmem_env_config_init(); -#ifdef USE_GPU_IB - CHECK_HIP(hipHostMalloc(&backend, sizeof(GPUIBBackend))); - backend = new (backend) GPUIBBackend(comm); -#elif defined(USE_RO) +#ifdef USE_RO CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend))); backend = new (backend) ROBackend(comm); #else diff --git a/src/rocshmem_gpu.cpp b/src/rocshmem_gpu.cpp index 5517da4b47..fc78f0931e 100644 --- a/src/rocshmem_gpu.cpp +++ b/src/rocshmem_gpu.cpp @@ -49,9 +49,7 @@ #include "templates.hpp" #include "util.hpp" -#ifdef USE_GPU_IB -#include "gpu_ib/context_ib_tmpl_device.hpp" -#elif defined(USE_RO) +#ifdef USE_RO #include "reverse_offload/context_ro_tmpl_device.hpp" #else #include "ipc/context_ipc_tmpl_device.hpp" diff --git a/src/team.cpp b/src/team.cpp index b162df67e4..2980337794 100644 --- a/src/team.cpp +++ b/src/team.cpp @@ -36,10 +36,6 @@ __host__ __device__ Team* get_internal_team(rocshmem_team_t team) { return reinterpret_cast(team); } -GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team) { - return reinterpret_cast(team); -} - ROTeam* get_internal_ro_team(rocshmem_team_t team) { return reinterpret_cast(team); } diff --git a/src/team.hpp b/src/team.hpp index 249c1b123b..77aa63750d 100644 --- a/src/team.hpp +++ b/src/team.hpp @@ -33,7 +33,6 @@ namespace rocshmem { class Backend; class Team; class ROTeam; -class GPUIBTeam; class IPCTeam; class TeamInfo { @@ -154,13 +153,11 @@ class Team { * * @note This is required to do some reinterpret_casts. */ - BackendType type{BackendType::GPU_IB_BACKEND}; + BackendType type{BackendType::RO_BACKEND}; }; __host__ __device__ Team* get_internal_team(rocshmem_team_t team); -GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team); - ROTeam* get_internal_ro_team(rocshmem_team_t team); IPCTeam* get_internal_ipc_team(rocshmem_team_t team);