diff --git a/CMakeLists.txt b/CMakeLists.txt index 43a0c293cf..47c9d3f620 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,8 +41,9 @@ endif() ############################################################################### option(DEBUG "Enable debug trace" OFF) option(PROFILE "Enable statistics and timing support" OFF) -option(USE_RO "Enable RO conduit." ON) +option(USE_RO "Enable RO conduit" ON) option(USE_IPC "Enable IPC support (using HIP)" OFF) +option(USE_GDA "Enable GDA conduit" OFF) option(USE_THREADS "Enable workgroup threads to share network queues" OFF) option(USE_WF_COAL "Enable wavefront message coalescing" OFF) option(USE_HEAP_DEVICE_FINEGRAIN "Heap uses GPU memory in finegrain mode" ON) @@ -68,6 +69,8 @@ option(BUILD_TOOLS "Build binary tools (e.g., rocshmem_info)" ON) option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF) option(BUILD_CODE_COVERAGE "Build with code coverage flags (gcc only)" OFF) +option(GDA_IONIC "Build for AMD Pensando IONIC RDMA provider" OFF) +option(GDA_BNXT "Build for Broadcom" OFF) ############################################################################### # PROJECT @@ -162,7 +165,6 @@ if (NOT BUILD_TESTS_ONLY) target_compile_options( ${PROJECT_NAME} PUBLIC - ${offload_flags} -fgpu-rdc ) @@ -172,6 +174,7 @@ if (NOT BUILD_TESTS_ONLY) $ $ # rocshmem_config.h $ # rocshmem_config.h from rocshmem.hpp + $ $ ) diff --git a/README.md b/README.md index 0754d4a6df..62c9e46f47 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ code complexity and enables more fine-grained communication/computation overlap than traditional host-driven networking. rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories. -There are currently two backends for rocSHMEM; -IPC and Reverse Offload (RO). +There are currently three backends for rocSHMEM; +IPC, Reverse Offload (RO), and GPU-IB. The backends primarily differ in their implementations of intra-kernel networking. 
The IPC backend implements communication primitives using load/store operations issued from the GPU. diff --git a/cmake/FindIBVerbs.cmake b/cmake/FindIBVerbs.cmake new file mode 100644 index 0000000000..6c4d631262 --- /dev/null +++ b/cmake/FindIBVerbs.cmake @@ -0,0 +1,83 @@ +############################################################################### +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+###############################################################################
+
+find_package(PkgConfig QUIET)
+if (PkgConfig_FOUND)
+if (IBVerbs_ROOT )
+  # IBVerbs_DIR is not used here: CMake reserves <Pkg>_DIR for locating a package config file (config mode) only
+  set(ENV{PKG_CONFIG_PATH} "${IBVerbs_ROOT}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
+endif()
+pkg_check_modules(PC_IBVerbs QUIET libibverbs)
+endif()
+
+find_path(IBVerbs_INCLUDE_DIR infiniband/verbs.h
+  HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS}
+  PATH_SUFFIXES include
+)
+
+find_library(IBVerbs_LIBRARY
+  NAMES ibverbs libibverbs
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+
+if (GDA_IONIC)  # provider library: ionic, bnxt_re, or mlx5 when neither option is set
+find_library(IBVerbs_PROVIDER_LIBRARY
+  NAMES ionic libionic
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+elseif (GDA_BNXT)
+find_library(IBVerbs_PROVIDER_LIBRARY
+  NAMES bnxt_re libbnxt_re
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+else()
+find_library(IBVerbs_PROVIDER_LIBRARY
+  NAMES mlx5 libmlx5
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+endif()
+
+include(FindPackageHandleStandardArgs)  # defines find_package_handle_standard_args(); do not rely on another module having included it
+find_package_handle_standard_args(IBVerbs DEFAULT_MSG
+  IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY
+)
+mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY)
+
+if (IBVerbs_FOUND AND NOT TARGET IBVerbs::verbs)  # guard: a repeated find_package(IBVerbs) must not re-create the imported targets
+add_library(IBVerbs::verbs UNKNOWN IMPORTED)
+set_target_properties(IBVerbs::verbs PROPERTIES
+  IMPORTED_LOCATION "${IBVerbs_LIBRARY}"
+  INTERFACE_COMPILE_OPTIONS "${PC_IBVerbs_CFLAGS_OTHER}"
+  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}"
+)
+add_library(IBVerbs::verbs_provider UNKNOWN IMPORTED)
+set_target_properties(IBVerbs::verbs_provider PROPERTIES
+  IMPORTED_LOCATION "${IBVerbs_PROVIDER_LIBRARY}"
+)
+target_link_libraries(IBVerbs::verbs INTERFACE IBVerbs::verbs_provider)
+endif()
diff --git
a/cmake/rocshmem_config.h.in b/cmake/rocshmem_config.h.in index 36c5aeae24..644a87a69e 100644 --- a/cmake/rocshmem_config.h.in +++ b/cmake/rocshmem_config.h.in @@ -26,6 +26,7 @@ #cmakedefine PROFILE #cmakedefine USE_RO #cmakedefine USE_IPC +#cmakedefine USE_GDA #cmakedefine USE_THREADS #cmakedefine USE_SHARED_CTX #cmakedefine USE_WF_COAL @@ -41,3 +42,5 @@ #cmakedefine USE_SINGLE_NODE #cmakedefine USE_HDP_FLUSH #cmakedefine USE_HDP_FLUSH_HOST_SIDE +#cmakedefine GDA_IONIC +#cmakedefine GDA_BNXT diff --git a/cmake/setup_project.cmake b/cmake/setup_project.cmake index 5658aab4b9..df3a98376c 100644 --- a/cmake/setup_project.cmake +++ b/cmake/setup_project.cmake @@ -75,3 +75,4 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb") +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) diff --git a/scripts/build_configs/gda b/scripts/build_configs/gda new file mode 100755 index 0000000000..c339b7155e --- /dev/null +++ b/scripts/build_configs/gda @@ -0,0 +1,49 @@ +#!/bin/bash +############################################################################### +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+src_path=$(dirname "$(realpath "$0")")/../../
+
+cmake \
+    -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \
+    -DCMAKE_VERBOSE_MAKEFILE=OFF \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DBUILD_FUNCTIONAL_TESTS=ON \
+    -DBUILD_UNIT_TESTS=ON \
+    -DDEBUG=OFF \
+    -DPROFILE=OFF \
+    -DUSE_GDA=ON \
+    -DUSE_RO=OFF \
+    -DUSE_IPC=OFF \
+    -DUSE_THREADS=OFF \
+    -DUSE_WF_COAL=OFF \
+    -DUSE_HDP_FLUSH=OFF \
+    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
+    "$@" "$src_path"  # "$@" keeps each extra cmake argument intact (unquoted $* re-splits values containing spaces); later -D options override the defaults above
+cmake --build . --parallel 8
+cmake --install .
diff --git a/scripts/build_configs/gda_bnxt b/scripts/build_configs/gda_bnxt
new file mode 100755
index 0000000000..77f3f29391
--- /dev/null
+++ b/scripts/build_configs/gda_bnxt
@@ -0,0 +1,30 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+script_path=$(dirname "$(realpath "$0")")
+
+source "$script_path/gda" -DGDA_BNXT=ON "$@"  # run the gda config with the BNXT provider selected; "$@" forwards caller arguments without re-splitting them
diff --git a/scripts/build_configs/gda_ionic b/scripts/build_configs/gda_ionic
new file mode 100755
index 0000000000..cd91bcc6a1
--- /dev/null
+++ b/scripts/build_configs/gda_ionic
@@ -0,0 +1,30 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+script_path=$(dirname "$(realpath "$0")")
+
+source "$script_path/gda" -DGDA_IONIC=ON "$@"  # run the gda config with the IONIC provider selected; "$@" forwards caller arguments without re-splitting them
diff --git a/scripts/build_configs/gda_mlx5 b/scripts/build_configs/gda_mlx5
new file mode 100755
index 0000000000..9337aed9d1
--- /dev/null
+++ b/scripts/build_configs/gda_mlx5
@@ -0,0 +1,30 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+script_path=$(dirname "$(realpath "$0")")
+
+exec "$script_path/gda" "$@"  # no provider option is passed, so the gda defaults apply; "$@" forwards caller arguments without re-splitting them
diff --git a/scripts/build_configs/ipc_single b/scripts/build_configs/ipc_single
index 83a82d4756..5432710399 100755
--- a/scripts/build_configs/ipc_single
+++ b/scripts/build_configs/ipc_single
@@ -1,3 +1,4 @@
+#!/bin/bash
 ###############################################################################
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 #
@@ -22,37 +23,28 @@
 # IN THE SOFTWARE.
############################################################################### -#!/bin/bash - set -e -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - src_path=$(dirname "$(realpath $0)")/../../ cmake \ -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DBUILD_FUNCTIONAL_TESTS=ON \ + -DBUILD_UNIT_TESTS=ON \ -DDEBUG=OFF \ -DPROFILE=OFF \ + -DUSE_GDA=OFF \ -DUSE_RO=OFF \ -DUSE_IPC=ON \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ - -DUSE_SINGLE_NODE=ON \ -DUSE_HDP_FLUSH=OFF \ -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ - -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \ - -DBUILD_FUNCTIONAL_TESTS=ON \ - -DBUILD_UNIT_TESTS=ON \ - $src_path + -DUSE_SINGLE_NODE=ON \ + $* $src_path cmake --build . --parallel 8 cmake --install . diff --git a/scripts/build_configs/ipc_tests_only b/scripts/build_configs/ipc_tests_only index 41fde20882..f219ba6165 100755 --- a/scripts/build_configs/ipc_tests_only +++ b/scripts/build_configs/ipc_tests_only @@ -1,3 +1,4 @@ +#!/bin/bash ############################################################################### # Copyright (c) Advanced Micro Devices, Inc. All rights reserved. # @@ -22,27 +23,22 @@ # IN THE SOFTWARE. 
############################################################################### -#!/bin/bash - set -e -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - src_path=$(dirname "$(realpath $0)")/../../ -# If as specific rocSHMEM version is required, the recommended approach -# is to set environment variable 'rocshmem_ROOT' cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ + -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DBUILD_FUNCTIONAL_TESTS=ON \ + -DBUILD_EXAMPLES=ON \ + -DBUILD_UNIT_TESTS=OFF \ -DDEBUG=OFF \ -DPROFILE=OFF \ + -DUSE_GDA=OFF \ -DUSE_RO=OFF \ -DUSE_IPC=ON \ -DUSE_THREADS=OFF \ @@ -50,10 +46,6 @@ cmake \ -DUSE_SINGLE_NODE=ON \ -DUSE_HDP_FLUSH=OFF \ -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ - -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \ -DBUILD_TESTS_ONLY=ON \ - -DBUILD_FUNCTIONAL_TESTS=ON \ - -DBUILD_EXAMPLES=ON \ - -DBUILD_UNIT_TESTS=OFF \ - $src_path + $* $src_path cmake --build . --parallel 8 diff --git a/scripts/build_configs/ro_ipc b/scripts/build_configs/ro_ipc index b39438e665..09e158577e 100755 --- a/scripts/build_configs/ro_ipc +++ b/scripts/build_configs/ro_ipc @@ -1,3 +1,4 @@ +#!/bin/bash ############################################################################### # Copyright (c) Advanced Micro Devices, Inc. All rights reserved. # @@ -22,35 +23,27 @@ # IN THE SOFTWARE. 
############################################################################### -#!/bin/bash - set -e -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - src_path=$(dirname "$(realpath $0)")/../../ cmake \ -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DBUILD_FUNCTIONAL_TESTS=ON \ + -DBUILD_UNIT_TESTS=ON \ -DDEBUG=OFF \ -DPROFILE=OFF \ + -DUSE_GDA=OFF \ + -DUSE_RO=ON \ -DUSE_IPC=ON \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ -DUSE_HDP_FLUSH=OFF \ -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ - -DUSE_RO=ON \ - -DBUILD_FUNCTIONAL_TESTS=ON \ - -DBUILD_UNIT_TESTS=ON \ - $src_path + $* $src_path cmake --build . --parallel 8 cmake --install . diff --git a/scripts/build_configs/ro_net b/scripts/build_configs/ro_net index abdcffcdd4..7757f4d58f 100755 --- a/scripts/build_configs/ro_net +++ b/scripts/build_configs/ro_net @@ -1,3 +1,4 @@ +#!/bin/bash ############################################################################### # Copyright (c) Advanced Micro Devices, Inc. All rights reserved. # @@ -22,35 +23,27 @@ # IN THE SOFTWARE. 
############################################################################### -#!/bin/bash - set -e -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - src_path=$(dirname "$(realpath $0)")/../../ cmake \ -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_path \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \ -DCMAKE_VERBOSE_MAKEFILE=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DBUILD_FUNCTIONAL_TESTS=ON \ + -DBUILD_UNIT_TESTS=ON \ -DDEBUG=OFF \ -DPROFILE=OFF \ + -DUSE_GDA=OFF \ + -DUSE_RO=ON \ -DUSE_IPC=OFF \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ -DUSE_HDP_FLUSH=OFF \ -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ - -DUSE_RO=ON \ - -DBUILD_FUNCTIONAL_TESTS=ON \ - -DBUILD_UNIT_TESTS=ON \ - $src_path + $* $src_path cmake --build . --parallel 8 cmake --install . diff --git a/scripts/build_configs/ro_net_debug b/scripts/build_configs/ro_net_debug index a7c42ba234..e77c8a6f89 100755 --- a/scripts/build_configs/ro_net_debug +++ b/scripts/build_configs/ro_net_debug @@ -1,3 +1,4 @@ +#!/bin/bash ############################################################################### # Copyright (c) Advanced Micro Devices, Inc. All rights reserved. # @@ -22,33 +23,8 @@ # IN THE SOFTWARE. ############################################################################### -#!/bin/bash - set -e -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi +script_path=$(dirname "$(realpath $0)") -src_path=$(dirname "$(realpath $0)")/../../ - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=$install_path \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DDEBUG=OFF \ - -DPROFILE=OFF \ - -DUSE_IPC=OFF \ - -DUSE_THREADS=OFF \ - -DUSE_WF_COAL=OFF \ - -DUSE_HDP_FLUSH=OFF \ - -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ - -DUSE_RO=ON \ - -DBUILD_FUNCTIONAL_TESTS=ON \ - -DBUILD_UNIT_TESTS=ON \ - $src_path -cmake --build . --parallel 8 -cmake --install . 
+BUILD_TYPE=Debug source $script_path/ro_net $* diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh index 3c4902134f..269092ef72 100755 --- a/scripts/functional_tests/driver.sh +++ b/scripts/functional_tests/driver.sh @@ -195,6 +195,7 @@ TestRMAPut() { ExecTest "waveput" 2 2 128 1048576 ExecTest "waveput" 2 16 128 8 + ExecTest "defaultctxput" 2 4 128 1024 ExecTest "teamctxput" 2 4 128 1024 ExecTest "teamctxput" 2 16 256 1024 @@ -226,6 +227,7 @@ TestRMAPut() { ExecTest "waveputnbi" 2 2 128 1048576 ExecTest "waveputnbi" 2 16 128 8 + ExecTest "defaultctxputnbi" 2 4 128 1024 ExecTest "teamctxputnbi" 2 4 128 1024 ExecTest "teamctxputnbi" 2 16 256 1024 } @@ -250,6 +252,7 @@ TestRMAGet() { ExecTest "waveget" 2 2 128 1048576 ExecTest "waveget" 2 16 128 8 + ExecTest "defaultctxget" 2 4 128 1024 ExecTest "teamctxget" 2 4 128 1024 ExecTest "teamctxget" 2 16 256 1024 @@ -276,6 +279,7 @@ TestRMAGet() { ExecTest "wavegetnbi" 2 2 128 1048576 ExecTest "wavegetnbi" 2 16 128 8 + ExecTest "defaultctxgetnbi" 2 4 128 1024 ExecTest "teamctxgetnbi" 2 4 128 1024 ExecTest "teamctxgetnbi" 2 16 256 1024 } @@ -434,6 +438,186 @@ TestOther() { unset ROCSHMEM_MAX_NUM_CONTEXTS } +# TODO: remove when GDA is feature complete +TestGDA() { + ############################################################################## + # | Name | Ranks | Workgroups | Threads | Max Message Size # + ############################################################################## + ExecTest "put" 2 1 1 1048576 + ExecTest "put" 2 1 1024 512 + ExecTest "put" 2 8 1 1048576 + ExecTest "put" 2 16 128 8 + ExecTest "put" 2 32 256 512 + ExecTest "put" 2 64 1024 8 + +# ExecTest "wgput" 2 1 64 1048576 +# ExecTest "wgput" 2 2 64 1048576 +# ExecTest "wgput" 2 16 64 8 + + ExecTest "waveput" 2 1 64 1048576 + ExecTest "waveput" 2 2 64 1048576 + ExecTest "waveput" 2 2 128 1048576 + ExecTest "waveput" 2 16 128 8 + + ExecTest "defaultctxput" 2 4 128 1024 + ExecTest "teamctxput" 2 4 128 1024 + ExecTest 
"teamctxput" 2 16 256 1024 + +# ExecTest "get" 2 1 1 1048576 +# ExecTest "get" 2 1 1024 512 +# ExecTest "get" 2 8 1 1048576 +# ExecTest "get" 2 16 128 8 +# ExecTest "get" 2 32 256 512 +# ExecTest "get" 2 64 1024 8 + +# ExecTest "wgget" 2 1 64 1048576 +# ExecTest "wgget" 2 2 64 1048576 +# ExecTest "wgget" 2 16 64 8 + +# ExecTest "waveget" 2 1 64 1048576 +# ExecTest "waveget" 2 2 64 1048576 +# ExecTest "waveget" 2 2 128 1048576 +# ExecTest "waveget" 2 16 128 8 + +# ExecTest "defaultctxget" 2 4 128 1024 +# ExecTest "teamctxget" 2 4 128 1024 +# ExecTest "teamctxget" 2 16 256 1024 + +# ExecTest "g" 2 1 1 128 +# ExecTest "g" 2 1 1024 2 +# ExecTest "g" 2 8 1 32 +# ExecTest "g" 2 16 128 4 + +#Implemented but known incorrect +# ExecTest "p" 2 1 1 128 +# ExecTest "p" 2 1 1024 2 +# ExecTest "p" 2 8 1 32 +# ExecTest "p" 2 16 128 4 + + ################################ Non-Blocking ################################ + + ExecTest "putnbi" 2 1 1 1048576 + ExecTest "putnbi" 2 1 1024 512 + ExecTest "putnbi" 2 8 1 1048576 + ExecTest "putnbi" 2 16 128 8 + ExecTest "putnbi" 2 32 256 512 + ExecTest "putnbi" 2 64 1024 8 + +# ExecTest "wgputnbi" 2 1 64 1048576 +# ExecTest "wgputnbi" 2 2 64 1048576 +# ExecTest "wgputnbi" 2 16 64 8 + + ExecTest "waveputnbi" 2 1 64 1048576 + ExecTest "waveputnbi" 2 2 64 1048576 + ExecTest "waveputnbi" 2 2 128 1048576 + ExecTest "waveputnbi" 2 16 128 8 + + ExecTest "defaultctxputnbi" 2 4 128 1024 + ExecTest "teamctxputnbi" 2 4 128 1024 + ExecTest "teamctxputnbi" 2 16 256 1024 + +# ExecTest "getnbi" 2 1 1 1048576 +# ExecTest "getnbi" 2 1 1024 512 +# ExecTest "getnbi" 2 8 1 1048576 +# ExecTest "getnbi" 2 16 128 8 +# ExecTest "getnbi" 2 32 256 512 +# ExecTest "getnbi" 2 64 1024 8 + +# ExecTest "wggetnbi" 2 1 64 1048576 +# ExecTest "wggetnbi" 2 2 64 1048576 +# ExecTest "wggetnbi" 2 16 64 8 + +# ExecTest "wavegetnbi" 2 1 64 1048576 +# ExecTest "wavegetnbi" 2 2 64 1048576 +# ExecTest "wavegetnbi" 2 2 128 1048576 +# ExecTest "wavegetnbi" 2 16 128 8 + +# ExecTest 
"defaultctxgetnbi" 2 4 128 1024 +# ExecTest "teamctxgetnbi" 2 4 128 1024 +# ExecTest "teamctxgetnbi" 2 16 256 1024 + +#TestAMO() { + ############################################################################## + # | Name | Ranks | Workgroups | Threads | Max Message Size # + ############################################################################## +# ExecTest "amo_fetch" 2 1 1 +# ExecTest "amo_fetch" 2 1 1024 +# ExecTest "amo_fetch" 2 8 1 +# ExecTest "amo_fetch" 2 32 128 + +# ExecTest "amo_set" 2 1 1 +# ExecTest "amo_set" 2 8 1 +# ExecTest "amo_set" 2 32 1 + +# ExecTest "amo_fcswap" 2 1 1 +# ExecTest "amo_fcswap" 2 32 1 +# ExecTest "amo_fcswap" 2 8 1 + +#Works on CX7, not implemented on BNXT +# ExecTest "amo_finc" 2 1 1 +# ExecTest "amo_finc" 2 1 1024 +# ExecTest "amo_finc" 2 8 1 +# ExecTest "amo_finc" 2 32 128 + +#This works but tester requires get +# ExecTest "amo_inc" 2 1 1 +# ExecTest "amo_inc" 2 1 1024 +# ExecTest "amo_inc" 2 8 1 +# ExecTest "amo_inc" 2 32 128 + +#Works on CX7, not implemented on BNXT +# ExecTest "amo_fadd" 2 1 1 +# ExecTest "amo_fadd" 2 1 1024 +# ExecTest "amo_fadd" 2 8 1 +# ExecTest "amo_fadd" 2 32 128 + +#This works but tester requires get +# ExecTest "amo_add" 2 1 1 +# ExecTest "amo_add" 2 1 1024 +# ExecTest "amo_add" 2 8 1 +# ExecTest "amo_add" 2 32 128 + +# ExecTest "amo_fetchand" 2 1 1 + +# ExecTest "amo_and" 2 1 1 + +# ExecTest "amo_xor" 2 1 1 + +#TestColl() { + ############################################################################## + # | Name | Ranks | Workgroups | Threads | Max Message Size # + ############################################################################## + ExecTest "barrierall" 2 1 1 + ExecTest "teambarrier" 2 1 1 + + ExecTest "sync" 2 1 1 + ExecTest "syncall" 2 1 1 + +# ExecTest "alltoall" 2 1 1 512 + +# ExecTest "teambroadcast" 2 1 1 32768 + +# ExecTest "fcollect" 2 1 1 512 +# ExecTest "fcollect" 2 1 1 32768 + +# ExecTest "teamreduction" 2 1 1 32768 + +#TestOther() { + 
############################################################################## + # | Name | Ranks | Workgroups | Threads | Max Message Size # + ############################################################################## + ExecTest "init" 2 1 1 + +# ExecTest "pingpong" 2 1 1 +# ExecTest "pingpong" 2 8 1 +# ExecTest "pingpong" 2 32 1 + + # This test requires more contexts than workgroups + export ROCSHMEM_MAX_NUM_CONTEXTS=1024 + ExecTest "teamctxinfra" 2 1 1 + unset ROCSHMEM_MAX_NUM_CONTEXTS +} + ValidateInput() { INPUT_COUNT=$1 if [ $INPUT_COUNT -lt 3 ] ; then @@ -467,6 +651,9 @@ ValidateInput $# ValidateLogDir $LOG_DIR case $TEST in + *"gda") + TestGDA + ;; *"all") TestRMA TestAMO diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 732eece2c9..c8542da45f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -63,13 +63,15 @@ target_compile_options(${PROJECT_NAME} PUBLIC ${ROCSHMEM_COMPILE_FLAGS}) ############################################################################### # ROCSHMEM TARGET FOR BACKENDS ############################################################################### -IF (USE_RO) +if (USE_RO) add_subdirectory(reverse_offload) -ELSE() +elseif (USE_IPC) add_subdirectory(ipc) -ENDIF() -add_subdirectory(containers) +elseif (USE_GDA) +add_subdirectory(gda) +endif() add_subdirectory(host) +add_subdirectory(containers) add_subdirectory(memory) add_subdirectory(sync) add_subdirectory(bootstrap) diff --git a/src/backend_bc.cpp b/src/backend_bc.cpp index 1469d9a774..c974c11428 100644 --- a/src/backend_bc.cpp +++ b/src/backend_bc.cpp @@ -27,10 +27,12 @@ #include "backend_type.hpp" #include "context_incl.hpp" -#ifdef USE_RO +#if defined(USE_RO) #include "reverse_offload/backend_ro.hpp" -#else +#elif defined(USE_IPC) #include "ipc/backend_ipc.hpp" +#elif defined(USE_GDA) +#include "gda/backend_gda.hpp" #endif #include @@ -247,18 +249,22 @@ void Backend::reset_stats() { } __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) { 
-#ifdef USE_RO +#if defined(USE_RO) return static_cast(this)->create_ctx(option, ctx); -#else +#elif defined(USE_IPC) return static_cast(this)->create_ctx(option, ctx); +#elif defined(USE_GDA) + return static_cast(this)->create_ctx(option, ctx); #endif } __device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) { -#ifdef USE_RO +#if defined(USE_RO) static_cast(this)->destroy_ctx(ctx); -#else +#elif defined(USE_IPC) static_cast(this)->destroy_ctx(ctx); +#elif defined(USE_GDA) + static_cast(this)->destroy_ctx(ctx); #endif } diff --git a/src/backend_type.hpp b/src/backend_type.hpp index 98268c7422..ef9c7d3874 100644 --- a/src/backend_type.hpp +++ b/src/backend_type.hpp @@ -46,7 +46,7 @@ namespace rocshmem { * @note Derived classes which use Backend as a base class must add * themselves to this enum class to support static polymorphism. */ -enum class BackendType { RO_BACKEND, IPC_BACKEND }; +enum class BackendType { RO_BACKEND, IPC_BACKEND, GDA_BACKEND }; /** * @brief Helper macro for some dispatch calls @@ -56,40 +56,52 @@ enum class BackendType { RO_BACKEND, IPC_BACKEND }; /** * @brief Device static dispatch method call. */ -#ifdef USE_RO +#if defined(USE_RO) #define DISPATCH(Func) \ static_cast(this)->Func; -#else +#elif defined(USE_IPC) #define DISPATCH(Func) \ static_cast(this)->Func; +#elif defined(USE_GDA) +#define DISPATCH(Func) \ + static_cast(this)->Func; #endif /** * @brief Device static dispatch method call with a return value. 
*/ -#ifdef USE_RO +#if defined(USE_RO) #define DISPATCH_RET(Func) \ auto ret_val = static_cast(this)->Func; \ return ret_val; -#else -#define DISPATCH_RET(Func) \ - auto ret_val{0}; \ - ret_val = static_cast(this)->Func; \ +#elif defined(USE_IPC) +#define DISPATCH_RET(Func) \ + auto ret_val = static_cast(this)->Func; \ + return ret_val; +#elif defined(USE_GDA) +#define DISPATCH_RET(Func) \ + auto ret_val = static_cast(this)->Func; \ return ret_val; #endif + /** * @brief Device static dispatch method call with a return type of pointer. */ -#ifdef USE_RO +#if defined(USE_RO) #define DISPATCH_RET_PTR(Func) \ void *ret_val{nullptr}; \ ret_val = static_cast(this)->Func; \ return ret_val; -#else +#elif defined(USE_IPC) #define DISPATCH_RET_PTR(Func) \ void *ret_val{nullptr}; \ ret_val = static_cast(this)->Func; \ return ret_val; +#elif defined(USE_GDA) +#define DISPATCH_RET_PTR(Func) \ + void *ret_val{nullptr}; \ + ret_val = static_cast(this)->Func; \ + return ret_val; #endif /** @@ -99,11 +111,14 @@ enum class BackendType { RO_BACKEND, IPC_BACKEND }; * MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and * threading semantics of collectives in OpenSHMEM match those of MPI. */ -#ifdef USE_RO +#if defined(USE_RO) #define HOST_DISPATCH(Func) static_cast(this)->Func; -#else +#elif defined(USE_IPC) #define HOST_DISPATCH(Func) static_cast(this)->Func; +#elif defined(USE_GDA) +#define HOST_DISPATCH(Func) static_cast(this)->Func; #endif + /** * @brief Host static dispatch method call with return value. * @@ -111,31 +126,38 @@ enum class BackendType { RO_BACKEND, IPC_BACKEND }; * MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and * threading semantics of collectives in OpenSHMEM match those of MPI. 
*/ - -#ifdef USE_RO +#if defined(USE_RO) #define HOST_DISPATCH_RET(Func) \ auto ret_val = static_cast(this)->Func; \ return ret_val; -#else -#define HOST_DISPATCH_RET(Func) \ - auto ret_val{0}; \ - ret_val = static_cast(this)->Func; \ +#elif defined(USE_IPC) +#define HOST_DISPATCH_RET(Func) \ + auto ret_val = static_cast(this)->Func; \ + return ret_val; +#elif defined(USE_GDA) +#define HOST_DISPATCH_RET(Func) \ + auto ret_val = static_cast(this)->Func; \ return ret_val; #endif /** * @brief Host static dispatch method call with a return type of pointer. */ -#ifdef USE_RO +#if defined(USE_RO) #define HOST_DISPATCH_RET_PTR(Func) \ void *ret_val{nullptr}; \ ret_val = static_cast(this)->Func; \ return ret_val; -#else +#elif defined(USE_IPC) #define HOST_DISPATCH_RET_PTR(Func) \ void *ret_val{nullptr}; \ ret_val = static_cast(this)->Func; \ return ret_val; +#elif defined(USE_GDA) +#define HOST_DISPATCH_RET_PTR(Func) \ + void *ret_val{nullptr}; \ + ret_val = static_cast(this)->Func; \ + return ret_val; #endif } // namespace rocshmem diff --git a/src/bootstrap/bootstrap.cpp b/src/bootstrap/bootstrap.cpp index 54311a462a..ff107695a4 100644 --- a/src/bootstrap/bootstrap.cpp +++ b/src/bootstrap/bootstrap.cpp @@ -32,7 +32,7 @@ #include "bootstrap.hpp" #include "utils.hpp" -#include "../util.hpp" +#include "util.hpp" #include "socket.hpp" namespace rocshmem { diff --git a/src/bootstrap/socket.cpp b/src/bootstrap/socket.cpp index 5b1b57dfda..1760c20096 100644 --- a/src/bootstrap/socket.cpp +++ b/src/bootstrap/socket.cpp @@ -36,7 +36,7 @@ #include "socket.hpp" #include "utils.hpp" -#include "../util.hpp" +#include "util.hpp" namespace rocshmem { diff --git a/src/bootstrap/utils.cpp b/src/bootstrap/utils.cpp index 829599134a..bf68ded083 100644 --- a/src/bootstrap/utils.cpp +++ b/src/bootstrap/utils.cpp @@ -34,7 +34,7 @@ #include #include "utils.hpp" -#include "../util.hpp" +#include "util.hpp" constexpr char HOSTID_FILE[32] = "/proc/sys/kernel/random/boot_id"; diff --git 
a/src/containers/array_impl.hpp b/src/containers/array_impl.hpp index 6697ace54a..551cf5bf91 100644 --- a/src/containers/array_impl.hpp +++ b/src/containers/array_impl.hpp @@ -26,7 +26,7 @@ #define LIBRARY_SRC_CONTAINERS_ARRAY_IMPL_HPP_ #include "array.hpp" -#include "../constants.hpp" +#include "constants.hpp" #include #include diff --git a/src/containers/atomic_wf_queue.hpp b/src/containers/atomic_wf_queue.hpp index 02e0f1d82c..bac0dbb030 100644 --- a/src/containers/atomic_wf_queue.hpp +++ b/src/containers/atomic_wf_queue.hpp @@ -27,9 +27,9 @@ #include -#include "../memory/hip_allocator.hpp" -#include "../sync/abql_block_mutex.hpp" -#include "../src/util.hpp" +#include "memory/hip_allocator.hpp" +#include "sync/abql_block_mutex.hpp" +#include "util.hpp" namespace rocshmem { diff --git a/src/containers/free_list.hpp b/src/containers/free_list.hpp index 9c238b4687..2bc640b338 100644 --- a/src/containers/free_list.hpp +++ b/src/containers/free_list.hpp @@ -27,8 +27,8 @@ #include -#include "../memory/hip_allocator.hpp" -#include "../sync/abql_block_mutex.hpp" +#include "memory/hip_allocator.hpp" +#include "sync/abql_block_mutex.hpp" namespace rocshmem { diff --git a/src/containers/share_strategy.cpp b/src/containers/share_strategy.cpp index 1e6ee557b7..e40c4b6256 100644 --- a/src/containers/share_strategy.cpp +++ b/src/containers/share_strategy.cpp @@ -23,7 +23,7 @@ *****************************************************************************/ #include "share_strategy.hpp" -#include "../constants.hpp" +#include "constants.hpp" #include diff --git a/src/context_incl.hpp b/src/context_incl.hpp index b95bbf94f3..5f8106ebed 100644 --- a/src/context_incl.hpp +++ b/src/context_incl.hpp @@ -28,12 +28,17 @@ #include "context.hpp" #include "context_tmpl_device.hpp" #include "context_tmpl_host.hpp" -#ifdef USE_RO +#if defined(USE_RO) #include "reverse_offload/context_ro_device.hpp" #include "reverse_offload/context_ro_host.hpp" -#else +#elif defined(USE_IPC) #include 
"ipc/context_ipc_device.hpp" #include "ipc/context_ipc_host.hpp" +#elif defined(USE_GDA) +#include "gda/context_gda_device.hpp" +#include "gda/context_gda_host.hpp" +#else +#error "Select one backend among USE_RO, USE_IPC, USE_GDA" #endif #endif // LIBRARY_SRC_CONTEXT_INCL_HPP_ diff --git a/src/context_tmpl_device.hpp b/src/context_tmpl_device.hpp index 0fedf3ffd0..3c0b5802ac 100644 --- a/src/context_tmpl_device.hpp +++ b/src/context_tmpl_device.hpp @@ -27,10 +27,12 @@ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_type.hpp" -#ifdef USE_RO +#if defined(USE_RO) #include "reverse_offload/context_ro_device.hpp" -#else +#elif defined(USE_IPC) #include "ipc/context_ipc_device.hpp" +#elif defined(USE_GDA) +#include "gda/context_gda_device.hpp" #endif namespace rocshmem { diff --git a/src/context_tmpl_host.hpp b/src/context_tmpl_host.hpp index 68a572bdaf..53236c540c 100644 --- a/src/context_tmpl_host.hpp +++ b/src/context_tmpl_host.hpp @@ -27,11 +27,14 @@ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "backend_type.hpp" -#ifdef USE_RO +#if defined(USE_RO) #include "reverse_offload/context_ro_host.hpp" -#else +#elif defined(USE_IPC) #include "ipc/context_ipc_host.hpp" +#elif defined(USE_GDA) +#include "gda/context_gda_host.hpp" #endif + namespace rocshmem { template diff --git a/src/gda/CMakeLists.txt b/src/gda/CMakeLists.txt new file mode 100644 index 0000000000..056f5f2423 --- /dev/null +++ b/src/gda/CMakeLists.txt @@ -0,0 +1,55 @@ +############################################################################### +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+############################################################################### + +############################################################################### +# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY +############################################################################### +target_sources( + ${PROJECT_NAME} + PRIVATE + context_gda_device.cpp + context_gda_device_coll.cpp + context_gda_host.cpp + backend_gda.cpp + gda_team.cpp + queue_pair.cpp + endian.cpp + topology.cpp +) + +find_package(IBVerbs REQUIRED) + +target_link_libraries( + ${PROJECT_NAME} + PUBLIC + IBVerbs::verbs + numa +) + +if (GDA_IONIC) +elseif (GDA_BNXT) + add_subdirectory(bnxt) +else() + target_sources(${PROJECT_NAME} PRIVATE segment_builder.cpp) +endif() diff --git a/src/gda/backend_gda.cpp b/src/gda/backend_gda.cpp new file mode 100644 index 0000000000..a7c555a527 --- /dev/null +++ b/src/gda/backend_gda.cpp @@ -0,0 +1,1238 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include + +#include "backend_gda.hpp" +#include "gda_team.hpp" +#include "util.hpp" +#include "topology.hpp" + +#include +#include +#include + +namespace rocshmem { + +#define NET_CHECK(cmd) { \ + if (cmd != MPI_SUCCESS) { \ + fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \ + abort(); \ + } \ + } + +extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; + +rocshmem_team_t get_external_team(GDATeam *team) { + return reinterpret_cast(team); +} + +int get_ls_non_zero_bit(char *bitmask, int mask_length) { + int position{-1}; + for (int bit_i = 0; bit_i < mask_length; bit_i++) { + int byte_i = bit_i / CHAR_BIT; + if (bitmask[byte_i] & (1 << (bit_i % CHAR_BIT))) { + position = bit_i; + break; + } + } + + return position; +} + +GDABackend::GDABackend(MPI_Comm comm): Backend(comm) { + init(); +} + +GDABackend::GDABackend(TcpBootstrap *bootstrap): Backend(bootstrap) { + init(); +} + +void GDABackend::init() { + type = BackendType::GDA_BACKEND; + read_env(); + + //TODO setup_host_interface(); + /* Initialize the host interface */ + if (MPI_COMM_NULL != backend_comm) + host_interface = std::make_shared(hdp_proxy_.get(), //TODO: need an hdp proxy? + backend_comm, + &heap); + else + host_interface = std::make_shared(hdp_proxy_.get(), //TODO: need an hdp proxy? 
+ backend_bootstr, + &heap); + + setup_wrk_sync_buffer(); + setup_fence_buffer(); + setup_collectives(); + + setup_teams(); + setup_team_world(); + rte_barrier(); + + setup_ibv(); + setup_heap_memory_rkey(); + setup_gpu_qps(); + + setup_ctxs(); + rte_barrier(); +} + +GDABackend::~GDABackend() { + cleanup_ctxs(); + + cleanup_teams(); + auto *team_world{team_tracker.get_team_world()}; + team_world->~Team(); + CHECK_HIP(hipFree(team_world)); + + cleanup_wrk_sync_buffer(); + + cleanup_gpu_qps(); + cleanup_heap_memory_rkey(); + cleanup_ibv(); +} + +void GDABackend::read_env() { + if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) { + std::stringstream sstream(maximum_num_contexts_str); + sstream >> maximum_num_contexts_; + } + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) { + requested_dev = strdup(value); + } else { + int gpu_dev = 0; + CHECK_HIP(hipGetDevice(&gpu_dev)); + int nic_dev = rocshmem::GetClosestNicToGpu(gpu_dev, &requested_dev); + assert (nic_dev != -1); + } + if ((value = getenv("ROCSHMEM_SQ_SIZE"))) { + sq_size = atoi(value); + } +} + + +void GDABackend::setup_host_ctx() { + default_host_ctx = std::make_unique(this, 0); + ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get(); +} + +void GDABackend::setup_default_ctx() { + TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world; + default_context_proxy_ = GDADefaultContextProxyT(this, tinfo); +} + +void GDABackend::setup_ctxs() { + setup_host_ctx(); + setup_default_ctx(); + + CHECK_HIP(hipMalloc(&ctx_array, sizeof(GDAContext) * maximum_num_contexts_)); + // 0th context is default context + for (size_t i = 0; i < maximum_num_contexts_; i++) { + new (&ctx_array[i]) GDAContext(this, i + 1); + ctx_free_list.get()->push_back(ctx_array + i); + } +} + +void GDABackend::cleanup_ctxs() { + ctx_free_list.~FreeListProxy(); + for (size_t i = 0; i < maximum_num_contexts_; i++) { + ctx_array[i].~GDAContext(); + } + + CHECK_HIP(hipFree(ctx_array)); +} + 
+__device__ bool GDABackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) { + GDAContext *ctx_{nullptr}; + + auto pop_result = ctx_free_list.get()->pop_front(); + if (!pop_result.success) { + return false; + } + ctx_ = pop_result.value; + + ctx->ctx_opaque = ctx_; + + ctx_->tinfo = reinterpret_cast(ctx->team_opaque); + return true; +} + +__device__ void GDABackend::destroy_ctx(rocshmem_ctx_t *ctx) { + ctx_free_list.get()->push_back(static_cast(ctx->ctx_opaque)); +} + +void GDABackend::setup_team_world() { + TeamInfo *team_info_wrt_parent, *team_info_wrt_world; + + /** + * Allocate device-side memory for team_world and construct a + * GDA team in it. + */ + CHECK_HIP(hipMalloc(&team_info_wrt_parent, sizeof(TeamInfo))); + CHECK_HIP(hipMalloc(&team_info_wrt_world, sizeof(TeamInfo))); + + new (team_info_wrt_parent) TeamInfo(nullptr, 0, 1, num_pes); + new (team_info_wrt_world) TeamInfo(nullptr, 0, 1, num_pes); + + GDATeam *team_world{nullptr}; + CHECK_HIP(hipMalloc(&team_world, sizeof(GDATeam))); + new (team_world) GDATeam(this, team_info_wrt_parent, team_info_wrt_world, + num_pes, my_pe, backend_comm, 0); + team_tracker.set_team_world(team_world); + + /** + * Copy the address to ROCSHMEM_TEAM_WORLD. 
+ */ + ROCSHMEM_TEAM_WORLD = reinterpret_cast(team_world); +} + +void GDABackend::team_destroy(rocshmem_team_t team) { + GDATeam *team_obj = get_internal_gda_team(team); + + /* Mark the pool as available */ + int bit = team_obj->pool_index_; + int byte_i = bit / CHAR_BIT; + team_pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT); + + team_obj->~GDATeam(); + CHECK_HIP(hipFree(team_obj)); +} + +//TODO: factorize somewhere else maybe backend_bc +void GDABackend::Alltoall_char_inplace (char *inoutbuf, size_t num_bytes, rocshmem_team_t team) { + // Implement an Alltoall outside of MPI assuming in_place communication + GDATeam *team_obj = reinterpret_cast(team); + int num_pes = team_obj->num_pes; + int my_pe = team_obj->my_pe; + int *pes_in_world = new int[num_pes]; + + int my_pe_in_world = team_obj->my_pe_in_world; + for (int i = 0; i < num_pes; i++) { + pes_in_world[i] = team_obj->get_pe_in_world(i); + } + + // Since this is an in-place algorithm, allocate the temporary receive buffer first + char *recv_buf = new char[num_bytes * num_pes]; + std::memset(recv_buf, 0, num_pes * num_bytes); + + // Perform pairwise exchange - local copy is ommitted + for (int step = 1; step < num_pes; step++) { + int sendto_team = (my_pe + step) % num_pes; + int recvfrom_team = (my_pe + num_pes - step) % num_pes; + + char *tmpsend = (char*)inoutbuf + (ptrdiff_t)sendto_team * num_bytes; + char *tmprecv = (char*)recv_buf + (ptrdiff_t)recvfrom_team * num_bytes; + + // similarly to the allGather in the bootstrap code, we do send first + // followed by the receive. + // There is a chance for deadlock in my opinion for large messages. 
+ backend_bootstr->send(tmpsend, num_bytes, pes_in_world[sendto_team], step /* used as tag */); + backend_bootstr->recv(tmprecv, num_bytes, pes_in_world[recvfrom_team], step); + } + //Since this is an in_place all-to-all, copy data back into the user buffer + for (int step = 0; step < num_pes; step++) { + if (step == my_pe) continue; + std::memcpy(&inoutbuf[step*num_bytes], &recv_buf[step*num_bytes], num_bytes); + } + + delete[] recv_buf; + delete[] pes_in_world; +} + +//TODO: factorize somewhere else, maybe backend_bc? +void GDABackend::Allreduce_char_BAND (char* inbuf, char *outbuf, size_t num_bytes, + Team *team) { + + // Implement an Allreduce outside of MPI. This is specialized for the scenario + // required for the team creation, i.e. assuming bytes and using BAND operation. + // Implementation uses an Allgather operation followed a local reduction. + + GDATeam *team_obj = reinterpret_cast(team); + int num_pes = team_obj->num_pes; + int my_pe = team_obj->my_pe; + + char *tmp_buffer = new char[num_pes * num_bytes]; + std::memset(tmp_buffer, 0, num_pes * num_bytes); + std::memcpy (&tmp_buffer[my_pe * num_bytes], inbuf, num_bytes); + + if (num_pes == backend_bootstr->getNranks() ) { + backend_bootstr->allGather(tmp_buffer, num_bytes); + } else { + printf("GDABackend::create_new_team: non-mpi version only supports parent_teams that contain all processes. Aborting.\n"); + abort(); + } + + for (int i = 0; i < num_bytes; i++) { + outbuf[i] = tmp_buffer[i]; + for (int j = 1; j < num_pes; j++) { + outbuf[i] &= tmp_buffer[j * num_bytes + i]; + } + } + + delete[] tmp_buffer; +} + +void GDABackend::create_new_team([[maybe_unused]] Team *parent_team, + TeamInfo *team_info_wrt_parent, + TeamInfo *team_info_wrt_world, int num_pes, + int my_pe_in_new_team, MPI_Comm team_comm, + rocshmem_team_t *new_team) { + /** + * Read the bit mask and find out a common index into + * the pool of available work arrays. 
+ */ + if (team_comm != MPI_COMM_NULL) { + NET_CHECK(MPI_Allreduce(team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_, + MPI_CHAR, MPI_BAND, team_comm)); + } else { + Allreduce_char_BAND (team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_, parent_team); + } + + /* Pick the least significant non-zero bit (logical layout) in the reduced + * bitmask */ + auto max_num_teams{team_tracker.get_max_num_teams()}; + int common_index = get_ls_non_zero_bit(team_reduced_bitmask_, max_num_teams); + if (common_index < 0) { + /* No team available */ + printf("Could not create team, all bits in use. Aborting.\n"); + abort(); + } + + /* Mark the team as taken (by unsetting the bit in the pool bitmask) */ + int byte = common_index / CHAR_BIT; + team_pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT)); + + /** + * Allocate device-side memory for team_world and + * construct a GDA team in it + */ + GDATeam *new_team_obj; + CHECK_HIP(hipMalloc(&new_team_obj, sizeof(GDATeam))); + new (new_team_obj) + GDATeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes, + my_pe_in_new_team, team_comm, common_index); + + *new_team = get_external_team(new_team_obj); +} + +void GDABackend::ctx_create(int64_t options, void **ctx) { + GDAHostContext *new_ctx{nullptr}; + new_ctx = new GDAHostContext(this, options); + *ctx = new_ctx; +} + +GDAHostContext *get_internal_gda_net_ctx(Context *ctx) { + return reinterpret_cast(ctx); +} + +void GDABackend::ctx_destroy(Context *ctx) { + GDAHostContext *gda_host_ctx{get_internal_gda_net_ctx(ctx)}; + delete gda_host_ctx; +} + +void GDABackend::reset_backend_stats() { + assert(false); +} + +void GDABackend::dump_backend_stats() { + assert(false); +} + +__host__ void GDABackend::global_exit(int status) { + if (backend_comm != MPI_COMM_NULL) + MPI_Abort(backend_comm, status); + else + abort(); +} + +void GDABackend::cleanup_teams() { + free(team_pool_bitmask_); + free(team_reduced_bitmask_); +} + +void 
GDABackend::setup_wrk_sync_buffer() { + /** + * compute work/sync buffer size + */ + auto max_num_teams{team_tracker.get_max_num_teams()}; + + /** + * size of barrier sync + */ + wrk_sync_pool_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE; + + /** + * Size of sync arrays for the teams + */ + wrk_sync_pool_size_ += sizeof(long) * max_num_teams * + (ROCSHMEM_BARRIER_SYNC_SIZE + + ROCSHMEM_REDUCE_SYNC_SIZE + + ROCSHMEM_BCAST_SYNC_SIZE + + ROCSHMEM_ALLTOALL_SYNC_SIZE); + + /** + * Size of work arrays for the teams + * Accommodate largest possible data type for pWrk + */ + wrk_sync_pool_size_ += sizeof(double) * max_num_teams * + (ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE + + ROCSHMEM_ATA_MAX_WRKDATA_SIZE); + + /** + * Size of fence array + */ + wrk_sync_pool_size_ += sizeof(int) * num_pes; //TODO: do we need a fence array? + + /** + * Allocate a buffer of size wrk_sync_pool_size_, using heap memory + * (should be uncached fine-grained ideally) + */ + heap.malloc((void**)&wrk_sync_pool_, wrk_sync_pool_size_); + assert(wrk_sync_pool_); + wrk_sync_pool_top_ = wrk_sync_pool_; +} + +void GDABackend::cleanup_wrk_sync_buffer() { + heap.free(wrk_sync_pool_); +} + +void GDABackend::setup_fence_buffer() { //TODO is this used? + /* + * Reserve memory for fence + */ + fence_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(int) * num_pes; +} + +void GDABackend::setup_collectives() { + /* + * Allocate heap space for barrier_sync + */ + size_t one_sync_size_bytes {sizeof(*barrier_sync)}; + size_t sync_size_bytes {one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE}; + + barrier_sync = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sync_size_bytes; + + /* + * Initialize the barrier synchronization array with default values. + */ + for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) { + barrier_sync[i] = ROCSHMEM_SYNC_VALUE; + } + + /* + * Make sure that all processing elements have done this before + * continuing. 
+ */ + rte_barrier(); +} + +void GDABackend::setup_teams() { + /** + * Allocate pools for the teams sync and work arrary from the SHEAP. + */ + auto max_num_teams{team_tracker.get_max_num_teams()}; + + barrier_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE + * max_num_teams; + + reduce_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE + * max_num_teams; + + bcast_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE + * max_num_teams; + + alltoall_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE + * max_num_teams; + + /* Accommodating for largest possible data type for pWrk */ + pWrk_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE + * max_num_teams; + + + pAta_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE + * max_num_teams; + + /** + * Initialize the sync arrays in the pool with default values. 
+ */ + long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync; + for (int team_i = 0; team_i < max_num_teams; team_i++) { + barrier_pSync = reinterpret_cast( + &barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]); + reduce_pSync = reinterpret_cast( + &reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]); + bcast_pSync = reinterpret_cast( + &bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]); + alltoall_pSync = reinterpret_cast( + &alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]); + + for (size_t i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) { + barrier_pSync[i] = ROCSHMEM_SYNC_VALUE; + } + for (size_t i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) { + reduce_pSync[i] = ROCSHMEM_SYNC_VALUE; + } + for (size_t i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) { + bcast_pSync[i] = ROCSHMEM_SYNC_VALUE; + } + for (size_t i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) { + alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE; + } + } + + /** + * Initialize bit mask + * + * Logical: + * MSB..........................................................................LSB + * Physical: MSB...1st least significant 8 bits...LSB MSB...2nd least + * signifant 8 bits...LSB + * + * Description shows only a 2-byte long mask but idea extends to any + * arbitrary size. + */ + team_bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1) + : (max_num_teams / CHAR_BIT); + team_pool_bitmask_ = reinterpret_cast(malloc(team_bitmask_size_)); + team_reduced_bitmask_ = reinterpret_cast(malloc(team_bitmask_size_)); + + memset(team_pool_bitmask_, 0, team_bitmask_size_); + memset(team_reduced_bitmask_, 0, team_bitmask_size_); + /* Set all to available except the 0th one (reserved for TEAM_WORLD) */ + for (int bit_i = 1; bit_i < max_num_teams; bit_i++) { + int byte_i = bit_i / CHAR_BIT; + team_pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT); + } + + /** + * Make sure that all processing elements have done this before + * continuing. 
+ */ + rte_barrier(); +} + +void GDABackend::rte_barrier() { + if (backend_comm != MPI_COMM_NULL) { + NET_CHECK(MPI_Barrier(backend_comm)); + } else { + backend_bootstr->barrier(); + } +} + +static void dump_ibv_context(struct ibv_context *x); +static void dump_ibv_device(struct ibv_device *x); +static void dump_ibv_pd(struct ibv_pd *x); +static void dump_ibv_port_attr(struct ibv_port_attr *x); +static void dump_ibv_qp(struct ibv_qp *qp, int conn_num); +static void dump_mlx5dv_qp(struct mlx5dv_qp *qp_dv, int conn_num); +static void dump_mlx5dv_cq(struct mlx5dv_cq *cq_dv, int conn_num); + +void GDABackend::setup_ibv() { + dest_info.resize(num_pes * (maximum_num_contexts_ + 1)); + int ib_devices{0}; + dev_list = ibv_get_device_list(&ib_devices); + CHECK_NNULL(dev_list, "ibv_get_device"); + struct ibv_device* ib_dev = dev_list[0]; //TODO default to HIP selected device? + if (requested_dev) { + for (int i = 0; i < ib_devices; i++) { + const char* select_dev{ibv_get_device_name(dev_list[i])}; + CHECK_NNULL(select_dev, "ibv_get_device_name"); + if (strstr(select_dev, requested_dev)) { + ib_dev = dev_list[i]; + break; + } + } + } + uint8_t port{1}; + ib_init(ib_dev, port); + create_qps(port, &ib_state->portinfo); + + auto npes = num_pes; + auto dinfo = dest_info.data(); + for (int i = 0; i < maximum_num_contexts_ + 1; i++) { + if (backend_comm != MPI_COMM_NULL) { + MPI_Alltoall(MPI_IN_PLACE, sizeof(dest_info_t), MPI_CHAR, dinfo + i * npes, sizeof(dest_info_t), MPI_CHAR, backend_comm); + } else { + Alltoall_char_inplace(reinterpret_cast(dinfo + i * npes), sizeof(dest_info_t), ROCSHMEM_TEAM_WORLD); + } + } + + for (int i = 0; i < qps.size(); i++) { + change_status_rtr(qps[i], &dest_info[i], port); + } + rte_barrier(); + for (int i = 0; i < qps.size(); i++) { + change_status_rts(qps[i], &dest_info[i]); + dump_ibv_qp(qps[i], i); + } + rte_barrier(); +} + +void GDABackend::cleanup_ibv() { + ibv_free_device_list(dev_list); + + delete ib_state; + if (requested_dev != nullptr) + 
free(requested_dev); +} + + +void GDABackend::setup_heap_memory_rkey() { + auto *base_heap = heap.get_local_heap_base(); + int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; + + heap_mr = ibv_reg_mr(ib_state->pd_orig, base_heap, heap.get_size(), access); + CHECK_NNULL(heap_mr, "ibv_reg_mr"); + + const size_t rkeys_size = sizeof(uint32_t) * num_pes; + uint32_t *host_rkey_cpy = reinterpret_cast(malloc(rkeys_size)); + if (!host_rkey_cpy) { abort(); } + + CHECK_HIP(hipHostMalloc(&heap_rkey, sizeof(uint32_t) * num_pes)); + heap_rkey[my_pe] = heap_mr->rkey; + + hipStream_t stream; + CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + CHECK_HIP(hipMemcpyAsync(host_rkey_cpy, heap_rkey, rkeys_size, hipMemcpyDeviceToHost, stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + + if (backend_comm != MPI_COMM_NULL) + MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_rkey_cpy, sizeof(uint32_t), MPI_CHAR, backend_comm); + else + backend_bootstr->allGather(host_rkey_cpy, sizeof(uint32_t)); + + CHECK_HIP(hipMemcpyAsync(heap_rkey, host_rkey_cpy, rkeys_size, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipStreamDestroy(stream)); + + free(host_rkey_cpy); +} + +void GDABackend::cleanup_heap_memory_rkey() { + int ret = ibv_dereg_mr(heap_mr); + CHECK_ZERO(ret, "ibv_dereg_mr"); + + CHECK_HIP(hipHostFree(heap_rkey)); +} + +void GDABackend::setup_gpu_qps() { + CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * (maximum_num_contexts_ + 1) * num_pes)); + for (int i = 0; i < (maximum_num_contexts_ + 1) * num_pes; i++) { + QueuePair qp(ib_state->pd_orig); + CHECK_HIP(hipMemcpy(&gpu_qps[i], &qp, sizeof(QueuePair), hipMemcpyDefault)); + initialize_gpu_qp(&gpu_qps[i], i); + } +} + +void GDABackend::cleanup_gpu_qps() { + //TODO need to destruct qp[i]? 
+ CHECK_HIP(hipFree(gpu_qps)); + gpu_qps = nullptr; +} + +//TODO this ifdef sequence should go in a nic-specific file, like it is for bnxt, maybe whats above too? +#ifndef GDA_BNXT +void GDABackend::ib_init(struct ibv_device* ib_dev, uint8_t port) { + ib_state = new ib_state_t; + CHECK_NNULL(ib_state, "ib_state object create"); + + ib_state->context = ibv_open_device(ib_dev); + CHECK_NNULL(ib_state->context, "ib open device"); + dump_ibv_context(ib_state->context); + dump_ibv_device(ib_state->context->device); + + ib_state->pd_orig = ibv_alloc_pd(ib_state->context); + CHECK_NNULL(ib_state->pd_orig, "ib allocate pd"); + dump_ibv_pd(ib_state->pd_orig); + + ibv_parent_domain_init_attr pattr{}; + init_parent_domain_attr(&pattr); + ib_state->pd_parent = ibv_alloc_parent_domain(ib_state->context, &pattr); + CHECK_NNULL(ib_state->pd_parent, "ibv_alloc_parent_domain"); + dump_ibv_pd(ib_state->pd_parent); + +#ifdef GDA_IONIC + ionic_dv_pd_set_sqcmb(ib_state->pd_parent, false, false, false); + ionic_dv_pd_set_rqcmb(ib_state->pd_parent, false, false, false); + + for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) { + ib_state->pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(ib_state->context, &pattr); + CHECK_NNULL(ib_state->pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)"); + + ionic_dv_pd_set_sqcmb(ib_state->pd_uxdma[uxdma_i], false, false, false); + ionic_dv_pd_set_rqcmb(ib_state->pd_uxdma[uxdma_i], false, false, false); + ionic_dv_pd_set_udma_mask(ib_state->pd_uxdma[uxdma_i], 1u << uxdma_i); + } +#endif + + int err = ibv_query_port(ib_state->context, port, &ib_state->portinfo); + CHECK_ZERO(err, "ibv_query_port"); + dump_ibv_port_attr(&ib_state->portinfo); + + /* Must init after querying port */ + init_gid_index(port); + +#ifdef GDA_IONIC + ionic_dv_ctx dvctx; + ionic_dv_get_ctx(&dvctx, ib_state->context); + + int hip_dev_id = 0; + CHECK_HIP(hipGetDevice(&hip_dev_id)); + + void* gpu_db_page = nullptr; + rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, 
hip_dev_id); + + uint64_t *db_page_u64 = reinterpret_cast(dvctx.db_page); + uint64_t *gpu_db_page_u64 = reinterpret_cast(gpu_db_page); + + uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64]; + + ib_state->gpu_db_page = gpu_db_page; + ib_state->gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype]; + ib_state->gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype]; +#endif +} + +template +void GDABackend::try_to_modify_qp(ibv_qp* qp, StateType state) { + int err = ibv_modify_qp(qp, &state.exp_qp_attr, state.exp_attr_mask); + CHECK_ZERO(err, "ibv_modify_qp"); +} + +void GDABackend::init_qp_status(ibv_qp* qp, uint8_t port) { + try_to_modify_qp(qp, initqp(port)); +} + +void GDABackend::change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port) { + try_to_modify_qp(qp, rtr(dest, port)); +} + +void GDABackend::change_status_rts(ibv_qp* qp, dest_info_t* dest) { + try_to_modify_qp(qp, rts(dest)); +} + +void GDABackend::create_qps(uint8_t port, ibv_port_attr* ib_port_att) { + ibv_qp_cap cap{}; + cap.max_send_wr = sq_size; + cap.max_send_sge = 1; + cap.max_inline_data = 0; +#ifdef GDA_IONIC + // TODO allow zero sges in the driver + cap.max_recv_sge = 1; +#endif + QPInitAttr qp_init_attr{qpattr(cap)}; + cqs.resize((maximum_num_contexts_ + 1) * num_pes); + qps.resize((maximum_num_contexts_ + 1) * num_pes); + int max_num_cqe = qp_init_attr.attr.cap.max_send_wr; + for (int i = 0; i < qps.size(); i++) { +#ifdef GDA_IONIC + int uxdma_i = ((i + 1) / 2) & 1; + cqs[i] = create_cq(ib_state->context, ib_state->pd_uxdma[uxdma_i], max_num_cqe << 1); + CHECK_NNULL(cqs[i], "create_cq"); + qps[i] = create_qp(ib_state->pd_uxdma[uxdma_i], ib_state->context, &qp_init_attr.attr, cqs[i]); +#else + cqs[i] = create_cq(ib_state->context, ib_state->pd_parent, max_num_cqe); + CHECK_NNULL(cqs[i], "create_cq"); + qps[i] = create_qp(ib_state->pd_parent, ib_state->context, &qp_init_attr.attr, cqs[i]); +#endif + CHECK_NNULL(qps[i], "create_qp"); + init_qp_status(qps[i], port); + dest_info[i].lid = 
ib_port_att->lid; + dest_info[i].qpn = qps[i]->qp_num; + dest_info[i].psn = 0; + dest_info[i].gid = gid; + } +} + +void* GDABackend::pd_alloc(struct ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type) { + void* dev_ptr{nullptr}; + //TODO make this configurable, presumably we want it on device for all types? +#ifdef GDA_IONIC + CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast(&dev_ptr), size, hipDeviceMallocUncached)); +#else + CHECK_HIP(hipHostMalloc(reinterpret_cast(&dev_ptr), size, hipHostMallocDefault)); +#endif + memset(dev_ptr, 0, size); + return dev_ptr; +} + +void GDABackend::pd_release(struct ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type) { + CHECK_HIP(hipFree(ptr)); +} + +void GDABackend::init_parent_domain_attr(ibv_parent_domain_init_attr* attr1) { + attr1->pd = ib_state->pd_orig; + attr1->td = nullptr; + attr1->comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS; + attr1->alloc = GDABackend::pd_alloc; + attr1->free = GDABackend::pd_release; + attr1->pd_context = nullptr; +} + +ibv_cq* GDABackend::create_cq(ibv_context* context, ibv_pd* pd, int cqe) { + ibv_cq_init_attr_ex cq_attr; + memset(&cq_attr, 0, sizeof(ibv_cq_init_attr_ex)); + cq_attr.cqe = cqe; + cq_attr.cq_context = nullptr; + cq_attr.channel = nullptr; + cq_attr.comp_vector = 0; + cq_attr.flags = 0; // see ibv_exp_cq_create_flags + cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_PD; + cq_attr.parent_domain = pd; + ibv_cq_ex* cq_ex = ibv_create_cq_ex(context, &cq_attr); + CHECK_NNULL(cq_ex, "ibv_create_cq_ex"); + ibv_cq *cq = ibv_cq_ex_to_cq(cq_ex); + CHECK_NNULL(cq, "ibv_cq_ex_to_cq"); + return cq; +} + +void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { + int hip_dev_id{-1}; + CHECK_HIP(hipGetDevice(&hip_dev_id)); + +#ifdef GDA_IONIC + uint8_t udma_idx = ionic_dv_qp_get_udma_idx(qps[conn_num]); + + ionic_dv_cq dvcq; + ionic_dv_get_cq(&dvcq, cqs[conn_num], udma_idx); + + gpu_qp->cq_dbreg = ib_state->gpu_db_cq; + gpu_qp->cq_dbval = 
dvcq.q.db_val; + gpu_qp->cq_mask = dvcq.q.mask; + + gpu_qp->cq_buf = reinterpret_cast(dvcq.q.ptr); + + ionic_dv_qp dvqp; + ionic_dv_get_qp(&dvqp, qps[conn_num]); + + gpu_qp->sq_dbreg = ib_state->gpu_db_sq; + gpu_qp->sq_dbval = dvqp.sq.db_val; + gpu_qp->sq_mask = dvqp.sq.mask; + gpu_qp->sq_buf = reinterpret_cast(dvqp.sq.ptr); + + gpu_qp->qp_num = qps[conn_num]->qp_num; + gpu_qp->lkey = heap_mr->lkey; + gpu_qp->rkey = heap_rkey[conn_num % num_pes]; + gpu_qp->inline_threshold = 32; +#else // !GDA_IONIC + mlx5dv_cq cq_out; + mlx5dv_obj mlx_obj; + mlx_obj.cq.in = cqs[conn_num]; + mlx_obj.cq.out = &cq_out; + mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_CQ); + dump_mlx5dv_cq(&cq_out, conn_num); + + /* + * struct mlx5dv_cq { + * void *buf; + * __be32 *dbrec; + * uint32_t cqe_cnt; + * uint32_t cqe_size; + * void *cq_uar; + * uint32_t cqn; + * uint64_t comp_mask; + * }; + */ + + gpu_qp->cq_buf = reinterpret_cast(cq_out.buf); + gpu_qp->cq_cnt = cq_out.cqe_cnt; + gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt); + gpu_qp->cq_dbrec = cq_out.dbrec; + + mlx5dv_qp qp_out; + mlx_obj.qp.in = qps[conn_num]; + mlx_obj.qp.out = &qp_out; + mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP); + dump_mlx5dv_qp(&qp_out, conn_num); + + /* + * struct mlx5dv_qp { + * __be32 *dbrec; + * struct { + * void *buf; + * uint32_t wqe_cnt; + * uint32_t stride; + * } sq; + * struct { + * void *buf; + * uint32_t wqe_cnt; + * uint32_t stride; + * } rq; + * struct { + * void *reg; + * uint32_t size; + * } bf; + * uint64_t comp_mask; + * off_t uar_mmap_offset; + * uint32_t tirn; + * uint32_t tisn; + * uint32_t rqn; + * uint32_t sqn; + * uint64_t tir_icm_addr; + * }; + */ + + gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR + gpu_qp->sq_buf = reinterpret_cast(qp_out.sq.buf); + gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt; + gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]); + gpu_qp->lkey = htobe32(heap_mr->lkey); + gpu_qp->qp_num = qps[conn_num]->qp_num; + // The 2 in qp_out.bf.size * 
2 below facilitates the switching between blue flame registers + void* gpu_ptr{nullptr}; + rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id); + gpu_qp->db.ptr = reinterpret_cast(gpu_ptr); +#endif // !GDA_IONIC +} + +ibv_qp* GDABackend::create_qp(ibv_pd* pd, ibv_context* context, ibv_qp_init_attr_ex* qp_attr, ibv_cq* cq) { + ibv_qp* qp{nullptr}; + assert(pd); + assert(context); + assert(qp_attr); + qp_attr->send_cq = cq; + qp_attr->recv_cq = cq; + qp_attr->pd = pd; + qp_attr->comp_mask = IBV_QP_INIT_ATTR_PD; + qp = ibv_create_qp_ex(context, qp_attr); + CHECK_NNULL(qp, "ibv_create_qp_ex"); + return qp; +} + +GDABackend::InitQPState GDABackend::initqp(uint8_t port) { + InitQPState init{}; + init.exp_qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; + init.exp_qp_attr.port_num = port; + init.exp_attr_mask |= IBV_QP_ACCESS_FLAGS; + return init; +} + +GDABackend::RtrState GDABackend::rtr(dest_info_t* dest, uint8_t port) { + RtrState rtr{}; + rtr.exp_qp_attr.dest_qp_num = dest->qpn; + rtr.exp_qp_attr.rq_psn = dest->psn; + rtr.exp_qp_attr.ah_attr.port_num = port; + rtr.exp_qp_attr.path_mtu = ib_state->portinfo.active_mtu; + if (ib_state->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { + rtr.exp_qp_attr.ah_attr.dlid = dest->lid; + } else { + rtr.exp_qp_attr.ah_attr.is_global = 1; + rtr.exp_qp_attr.ah_attr.grh.dgid = dest->gid; + rtr.exp_qp_attr.ah_attr.grh.sgid_index = gid_index; + rtr.exp_qp_attr.ah_attr.grh.hop_limit = 1; + } + rtr.exp_attr_mask |= IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; + return rtr; +} + +GDABackend::RtsState GDABackend::rts(dest_info_t* dest) { + RtsState rts{}; + rts.exp_qp_attr.sq_psn = dest->psn; + rts.exp_attr_mask |= IBV_QP_SQ_PSN; + return rts; +} + +GDABackend::QPInitAttr GDABackend::qpattr(ibv_qp_cap cap) { + QPInitAttr qpattr(cap); + qpattr.attr.qp_type = IBV_QPT_RC; + return 
qpattr; +} +#endif + +void GDABackend::init_gid_index(uint8_t port_num) { + struct ibv_gid_entry *gid_entries; + struct ibv_gid_entry *gid_entry; + union ibv_gid current_gid; + union ibv_gid selected_gid; + uint32_t gid_type; + int err; + + const uint8_t local_gid_prefix[2] = {0xFE, 0x80}; + uint32_t selected_gid_type = IBV_GID_TYPE_ROCE_V1; + int selected_gid_index = -1; + ssize_t gid_tbl_entries = 0; + + int gid_tbl_len = ib_state->portinfo.gid_tbl_len; + struct ibv_context *ctx = ib_state->context; + + gid_entries = (struct ibv_gid_entry*) calloc(gid_tbl_len, sizeof(struct ibv_gid_entry)); + + gid_tbl_entries = ibv_query_gid_table(ctx, gid_entries, gid_tbl_len, 0); + if (gid_tbl_entries < 0) { + fprintf(stderr, "[Warning] ibv_query_gid_table failed. No available GIDs\n"); + free(gid_entries); + return; + } + + for (int i = 0; i < gid_tbl_entries; i++) { + gid_type = gid_entries[i].gid_type; + + /* rocSHMEM does not use GIDs for IB mode */ + if (gid_type == IBV_GID_TYPE_IB) { + break; + } + + current_gid = gid_entries[i].gid; + + err = ibv_query_gid(ctx, port_num, i, ¤t_gid); + CHECK_ZERO(err, "ibv_query_gid"); + + /* We don't want local GIDs */ + if (memcmp(current_gid.raw, &local_gid_prefix, 2) == 0) { + continue; + } + + /* Initialize using first available GID */ + if (selected_gid_index == -1) { + selected_gid_index = i; + selected_gid_type = gid_type; + selected_gid = current_gid; + } + /* Choose RoCEv2 over RoCEv1 */ + else if (gid_type > selected_gid_type) { + selected_gid_index = i; + selected_gid_type = gid_type; + selected_gid = current_gid; + } + } + + gid_index = selected_gid_index; + gid = selected_gid; + + free(gid_entries); +} + +static void dump_ibv_context(struct ibv_context* x) { + /* + * struct ibv_context { + * struct ibv_device *device; + * struct ibv_context_ops ops; + * int cmd_fd; + * int async_fd; + * int num_comp_vectors; + * pthread_mutex_t mutex; + * void *abi_compat; + * }; + */ + DPRINTF("\n" + 
"===============================================\n" + " IBV_CONTEXT\n" + "===============================================\n" + " (ibv_device*) device = %p\n" + " (int) cmd_fd = %d\n" + " (int) async_fd = %d\n" + " (int) num_comp_vectors = %d\n" + " (void*) abi_compat = %p\n", + x->device, x->cmd_fd, x->async_fd, x->num_comp_vectors, x->abi_compat); +}; + +static void dump_ibv_device(struct ibv_device* x) { + /* + * struct ibv_device { + * struct _ibv_device_ops _ops; + * enum ibv_node_type node_type; + * enum ibv_transport_type transport_type; + * char name[IBV_SYSFS_NAME_MAX]; + * char dev_name[IBV_SYSFS_NAME_MAX]; + * char dev_path[IBV_SYSFS_PATH_MAX]; + * char ibdev_path[IBV_SYSFS_PATH_MAX]; + * }; + */ + DPRINTF("\n" + "===============================================\n" + " IBV_DEVICE\n" + "===============================================\n" + " (enum ibv_node_type) node_type = %d\n" + " (enum ibv_transport_type) transport_type = %d\n" + " (char[]) name = %s\n" + " (char[]) dev_name = %s\n" + " (char[]) dev_path = %s\n" + " (char[]) ibdev_path = %s\n", + x->node_type, x->transport_type, x->name, x->dev_name, x->dev_path, x->ibdev_path); +} + +static void dump_ibv_pd(struct ibv_pd* x) { + /* + * struct ibv_pd { + * struct ibv_context *context; + * uint32_t handle; + * }; + */ + DPRINTF("\n" + "===============================================\n" + " IBV_PD\n" + "===============================================\n" + " (ibv_context*) context = %p\n" + " (uint32_t) handle = 0x%x\n", + x->context, x->handle); +} + +static void dump_ibv_port_attr(struct ibv_port_attr* x) { + /* + * struct ibv_port_attr { + * enum ibv_port_state state; + * enum ibv_mtu max_mtu; + * enum ibv_mtu active_mtu; + * int gid_tbl_len; + * uint32_t port_cap_flags; + * uint32_t max_msg_sz; + * uint32_t bad_pkey_cntr; + * uint32_t qkey_viol_cntr; + * uint16_t pkey_tbl_len; + * uint16_t lid; + * uint16_t sm_lid; + * uint8_t lmc; + * uint8_t max_vl_num; + * uint8_t sm_sl; + * uint8_t subnet_timeout; + 
* uint8_t init_type_reply; + * uint8_t active_width; + * uint8_t active_speed; + * uint8_t phys_state; + * uint8_t link_layer; + * uint8_t flags; + * uint16_t port_cap_flags2; + * }; + */ + DPRINTF("\n" + "===============================================\n" + " IBV_PORT_ATTR\n" + "===============================================\n" + " (enum ibv_port_state) state = %u\n" + " (enum ibv_mtu) max_mtu = %u\n" + " (enum ibv_mtu) active_mtu = %u\n" + " (int) gid_tbl_len = %u\n" + " (uint32_t) port_cap_flags = 0x%x\n" + " (uint32_t) max_msg_sz = %u\n" + " (uint32_t) bad_pkey_cntr = %u\n" + " (uint32_t) qkey_viol_cntr = %u\n" + " (uint16_t) pkey_tbl_len = %u\n" + " (uint16_t) lid = 0x%x\n" + " (uint16_t) sm_lid = 0x%x\n" + " (uint8_t) lmc = 0x%x\n" + " (uint8_t) max_vl_num = 0x%x\n" + " (uint8_t) sm_sl = 0x%x\n" + " (uint8_t) subnet_timeout = 0x%x\n" + " (uint8_t) init_type_reply = 0x%x\n" + " (uint8_t) active_width = 0x%x\n" + " (uint8_t) active_speed = 0x%x\n" + " (uint8_t) phys_state = 0x%x\n" + " (uint8_t) link_layer = 0x%x\n" + " (uint8_t) flags = 0x%x\n" + " (uint16_t) port_cap_flags2 = 0x%x\n", + x->state, x->max_mtu, x->active_mtu, x->gid_tbl_len, x->port_cap_flags, x->max_msg_sz, + x->bad_pkey_cntr, x->qkey_viol_cntr, x->pkey_tbl_len, x->lid, x->sm_lid, x->lmc, x->max_vl_num, + x->sm_sl, x->subnet_timeout, x->init_type_reply, x->active_width, x->active_speed, x->phys_state, + x->link_layer, x->flags, x->port_cap_flags2); +} + +void dump_ibv_qp(struct ibv_qp *qp, int conn_num) { + /* + * struct ibv_qp { + * struct ibv_context *context; + * void *qp_context; + * struct ibv_pd *pd; + * struct ibv_cq *send_cq; + * struct ibv_cq *recv_cq; + * struct ibv_srq *srq; + * uint32_t handle; + * uint32_t qp_num; + * enum ibv_qp_state state; + * enum ibv_qp_type qp_type; + * pthread_mutex_t mutex; + * pthread_cond_t cond; + * uint32_t events_completed; + * }; + */ + DPRINTF("\n"); + DPRINTF("============== QP_DUMP CONNECTION#%d ==========\n", conn_num); + DPRINTF(" (ibv_context*) 
context = %p\n", qp->context); + DPRINTF(" (void*) qp_context = %p\n", qp->qp_context); + DPRINTF(" (ibv_pd*) pd = %p\n", qp->pd); + DPRINTF(" (ibv_cq*) send_cq = %p\n", qp->send_cq); + DPRINTF(" (ibv_cq*) recv_cq = %p\n", qp->recv_cq); + DPRINTF(" (ibv_srq*) srq = %p\n", qp->srq); + DPRINTF(" (uint32_t) handle = 0x%x\n", qp->handle); + DPRINTF(" (uint32_t) qp_num = 0x%x\n", qp->qp_num); + DPRINTF(" (enum ibv_qp_state) state = %u\n", qp->state); + DPRINTF(" (enum_ibv_qp_type) qp_type = %u\n", qp->qp_type); + DPRINTF(" (uint32_t) events_completed = %u\n", qp->events_completed); + DPRINTF("=========== QP_DUMP_END CONNECTION#%d ========\n", conn_num); +} + +#if !defined(GDA_IONIC) && !defined(GDA_BNXT) +void dump_mlx5dv_qp(struct mlx5dv_qp *qp_dv, int conn_num) { + DPRINTF("\n"); + DPRINTF("===============================================\n"); + DPRINTF(" INITIALIZED MLXDV_QP FOR CONNECTION#%d\n", conn_num); + DPRINTF("===============================================\n"); + DPRINTF("=================== QP_DUMP ===================\n"); + DPRINTF(" (__be32*) dbrec = %p\n", qp_dv->dbrec); + DPRINTF(" (void*) sq.buf = %p\n", qp_dv->sq.buf); + DPRINTF(" (uint32_t) sq.wqe_cnt = %u\n", qp_dv->sq.wqe_cnt); + DPRINTF(" (uint32_t) sq.stride = %u\n", qp_dv->sq.stride); + DPRINTF(" (void*) rq.buf = %p\n", qp_dv->rq.buf); + DPRINTF(" (uint32_t) rq.wqe_cnt = %u\n", qp_dv->rq.wqe_cnt); + DPRINTF(" (uint32_t) rq.stride = %u\n", qp_dv->rq.stride); + DPRINTF(" (void*) bf.reg = %p\n", qp_dv->bf.reg); + DPRINTF(" (uint32_t) bf.size = 0x%x\n", qp_dv->bf.size); + DPRINTF(" (uint64_t) comp_mask = 0x%lx\n", qp_dv->comp_mask); + DPRINTF(" (off_t) uar_mmap_offset = 0x%lx\n", qp_dv->uar_mmap_offset); + DPRINTF(" (uint32_t) tirn = 0x%x\n", qp_dv->tirn); + DPRINTF(" (uint32_t) tisn = 0x%x\n", qp_dv->tisn); + DPRINTF(" (uint32_t) rqn = 0x%x\n", qp_dv->rqn); + DPRINTF(" (uint32_t) sqn = 0x%x\n", qp_dv->sqn); + DPRINTF(" (uint64_t) tir_icm_addr = 0x%lx\n", qp_dv->tir_icm_addr); + 
DPRINTF("================== QP_DUMP_END ================\n"); +} + +void dump_mlx5dv_cq(struct mlx5dv_cq *cq_dv, int conn_num) { + DPRINTF("\n"); + DPRINTF("===============================================\n"); + DPRINTF(" INITIALIZED MLX5DV_CQ FOR CONNECTION#%d\n", conn_num); + DPRINTF("===============================================\n"); + DPRINTF("=================== CQ_DUMP ===================\n"); + DPRINTF(" (void*) buf = %p\n", cq_dv->buf); + DPRINTF(" (__be32*) dbrec = %p\n", cq_dv->dbrec); + DPRINTF(" (uint32_t) cqe_cnt = %u\n", cq_dv->cqe_cnt); + DPRINTF(" (uint32_t) cqe_size = %u\n", cq_dv->cqe_size); + DPRINTF(" (void*) cq_uar = %p\n", cq_dv->cq_uar); + DPRINTF(" (uint32_t) cqn = 0x%x\n", cq_dv->cqn); + DPRINTF(" (uint64_t) comp_mask = 0x%lx\n", cq_dv->comp_mask); + DPRINTF("================== CQ_DUMP_END ================\n"); +} +#endif // !GDA_IONIC + +} // namespace rocshmem diff --git a/src/gda/backend_gda.hpp b/src/gda/backend_gda.hpp new file mode 100644 index 0000000000..becdcfdb46 --- /dev/null +++ b/src/gda/backend_gda.hpp @@ -0,0 +1,485 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_BACKEND_HPP_ +#define LIBRARY_SRC_GDA_BACKEND_HPP_ + +#include "backend_bc.hpp" +#include "containers/free_list_impl.hpp" +#include "hdp_proxy.hpp" //TODO useless? +#include "memory/hip_allocator.hpp" +#include "context_incl.hpp" +#include "gda_context_proxy.hpp" +#include "queue_pair.hpp" +#include "bootstrap/bootstrap.hpp" + +namespace rocshmem { + +class GDAContext; +class GDAHostContext; +class QueuePair; +class HostInterface; + +class GDABackend : public Backend { + private: + typedef struct ib_state { + struct ibv_context* context; + struct ibv_pd* pd_orig; +#ifndef GDA_BNXT + struct ibv_pd* pd_parent; +#endif +#ifdef GDA_IONIC + struct ibv_pd* pd_uxdma[2]; +#endif + struct ibv_mr* mr; + struct ibv_port_attr portinfo; + +#ifdef GDA_IONIC + void *gpu_db_page; + uint64_t *gpu_db_cq; + uint64_t *gpu_db_sq; +#endif + } ib_state_t; + + typedef struct dest_info { + int lid; + int qpn; + int psn; + union ibv_gid gid; + } dest_info_t; + +#ifndef GDA_BNXT + class State { + public: + ibv_qp_attr exp_qp_attr{}; + uint64_t exp_attr_mask{}; + }; + + class InitQPState : public State { + public: + InitQPState() { + exp_qp_attr.qp_state = IBV_QPS_INIT; + exp_qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; + exp_attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT; + } + }; + + class RtrState : public 
State { + public: + RtrState() { + exp_qp_attr.qp_state = IBV_QPS_RTR; + exp_qp_attr.ah_attr.sl = 1; + exp_qp_attr.max_dest_rd_atomic = GDA_MAX_ATOMIC; + exp_qp_attr.min_rnr_timer = 12; + exp_attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU; + } + }; + + class RtsState : public State { + public: + RtsState() { + exp_qp_attr.qp_state = IBV_QPS_RTS; + exp_qp_attr.timeout = 14; + exp_qp_attr.retry_cnt = 7; + exp_qp_attr.rnr_retry = 7; + exp_qp_attr.max_rd_atomic = GDA_MAX_ATOMIC; + exp_attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC; + } + }; + + class QPInitAttr { + public: + explicit QPInitAttr(ibv_qp_cap cap) { + attr.cap = cap; + attr.sq_sig_all = 0; + } + ibv_qp_init_attr_ex attr{}; + }; +#endif + + /** + * @brief Common code invoked from the different constructors + */ + void read_env(); + void setup_ibv(); + void cleanup_ibv(); + + public: + friend GDAContext; + + /** + * @copydoc Backend::Backend(unsigned) + */ + explicit GDABackend(MPI_Comm comm); + explicit GDABackend(TcpBootstrap *bootstr); + + /** + * @copydoc Backend::~Backend() + */ + virtual ~GDABackend(); + + __device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx); + + /** + * @brief Destroy a `rocshmem_ctx_t` context and returns it back to the + * context free list. + */ + __device__ void destroy_ctx(rocshmem_ctx_t *ctx); + + /** + * @copydoc Backend::ctx_create + */ + void ctx_create(int64_t options, void **ctx) override; + + /** + * @copydoc Backend::ctx_destroy + */ + void ctx_destroy(Context *ctx) override; + + /** + * @brief Abort the application. + * + * @param[in] status Exit code. + * + * @return void. + * + * @note This routine terminates the entire application. 
+ */ + void global_exit(int status) override; + + /** + * @copydoc Backend::create_new_team + */ + void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent, + TeamInfo *team_info_wrt_world, int num_pes, + int my_pe_in_new_team, MPI_Comm team_comm, + rocshmem_team_t *new_team) override; + + /** + * @copydoc Backend::team_destroy(rocshmem_team_t) + */ + void team_destroy(rocshmem_team_t team) override; + + /** + * @brief Accessor for work/sync bases + * + * @return Vector containing the addresses of the work/sync bases + */ + char** get_wrk_sync_bases() { return wrk_sync_pool_bases_; } //TODO UNUSED + + /** + * @brief The host-facing interface that will be used + * by all contexts of the GDABackend + */ + std::shared_ptr host_interface{nullptr}; + + /** + * @brief Scratchpad for the internal barrier algorithms. + */ + int64_t *barrier_sync{nullptr}; + + /** + * @brief Handle for raw memory for barrier sync + */ + long *barrier_pSync_pool{nullptr}; + + /** + * @brief Handle for raw memory for reduce sync + */ + long *reduce_pSync_pool{nullptr}; + + /** + * @brief Handle for raw memory for broadcast sync + */ + long *bcast_pSync_pool{nullptr}; + + /** + * @brief Handle for raw memory for alltoall sync + */ + long *alltoall_pSync_pool{nullptr}; + + /** + * @brief Handle for raw memory for work + */ + void *pWrk_pool{nullptr}; + + /** + * @brief Handle for raw memory for alltoall + */ + void *pAta_pool{nullptr}; + + /** + * @brief Handle for raw memory for fence/quiet + */ + int *fence_pool{nullptr}; + + protected: + /** + * @copydoc Backend::dump_backend_stats() + */ + void dump_backend_stats() override; + + /** + * @copydoc Backend::reset_backend_stats() + */ + void reset_backend_stats() override; + + /** + * @brief Allocates uncacheable host memory for the hdp policy. 
+ * + * @note Internal data ownership is managed by the proxy + */ + HdpProxy hdp_proxy_{}; + + /** + * @brief Holds a copy of the default context for host functions + */ + std::unique_ptr default_host_ctx{nullptr}; + + /** + * @brief Allocate and initialize team world. + */ + void setup_team_world(); + + /** + * @brief Initialize the resources required to support teams + */ + void setup_teams(); + + /** + * @brief Destruct the resources required to support teams + */ + void cleanup_teams(); + + /** + * @brief Allocation and initialization of backend contexts. + */ + void setup_ctxs(); + void cleanup_ctxs(); + void setup_host_ctx(); + void setup_default_ctx(); + + /** + * @brief Allocate and initialize barrier operation addresses on + * symmetric heap. + * + * When this method completes, the barrier_sync member will be available + * for use. + */ + void setup_collectives(); + + /** + * @brief Allocate buffer for fence/quiet operation + */ + void setup_fence_buffer(); + + void setup_heap_memory_rkey(); + void cleanup_heap_memory_rkey(); + + void initialize_gpu_qp(QueuePair* qp, int conn_num); + +#ifndef GDA_BNXT + InitQPState initqp(uint8_t port); + + RtrState rtr(dest_info_t* dest, uint8_t port); + + RtsState rts(dest_info_t* dest); + + QPInitAttr qpattr(ibv_qp_cap cap); + + void init_qp_status(ibv_qp* qp, uint8_t port); +#endif + + void change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port); + + void change_status_rts(ibv_qp* qp, dest_info_t* dest); + + void create_qps(uint8_t port, ibv_port_attr* ib_port_att); + +#ifdef GDA_BNXT + void init_qp_status(uint8_t port); + + void create_cqs(int ncqs, int cqe); + + void create_qps_impl(int nqps); + + int ibv_mtu_to_int(enum ibv_mtu mtu); +#else + template + void try_to_modify_qp(ibv_qp* qp, T state); + + static void* pd_alloc(ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type); + + static void pd_release(ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type); + + void 
init_parent_domain_attr(ibv_parent_domain_init_attr* attr); + + ibv_cq* create_cq(ibv_context* context, ibv_pd* pd, int cqe); + + ibv_qp* create_qp(ibv_pd* pd, ibv_context* context, ibv_qp_init_attr_ex* qp_attr, ibv_cq* rcq); +#endif + + void ib_init(ibv_device* ib_dev, uint8_t port); + + void init_gid_index(uint8_t port); + + void setup_gpu_qps(); + void cleanup_gpu_qps(); + + char* requested_dev{nullptr}; + + ibv_device** dev_list{nullptr}; + + ib_state_t* ib_state{nullptr}; + + std::vector dest_info; + + private: + /** + * @brief Common code invoked from the different constructors + */ + void init(); + + /** + * @brief Proxy for the default context + * + * @note Internal data ownership is managed by the proxy + */ + GDADefaultContextProxyT default_context_proxy_; // init handled in constructor + + /** + * @brief An array of @ref ROContexts that backs the context FreeList. + */ + GDAContext *ctx_array{nullptr}; + + /** + * @brief A free-list containing contexts. + */ + FreeListProxy ctx_free_list{}; + + /** + * @brief Holds maximum number of contexts used in library + */ + size_t maximum_num_contexts_{32}; + + /** + * @brief The bitmask representing the availability of teams in the pool + */ + char *team_pool_bitmask_{nullptr}; + + /** + * @brief Bitmask to store the reduced result of bitmasks on pariticipating + * PEs + * + * With no thread-safety for this bitmask, multithreaded creation of teams is + * not supported. 
+ */ + char *team_reduced_bitmask_{nullptr}; + + /** + * @brief Size of the bitmask + */ + int team_bitmask_size_{-1}; + + /** + * Fine grained memory allocator for buffers used in collectives Routines + */ + HIPDefaultFinegrainedAllocator fine_grained_allocator_ {}; + + /** + * @brief Collective routines work/sync buffer size + */ + size_t wrk_sync_pool_size_{}; + + /** + * @brief Collective routines work/sync buffer base ptr + */ + char* const wrk_sync_pool_{nullptr}; + + /** + * @brief Temporary buffer pointer pointing to the same address as + * wrk_sync_pool_, used to calculate the starting addresses of + * different work and sync buffers. + */ + char *wrk_sync_pool_top_{nullptr}; + + /** + * @brief Array containing the addresses of the work/sync buffer bases + * of other PEs + */ + char** wrk_sync_pool_bases_{nullptr};//TODO UNUSED, maybe used again later when we decouple the sync from the main heap + + /** + * @brief Initialize memory required for work/sync buffers and open GDA + * handle on PE's wrk_sync_pool. + */ + void setup_wrk_sync_buffer(); + + /** + * @brief Close GDA memory handles for work/sync buffers and deallocate + * work/sync buffer. 
+ */ + void cleanup_wrk_sync_buffer(); + + /** + * @brief rte all-to-all + */ + void Alltoall_char_inplace (char *inoutbuf, size_t num_bytes, rocshmem_team_t team); + + /** + * @brief rte allreduce for teams + */ + void Allreduce_char_BAND (char* inbuf, char *outbuf, size_t num_bytes, Team *team); + + /** + * @brief rte barrier for initialization + */ + void rte_barrier(); + + QueuePair *gpu_qps{nullptr}; + + std::vector qps; + + std::vector cqs; + + uint32_t sq_size{1024}; + + uint32_t *heap_rkey{nullptr}; + + ibv_mr *heap_mr{nullptr}; + + union ibv_gid gid; + int gid_index; + +#ifdef GDA_BNXT + std::vector bnxt_qps; + std::vector bnxt_cqs; + + struct bnxt_re_dv_db_region_attr db_region_attr; +#endif +}; + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_BACKEND_HPP_ diff --git a/src/gda/bnxt/CMakeLists.txt b/src/gda/bnxt/CMakeLists.txt new file mode 100644 index 0000000000..6db1d7904f --- /dev/null +++ b/src/gda/bnxt/CMakeLists.txt @@ -0,0 +1,29 @@ +############################################################################### +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +############################################################################### +target_sources( + ${PROJECT_NAME} + PRIVATE + backend_gda_bnxt.cpp + queue_pair_bnxt.cpp +) diff --git a/src/gda/bnxt/backend_gda_bnxt.cpp b/src/gda/bnxt/backend_gda_bnxt.cpp new file mode 100644 index 0000000000..b9ea02dec5 --- /dev/null +++ b/src/gda/bnxt/backend_gda_bnxt.cpp @@ -0,0 +1,366 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#include "gda/backend_gda.hpp" +#include "util.hpp" +#include // getpagesize() + +namespace rocshmem { + +int GDABackend::ibv_mtu_to_int(enum ibv_mtu mtu) { + switch (mtu) { + case IBV_MTU_256: return 256; + case IBV_MTU_512: return 512; + case IBV_MTU_1024: return 1024; + case IBV_MTU_2048: return 2048; + case IBV_MTU_4096: return 4096; + default: { + fprintf(stderr, "[ERROR] Invalid ibv_mtu\n"); + return 0; + } + } +} + +void GDABackend::ib_init(struct ibv_device* ib_dev, uint8_t port) { + int err; + + ib_state = new ib_state_t; + CHECK_NNULL(ib_state, "ib_state object create"); + + ib_state->context = ibv_open_device(ib_dev); + CHECK_NNULL(ib_state->context, "ibv_open_device"); + + ib_state->pd_orig = ibv_alloc_pd(ib_state->context); + CHECK_NNULL(ib_state->pd_orig, "ibv_alloc_pd"); + + err = ibv_query_port(ib_state->context, port, &ib_state->portinfo); + CHECK_ZERO(err, "ibv_query_port"); + + init_gid_index(port); +} + +void GDABackend::init_qp_status(uint8_t port) { + int err; + struct ibv_qp_attr attr; + int attr_mask; + + memset(&attr, 0, sizeof(struct ibv_qp_attr)); + + attr.qp_state = IBV_QPS_INIT; + attr.pkey_index = 0; + attr.port_num = port; + attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE + | IBV_ACCESS_LOCAL_WRITE + | IBV_ACCESS_REMOTE_READ + | IBV_ACCESS_REMOTE_ATOMIC; + + attr_mask = IBV_QP_STATE + | IBV_QP_PKEY_INDEX + | IBV_QP_PORT + | IBV_QP_ACCESS_FLAGS; + + for (int i =0; i < qps.size() ; i++) { + err = bnxt_re_dv_modify_qp(qps[i], &attr, attr_mask, 0, 0); + CHECK_ZERO(err, "bnxt_re_dv_modify_qp"); + } +} + +void GDABackend::change_status_rtr(ibv_qp *qp, dest_info_t *dest, uint8_t port) { + int err; + struct ibv_qp_attr attr; + int attr_mask; + + memset(&attr, 0, sizeof(struct ibv_qp_attr)); + attr.qp_state = IBV_QPS_RTR; + attr.path_mtu = ib_state->portinfo.active_mtu; + attr.rq_psn = dest->psn; + attr.dest_qp_num = dest->qpn; + + memcpy(&attr.ah_attr.grh.dgid, 
&dest->gid, 16); + attr.ah_attr.grh.sgid_index = gid_index; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.sl = 1; + attr.ah_attr.is_global = 1; + attr.ah_attr.port_num = port; + + attr.max_dest_rd_atomic = GDA_MAX_ATOMIC; + attr.min_rnr_timer = 12; + + attr_mask = IBV_QP_STATE + | IBV_QP_PATH_MTU + | IBV_QP_RQ_PSN + | IBV_QP_DEST_QPN + | IBV_QP_AV + | IBV_QP_MAX_DEST_RD_ATOMIC + | IBV_QP_MIN_RNR_TIMER; + + err = bnxt_re_dv_modify_qp(qp, &attr, attr_mask, 0, 0); + CHECK_ZERO(err, "bnxt_re_dv_modify_qp"); +} + +void GDABackend::change_status_rts(ibv_qp* qp, dest_info_t* dest) { + int err; + struct ibv_qp_attr attr; + int attr_mask; + + memset(&attr, 0, sizeof(struct ibv_qp_attr)); + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = dest->psn; + attr.max_rd_atomic = GDA_MAX_ATOMIC; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + + attr_mask = IBV_QP_STATE + | IBV_QP_SQ_PSN + | IBV_QP_MAX_QP_RD_ATOMIC + | IBV_QP_TIMEOUT + | IBV_QP_RETRY_CNT + | IBV_QP_RNR_RETRY; + + err = bnxt_re_dv_modify_qp(qp, &attr, attr_mask, 0, 0); + CHECK_ZERO(err, "bnxt_re_dv_modify_qp"); +} + +void GDABackend::create_qps(uint8_t port, ibv_port_attr* ib_port_att) { + int resize_length = (maximum_num_contexts_ + 1) * num_pes; + + cqs.resize(resize_length); + bnxt_cqs.resize(resize_length); + + bnxt_qps.resize(resize_length); + qps.resize(resize_length); + + create_cqs(qps.size(), sq_size); + create_qps_impl(qps.size()); + init_qp_status(port); + + for (int i{0}; i < qps.size(); i++) { + dest_info[i].lid = ib_port_att->lid; + dest_info[i].qpn = qps[i]->qp_num; + dest_info[i].psn = 0; + dest_info[i].gid = gid; + } +} + +void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { + struct bnxt_re_dv_obj dv_obj; + struct bnxt_re_dv_cq dv_cq; + struct bnxt_re_dv_qp dv_qp; + struct ibv_context *context; + struct ibv_qp *ib_qp; + int err; + + context = ib_state->context; + ib_qp = qps[conn_num]; + + /* Export CQ */ + memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj)); + 
dv_obj.cq.in = cqs[conn_num]; + dv_obj.cq.out = &dv_cq; + + err = bnxt_re_dv_init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ); + CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)"); + + memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq)); + gpu_qp->cq.buf = bnxt_cqs[conn_num].buf; + gpu_qp->cq.depth = bnxt_cqs[conn_num].depth; + gpu_qp->cq.id = dv_cq.cqn; + gpu_qp->cq.phase = BNXT_RE_QUEUE_START_PHASE; + + /* Export QP */ + memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj)); + dv_obj.qp.in = ib_qp; + dv_obj.qp.out = &dv_qp; + + err = bnxt_re_dv_init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP); + CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)"); + + memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq)); + gpu_qp->sq.buf = bnxt_qps[conn_num].sq_buf; + gpu_qp->sq.depth = bnxt_qps[conn_num].mem_info.sq_slots; + + if ((gpu_qp->sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) { + fprintf(stderr, + "[WARNING] SQ depth not divisible by BNXT_RE_STATIC_WQE_BB. " + "There may be runtime errors.\n"); + } + + gpu_qp->sq.id = ib_qp->qp_num; + gpu_qp->sq.msntbl = bnxt_qps[conn_num].msntbl; + gpu_qp->sq.msn_tbl_sz = bnxt_qps[conn_num].msn_tbl_sz; + gpu_qp->sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz); + gpu_qp->sq.mtu = ibv_mtu_to_int(ib_state->portinfo.active_mtu); + + /* Export DB */ + err = bnxt_re_dv_get_default_db_region(context, &db_region_attr); + CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)"); + + CHECK_HIP(hipHostRegister(db_region_attr.dbr, getpagesize(), hipHostRegisterDefault)); + CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->dbr, db_region_attr.dbr, 0)); + + /* Export Memory Keys */ + gpu_qp->lkey = heap_mr->lkey; + gpu_qp->rkey = heap_rkey[conn_num % num_pes]; +} + +void GDABackend::create_cqs(int ncqs, int cqe) { + struct bnxt_re_dv_cq_attr cq_attr; + struct bnxt_re_dv_cq_init_attr cq_init_attr; + struct bnxt_re_dv_umem_reg_attr umem_attr; + struct ibv_context *context; + + context = ib_state->context; + + for (int i = 0; i < ncqs; i++) { + /* Allocate CQ mem */ + memset(&cq_attr, 0, sizeof(struct 
bnxt_re_dv_cq_attr));
+    bnxt_cqs[i].handle = bnxt_re_dv_cq_mem_alloc(context, cqe, &cq_attr);
+    CHECK_NNULL(bnxt_cqs[i].handle, "bnxt_re_dv_cq_mem_alloc");
+
+    /* Allocate CQ UMEM */
+    /* Size and depth come from what the provider reported in cq_attr, not
+     * from the requested `cqe`. */
+    bnxt_cqs[i].length = cq_attr.ncqe * cq_attr.cqe_size;
+    bnxt_cqs[i].depth = cq_attr.ncqe;
+    CHECK_HIP(hipExtMallocWithFlags(&bnxt_cqs[i].buf, bnxt_cqs[i].length, hipDeviceMallocUncached));
+
+    /* Register CQ UMEM */
+    memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
+    umem_attr.addr = bnxt_cqs[i].buf;
+    umem_attr.size = bnxt_cqs[i].length;
+    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
+
+    bnxt_cqs[i].umem_handle = bnxt_re_dv_umem_reg(context, &umem_attr);
+    CHECK_NNULL(bnxt_cqs[i].umem_handle, "bnxt_re_dv_umem_reg(cq_buf)");
+
+    /* Create CQ */
+    memset(&cq_init_attr, 0, sizeof(struct bnxt_re_dv_cq_init_attr));
+    cq_init_attr.cq_handle = (uint64_t) bnxt_cqs[i].handle;
+    cq_init_attr.umem_handle = bnxt_cqs[i].umem_handle;
+    cq_init_attr.ncqe = cq_attr.ncqe;
+
+    cqs[i] = bnxt_re_dv_create_cq(context, &cq_init_attr);
+    CHECK_NNULL(cqs[i], "bnxt_re_dv_create_cq");
+  }
+}
+
+/**
+ * Creates `nqps` RC queue pairs whose send and receive queues live in
+ * buffers obtained from hipExtMallocWithFlags().
+ *
+ * Per QP: query the required queue sizes (bnxt_re_dv_qp_mem_alloc), allocate
+ * the SQ and RQ buffers, locate the MSN table at the tail of the SQ buffer,
+ * register both buffers as UMEM, then create the QP with
+ * bnxt_re_dv_create_qp(). QP i uses cqs[i] for both send and receive
+ * completions.
+ */
+void GDABackend::create_qps_impl(int nqps) {
+  struct ibv_pd *pd;
+  struct ibv_context *context;
+  struct ibv_qp_init_attr ib_qp_attr;
+  struct bnxt_re_dv_umem_reg_attr umem_attr;
+  void *sq_ptr;
+  void *rq_ptr;
+  void* sq_umem_handle;
+  void* rq_umem_handle;
+  uint64_t msntbl_len;
+  uint64_t msntbl_offset;
+  int err;
+
+  pd = ib_state->pd_orig;
+  context = ib_state->context;
+
+  for (int i = 0; i < nqps; i++) {
+    /* IB QP Init Attr */
+    memset(&ib_qp_attr, 0, sizeof(struct ibv_qp_init_attr));
+    ib_qp_attr.send_cq = cqs[i];
+    ib_qp_attr.recv_cq = cqs[i];
+    ib_qp_attr.cap.max_send_wr = sq_size;
+    ib_qp_attr.cap.max_recv_wr = 0;
+    ib_qp_attr.cap.max_send_sge = 1;
+    ib_qp_attr.cap.max_recv_sge = 0;
+    ib_qp_attr.cap.max_inline_data = 0;
+    ib_qp_attr.qp_type = IBV_QPT_RC;
+    ib_qp_attr.sq_sig_all = 0;
+
+    /* Alloc qp_mem_info */
+    memset(&bnxt_qps[i].mem_info, 0, sizeof(struct bnxt_re_dv_qp_mem_info));
+    err = bnxt_re_dv_qp_mem_alloc(pd, &ib_qp_attr, &bnxt_qps[i].mem_info);
+    CHECK_ZERO(err, "bnxt_re_dv_qp_mem_alloc");
+
+    /* Alloc SQ */
+    CHECK_HIP(hipExtMallocWithFlags(&sq_ptr, bnxt_qps[i].mem_info.sq_len, hipDeviceMallocUncached));
+    bnxt_qps[i].mem_info.sq_va = (uint64_t) sq_ptr;
+    bnxt_qps[i].sq_buf = sq_ptr;
+
+    /* Obtain MSN Table Pointer */
+    /* The table occupies the last sq_psn_sz * sq_npsn bytes of the SQ buffer. */
+    msntbl_len = (bnxt_qps[i].mem_info.sq_psn_sz * bnxt_qps[i].mem_info.sq_npsn);
+    msntbl_offset = bnxt_qps[i].mem_info.sq_len - msntbl_len;
+    bnxt_qps[i].msntbl = (void*) ((char*) bnxt_qps[i].sq_buf + msntbl_offset);
+    bnxt_qps[i].msn_tbl_sz = bnxt_qps[i].mem_info.sq_npsn;
+
+    /* Alloc RQ */
+    /* NOTE(review): max_recv_wr is 0 above; rq_len is whatever the provider
+     * reports for that - confirm a zero-length allocation cannot occur. */
+    CHECK_HIP(hipExtMallocWithFlags(&rq_ptr, bnxt_qps[i].mem_info.rq_len, hipDeviceMallocUncached));
+    bnxt_qps[i].mem_info.rq_va = (uint64_t) rq_ptr;
+    bnxt_qps[i].rq_buf = rq_ptr;
+
+    /* Register UMEM */
+    memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
+    umem_attr.addr = (void*) bnxt_qps[i].mem_info.sq_va;
+    umem_attr.size = bnxt_qps[i].mem_info.sq_len;
+    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
+
+    sq_umem_handle = bnxt_re_dv_umem_reg(context, &umem_attr);
+    CHECK_NNULL(sq_umem_handle, "bnxt_re_dv_umem_reg(sq)");
+
+    memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
+    umem_attr.addr = (void*) bnxt_qps[i].mem_info.rq_va;
+    umem_attr.size = bnxt_qps[i].mem_info.rq_len;
+    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
+
+    rq_umem_handle = bnxt_re_dv_umem_reg(context, &umem_attr);
+    CHECK_NNULL(rq_umem_handle, "bnxt_re_dv_umem_reg(rq)");
+
+    /* IB DV QP Init Attr */
+    /* Mirror the verbs-level capabilities, then add the queue geometry the
+     * provider returned in mem_info. */
+    memset(&bnxt_qps[i].attr, 0, sizeof(struct bnxt_re_dv_qp_init_attr));
+    bnxt_qps[i].attr.send_cq = ib_qp_attr.send_cq;
+    bnxt_qps[i].attr.recv_cq = ib_qp_attr.recv_cq;
+    bnxt_qps[i].attr.max_send_wr = ib_qp_attr.cap.max_send_wr;
+    bnxt_qps[i].attr.max_recv_wr = ib_qp_attr.cap.max_recv_wr;
+    bnxt_qps[i].attr.max_send_sge = ib_qp_attr.cap.max_send_sge;
+    bnxt_qps[i].attr.max_recv_sge = ib_qp_attr.cap.max_recv_sge;
+    bnxt_qps[i].attr.max_inline_data = ib_qp_attr.cap.max_inline_data;
+    bnxt_qps[i].attr.qp_type = ib_qp_attr.qp_type;
+
+    bnxt_qps[i].attr.qp_handle = bnxt_qps[i].mem_info.qp_handle;
+    bnxt_qps[i].attr.sq_umem_handle = sq_umem_handle;
+    bnxt_qps[i].attr.sq_len = bnxt_qps[i].mem_info.sq_len;
+    bnxt_qps[i].attr.sq_slots = bnxt_qps[i].mem_info.sq_slots;
+    bnxt_qps[i].attr.sq_wqe_sz = bnxt_qps[i].mem_info.sq_wqe_sz;
+    bnxt_qps[i].attr.sq_psn_sz = bnxt_qps[i].mem_info.sq_psn_sz;
+    bnxt_qps[i].attr.sq_npsn = bnxt_qps[i].mem_info.sq_npsn;
+
+    bnxt_qps[i].attr.rq_umem_handle = rq_umem_handle;
+    bnxt_qps[i].attr.rq_len = bnxt_qps[i].mem_info.rq_len;
+    bnxt_qps[i].attr.rq_slots = bnxt_qps[i].mem_info.rq_slots;
+    bnxt_qps[i].attr.rq_wqe_sz = bnxt_qps[i].mem_info.rq_wqe_sz;
+    bnxt_qps[i].attr.comp_mask = bnxt_qps[i].mem_info.comp_mask;
+
+    /* Alloc QP */
+    qps[i] = bnxt_re_dv_create_qp(pd, &bnxt_qps[i].attr);
+    CHECK_NNULL(qps[i], "bnxt_re_dv_create_qp");
+  }
+}
+
+} // namespace rocshmem
+
diff --git a/src/gda/bnxt/provider_gda_bnxt.hpp b/src/gda/bnxt/provider_gda_bnxt.hpp
new file mode 100644
index 0000000000..f255f21a13
--- /dev/null
+++ b/src/gda/bnxt/provider_gda_bnxt.hpp
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
+#define LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
+
+/* NOTE(review): the two include targets below are missing in this copy of
+ * the patch - restore them from the original commit. */
+extern "C" {
+#include
+#include
+}
+
+#define GDA_DEFAULT_GID 3
+/* Used for both max_rd_atomic and max_dest_rd_atomic when the QPs are moved
+ * through their states. */
+#define GDA_MAX_ATOMIC 1
+/* Provider-neutral opcode names mapped onto the bnxt_re WQE opcodes. */
+#define GDA_OP_RDMA_WRITE BNXT_RE_WR_OPCD_RDMA_WRITE
+#define GDA_OP_ATOMIC_FA BNXT_RE_WR_OPCD_ATOMIC_FA
+#define GDA_OP_ATOMIC_CS BNXT_RE_WR_OPCD_ATOMIC_CS
+
+/* Bytes occupied by one completion entry: request CQE followed by the base CQE. */
+#define bnxt_re_get_cqe_sz() (sizeof(struct bnxt_re_req_cqe) + \
+                              sizeof(struct bnxt_re_bcqe))
+
+/* True when the phase bits of `valid` equal the expected `phase`. */
+#define bnxt_re_is_cqe_valid(valid, phase) \
+  (((valid) & BNXT_RE_BCQE_PH_MASK) == (phase))
+
+/**
+ * State shared by the device-side send and completion queues.
+ *
+ * NOTE(review): every struct in this header is packed, so members are not
+ * naturally aligned although `lock` and `posted` are targets of atomic
+ * operations - confirm this is intended.
+ */
+struct bnxt_device_wq {
+  void *buf;       /* base address of the queue memory */
+  uint32_t depth;  /* number of entries; head/tail wrap at this value */
+  uint32_t head;   /* consumer index (advanced for the CQ in poll_cq) */
+  uint32_t tail;   /* producer index (advanced for the SQ after a post) */
+  uint32_t flags;  /* epoch bits, toggled whenever head/tail wraps */
+  uint32_t id;     /* queue id: QP number for the SQ, cqn for the CQ */
+
+  uint32_t lock;   /* 0 = free, 1 = held; taken with a compare-exchange */
+
+  uint32_t db_cnt {0};  /* not referenced by the code in this part of the patch */
+} __attribute__((packed));
+
+/** Completion queue: adds the phase value expected in the next valid CQE. */
+struct bnxt_device_cq : public bnxt_device_wq {
+  uint32_t phase;
+} __attribute__((packed));
+
+/** Send queue: adds PSN tracking and the MSN table description. */
+struct bnxt_device_sq : public bnxt_device_wq {
+  uint32_t psn;              /* PSN the next posted WQE starts at */
+  volatile uint32_t posted;  /* WQEs posted and not yet completed */
+
+  void *msntbl;              /* base of the MSN table inside the SQ buffer */
+  uint32_t msn;              /* index of the next MSN table entry */
+  uint32_t msn_tbl_sz;       /* number of MSN table entries (msn wraps here) */
+  uint32_t psn_sz_log2;      /* log2 of one MSN table entry's size in bytes */
+  uint64_t mtu;              /* path MTU in bytes, used to count packets */
+} __attribute__((packed));
+
+/** Host-side bookkeeping for one CQ created in create_cqs(). */
+struct bnxt_host_cq {
+  void *buf;          /* CQ buffer from hipExtMallocWithFlags */
+  void *handle;       /* result of bnxt_re_dv_cq_mem_alloc */
+  void *umem_handle;  /* result of bnxt_re_dv_umem_reg for `buf` */
+  uint64_t length;    /* buffer size in bytes (ncqe * cqe_size) */
+  uint32_t depth;     /* number of CQEs reported by the provider */
+} __attribute__((packed));
+
+/** Host-side bookkeeping for one QP created in create_qps_impl(). */
+struct bnxt_host_qp {
+  struct bnxt_re_dv_qp_mem_info mem_info;  /* filled by bnxt_re_dv_qp_mem_alloc */
+  struct bnxt_re_dv_qp_init_attr attr;     /* passed to bnxt_re_dv_create_qp */
+  void *sq_buf;         /* send-queue buffer */
+  void *rq_buf;         /* receive-queue buffer */
+  void *msntbl;         /* start of the MSN table at the tail of sq_buf */
+  uint32_t msn_tbl_sz;  /* number of MSN entries (mem_info.sq_npsn) */
+} __attribute__((packed));
+
+/*****************************************************************************/ + +#endif //LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_ diff --git a/src/gda/bnxt/queue_pair_bnxt.cpp b/src/gda/bnxt/queue_pair_bnxt.cpp new file mode 100644 index 0000000000..a141e3340e --- /dev/null +++ b/src/gda/bnxt/queue_pair_bnxt.cpp @@ -0,0 +1,381 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/
+
+#include "gda/queue_pair.hpp"
+#include "util.hpp"
+
+namespace rocshmem {
+
+/**
+ * Packs a 64-bit doorbell value into hdr->typ_qid_indx.
+ *
+ * Low 32 bits: indx | toggle. High 32 bits: the masked queue id, the masked
+ * and shifted queue type, and the valid bit.
+ */
+__device__ static inline void bnxt_re_init_db_hdr(struct bnxt_re_db_hdr *hdr,
+                                                  uint32_t indx, uint32_t toggle,
+                                                  uint32_t qid, uint32_t typ) {
+  uint64_t key_lo;
+  uint64_t key_hi;
+
+  key_lo = (indx | toggle);
+
+  key_hi = (qid & BNXT_RE_DB_QID_MASK)
+         | ((typ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT)
+         | (0x1UL << BNXT_RE_DB_VALID_SHIFT);
+
+  hdr->typ_qid_indx = (key_lo | (key_hi << 32));
+}
+
+/** Returns the address of MSN table entry sq->msn (entry size is 1 << psn_sz_log2 bytes). */
+__device__ static inline struct bnxt_re_msns* bnxt_re_pull_psn_buff(struct bnxt_device_sq *sq) {
+  return (struct bnxt_re_msns*)(((char *) sq->msntbl) + ((sq->msn) << sq->psn_sz_log2));
+}
+
+/**
+ * Builds one 64-bit MSN table entry from the WQE start slot, the next PSN
+ * and the start PSN; each field is shifted and masked with its
+ * BNXT_RE_SQ_MSN_SEARCH_* constants.
+ */
+__device__ static inline uint64_t bnxt_re_update_msn_tbl(uint32_t st_idx, uint32_t npsn,
+                                                         uint32_t start_psn) {
+  return ((((uint64_t)(st_idx) << BNXT_RE_SQ_MSN_SEARCH_START_IDX_SHIFT) &
+           BNXT_RE_SQ_MSN_SEARCH_START_IDX_MASK) |
+          (((uint64_t)(npsn) << BNXT_RE_SQ_MSN_SEARCH_NEXT_PSN_SHIFT) &
+           BNXT_RE_SQ_MSN_SEARCH_NEXT_PSN_MASK) |
+          (((start_psn) << BNXT_RE_SQ_MSN_SEARCH_START_PSN_SHIFT) &
+           BNXT_RE_SQ_MSN_SEARCH_START_PSN_MASK));
+}
+
+/**
+ * Writes the MSN table entry for the WQE about to be posted at sq->tail.
+ *
+ * The packet count is msg_len divided by the MTU, rounded up, with a minimum
+ * of one. sq->psn advances by that count and sq->msn advances by one modulo
+ * msn_tbl_sz. Must run before the tail is advanced, because the entry
+ * records the current tail as the WQE start slot.
+ */
+__device__ static inline void bnxt_re_fill_psns_for_msntbl(struct bnxt_device_sq *sq,
+                                                           uint32_t msg_len) {
+  uint32_t npsn = 0, start_psn = 0, next_psn = 0;
+  struct bnxt_re_msns msns;
+  uint64_t *msns_ptr;
+  uint32_t pkt_cnt = 0;
+  /* Start slot index of the WQE */
+  uint32_t st_idx = sq->tail; // * BNXT_RE_STATIC_WQE_SIZE_SLOTS; Do we need this?
+  // Get the MSN table address
+  msns_ptr = (uint64_t *)bnxt_re_pull_psn_buff(sq);
+  // Start PSN is the last recorded PSN
+  // Calculate the packet count based on the len of the WQE/MTU
+  msns.start_idx_next_psn_start_psn = 0;
+  start_psn = sq->psn;
+  pkt_cnt = (msg_len / sq->mtu);
+
+  if (msg_len % sq->mtu)
+    pkt_cnt++;
+
+  /* Increment the psn even for 0 len packets
+   * e.g.
for opcode rdma-write-with-imm-data + * with length field = 0 + */ + if (msg_len == 0) + pkt_cnt = 1; + + /* make it 24 bit */ + next_psn = sq->psn + pkt_cnt; + npsn = next_psn; + sq->psn = next_psn; + msns.start_idx_next_psn_start_psn |= bnxt_re_update_msn_tbl(st_idx, npsn, start_psn); + sq->msn++; + sq->msn %= sq->msn_tbl_sz; + + memcpy(msns_ptr, &msns, sizeof(uint64_t)); +} + +__device__ static inline void bnxt_re_incr_tail(struct bnxt_device_sq *sq, uint8_t cnt) +{ + sq->tail += cnt; + if (sq->tail >= sq->depth) { + sq->tail %= sq->depth; + /* Rolled over, Toggle Tail bit in epoch flags */ + sq->flags ^= 1UL << BNXT_RE_FLAG_EPOCH_TAIL_SHIFT; + } +} + +__device__ static inline void* bnxt_re_get_hwqe(struct bnxt_device_sq *sq, uint32_t idx) +{ + idx += sq->tail; + if (idx >= sq->depth) + idx -= sq->depth; + return (void *)((char*)sq->buf + (idx << 4)); +} + +__device__ static inline void bnxt_re_incr_head(struct bnxt_device_cq *cq, uint8_t cnt) +{ + cq->head += cnt; + if (cq->head >= cq->depth) { + cq->head %= cq->depth; + /* Rolled over, Toggle HEAD bit in epoch flags */ + cq->flags ^= 1UL << BNXT_RE_FLAG_EPOCH_HEAD_SHIFT; + } +} + +__device__ static inline void bnxt_re_change_cq_phase(struct bnxt_device_cq *cq) +{ + if (!cq->head) { + cq->phase = !(cq->phase & BNXT_RE_BCQE_PH_MASK); + } +} + +__device__ static inline void aquire_lock(uint32_t *lock) { + uint32_t expected; + + do { + expected = 0; + } while (0 == __hip_atomic_compare_exchange_strong(lock, &expected, 1, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST, + __HIP_MEMORY_SCOPE_SYSTEM)); +} + +__device__ static inline void release_lock(uint32_t *lock) { + *lock = 0; +} + +__device__ void QueuePair::ring_cq_doorbell(uint32_t slot_idx) { + struct bnxt_re_db_hdr hdr; + uint32_t epoch; + + epoch = (cq.flags & BNXT_RE_FLAG_EPOCH_HEAD_MASK) << BNXT_RE_DB_EPOCH_HEAD_SHIFT; + + bnxt_re_init_db_hdr(&hdr, (slot_idx | epoch), 0, cq.flags, BNXT_RE_QUE_TYPE_CQ); + + __threadfence_system(); + __hip_atomic_store(dbr, 
hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+/**
+ * Rings the send-queue doorbell with producer index `slot_idx`, the tail
+ * epoch bit from sq.flags, and sq.id as the queue id.
+ */
+__device__ void QueuePair::ring_sq_doorbell(uint32_t slot_idx) {
+  struct bnxt_re_db_hdr hdr;
+  uint32_t epoch;
+
+  epoch = (sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT;
+
+  bnxt_re_init_db_hdr(&hdr, (slot_idx | epoch), 0, sq.id, BNXT_RE_QUE_TYPE_SQ);
+
+  /* Make the WQE and MSN writes visible before the doorbell store. */
+  __threadfence_system();
+  __hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+/**
+ * Checks the completion entry at cq.head.
+ *
+ * @return 1 when a valid entry was consumed (head advanced, CQ doorbell
+ *         rung, sq.posted decremented), 0 when the entry at the head is not
+ *         valid for the current phase. A completion whose status is not
+ *         BNXT_RE_REQ_ST_OK prints the status and aborts.
+ */
+__device__ int QueuePair::poll_cq() {
+  struct bnxt_re_bcqe *hdr;
+  void *cqe;
+  uint32_t flg_val;
+  int type;  /* NOTE(review): never used */
+  uint8_t status;
+
+  /* The base CQE sits directly after the request CQE inside one entry. */
+  cqe = (void*) ((char*) cq.buf + (cq.head * bnxt_re_get_cqe_sz()));
+  hdr = (struct bnxt_re_bcqe*) ((char*)cqe + sizeof(struct bnxt_re_req_cqe));
+
+  flg_val = hdr->flg_st_typ_ph;
+
+  __threadfence_system();
+
+  if (bnxt_re_is_cqe_valid(flg_val, cq.phase)) {
+    // Is the CQE valid?
+    status = (flg_val >> BNXT_RE_BCQE_STATUS_SHIFT)
+           & BNXT_RE_BCQE_STATUS_MASK;
+
+    if (status != BNXT_RE_REQ_ST_OK) {
+      printf("CQ Error (%x)\n", status);
+      abort();
+      return -1;  /* NOTE(review): not reached, abort() precedes it */
+    }
+
+    /* Update the CQ Ptr */
+    bnxt_re_incr_head(&cq, 1);
+    bnxt_re_change_cq_phase(&cq);
+
+    /* Ring Doorbell */
+    ring_cq_doorbell(cq.head);
+
+    __hip_atomic_fetch_sub(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
+
+    return 1;
+  }
+
+  return 0;
+}
+
+/**
+ * Polls the CQ until sq.posted reaches zero.
+ *
+ * Only the caller whose active-lane index is 0 does the polling, and it
+ * holds cq.lock while doing so; the other active lanes return immediately.
+ */
+__device__ void QueuePair::quiet() {
+  uint64_t active_lane_mask;
+  uint8_t active_lane_id;
+
+  active_lane_mask = get_active_lane_mask();
+  active_lane_id = get_active_lane_num(active_lane_mask);
+
+  if (0 == active_lane_id) {
+    aquire_lock(&cq.lock);
+    while (__hip_atomic_load(&sq.posted, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT)) {
+      poll_cq();
+    }
+    release_lock(&cq.lock);
+  }
+}
+
+/**
+ * Posts one three-slot RDMA work request for each active lane.
+ *
+ * The lane with active-lane index 0 takes sq.lock for the whole call. The
+ * lanes then take turns building their WQE, updating the MSN table,
+ * advancing the tail and ringing the SQ doorbell; quiet() runs after every
+ * turn. `pe` is not referenced in the body; the remote key is the rkey
+ * stored in this QueuePair.
+ */
+__device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
+  uint64_t active_lane_mask;
+  uint8_t active_lane_count;
+  uint8_t active_lane_id;
+
+  active_lane_mask = get_active_lane_mask();
+  active_lane_count
= get_active_lane_count(active_lane_mask); + active_lane_id = get_active_lane_num(active_lane_mask); + + if (0 == active_lane_id) { + aquire_lock(&sq.lock); + } + + for (int i = 0; i < active_lane_count; i++) { + if (i == active_lane_id) { + struct bnxt_re_bsqe hdr; + struct bnxt_re_rdma rdma; + struct bnxt_re_sge sge; + struct bnxt_re_bsqe *hdr_ptr; + struct bnxt_re_rdma *rdma_ptr; + struct bnxt_re_sge *sge_ptr; + uint32_t wqe_size; + uint32_t wqe_type; + uint32_t hdr_flags; + uint32_t rma_slots = 3; // (Three slots: hdr, rdma) + + hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0); + rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1); + sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2); + + /* Populate Header Segment */ + wqe_size = BNXT_RE_HDR_WS_MASK & rma_slots; + hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK) + & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED); + wqe_type = BNXT_RE_HDR_WT_MASK & opcode; + + hdr.rsv_ws_fl_wt = (wqe_size << BNXT_RE_HDR_WS_SHIFT) + | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT) + | wqe_type; + hdr.key_immd = 0; + hdr.lhdr.qkey_len = length; + + /* Populate RDMA Segment */ + rdma.rva = (uint64_t) raddr; + rdma.rkey = rkey; + + /* Populate SG Segment */ + sge.pa = (uint64_t) laddr; + sge.lkey = lkey; + sge.length = length; + + /* Write WQE to SQ */ + memcpy(hdr_ptr, &hdr, sizeof(struct bnxt_re_bsqe)); + memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma)); + memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge)); + + /* Populate MSN Table */ + bnxt_re_fill_psns_for_msntbl(&sq, length); + + /* Update SQ Pointer */ + bnxt_re_incr_tail(&sq, rma_slots); + + /* Ring Doorbell */ + ring_sq_doorbell(sq.tail); + + __hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); + + } + __threadfence_system(); + quiet(); + } + + if (0 == active_lane_id) { + release_lock(&sq.lock); + } +} + +__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t length, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, 
int64_t atomic_cmp, bool fetching) { + uint64_t active_lane_mask; + uint8_t active_lane_count; + uint8_t active_lane_id; + + active_lane_mask = get_active_lane_mask(); + active_lane_count = get_active_lane_count(active_lane_mask); + active_lane_id = get_active_lane_num(active_lane_mask); + + if (0 == active_lane_id) { + aquire_lock(&sq.lock); + } + + for (int i = 0; i < active_lane_count; i++) { + if (i == active_lane_id) { + struct bnxt_re_bsqe hdr; + struct bnxt_re_atomic amo; + struct bnxt_re_sge sge; + struct bnxt_re_bsqe *hdr_ptr; + struct bnxt_re_atomic *amo_ptr; + struct bnxt_re_sge *sge_ptr; + uint32_t wqe_size; + uint32_t wqe_type; + uint32_t hdr_flags; + uint32_t amo_slots = 3; // (Three slots: hdr, amo, sge) + + hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0); + amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&sq, 1); + sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2); + + /* Populate Header Segment */ + wqe_size = BNXT_RE_HDR_WS_MASK & amo_slots; + hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK) + & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED); + wqe_type = BNXT_RE_HDR_WT_MASK & opcode; + + hdr.rsv_ws_fl_wt = (wqe_size << BNXT_RE_HDR_WS_SHIFT) + | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT) + | wqe_type; + hdr.key_immd = rkey; + hdr.lhdr.rva = (uint64_t) raddr; + + /* Populate AMO Segment */ + amo.swp_dt = atomic_data; + + /* Populate SG Segment - (Return address of atomic) */ + sge.pa = (uint64_t) nonfetching_atomic; + sge.lkey = nonfetching_atomic_lkey; + sge.length = length; + + /* Write WQE to SQ */ + memcpy(hdr_ptr, &hdr, sizeof(struct bnxt_re_bsqe)); + memcpy(amo_ptr, &amo, sizeof(struct bnxt_re_atomic)); + memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge)); + + /* Populate MSN Table */ + bnxt_re_fill_psns_for_msntbl(&sq, length); + + /* Update SQ Pointer */ + bnxt_re_incr_tail(&sq, amo_slots); + + /* Ring Doorbell */ + ring_sq_doorbell(sq.tail); + + __hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); + } + 
__threadfence_system(); + quiet(); + } + + if (0 == active_lane_id) { + release_lock(&sq.lock); + } + + return 0; +} + +} // namespace rocshmem diff --git a/src/gda/context_gda_device.cpp b/src/gda/context_gda_device.cpp new file mode 100644 index 0000000000..52eef85623 --- /dev/null +++ b/src/gda/context_gda_device.cpp @@ -0,0 +1,306 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/
+
+#include
+#include
+
+#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir)
+#include "rocshmem/rocshmem.hpp"
+#include "backend_gda.hpp"
+#include "context_gda_device.hpp"
+#include "context_gda_tmpl_device.hpp"
+#include "queue_pair.hpp"
+
+namespace rocshmem {
+
+/* NOTE(review): template argument lists (static_cast<...>,
+ * reinterpret_cast<...>) and the two system include targets above are
+ * missing in this copy of the patch; they are left exactly as found. */
+
+/**
+ * Builds the context for id `ctx_id`.
+ *
+ * Caches the heap bases and synchronisation buffers of the backend, then
+ * allocates num_pes QueuePair objects with hipMalloc and fills entry i from
+ * backend->gpu_qps[num_pes * ctx_id + i].
+ */
+__host__ GDAContext::GDAContext(Backend *b, unsigned int ctx_id)
+    : Context(b, false) {
+  GDABackend *backend{static_cast(b)};
+  base_heap = backend->heap.get_heap_bases().data();
+
+  barrier_sync = backend->barrier_sync;
+  wrk_sync_pool_bases_ = backend->get_wrk_sync_bases();
+
+  CHECK_HIP(hipMalloc(&qps, sizeof(QueuePair) * num_pes));
+  CHECK_HIP(hipMemset(qps, 0, sizeof(QueuePair) * num_pes));
+  for (int i = 0; i < num_pes; i++) {
+    int offset = num_pes * ctx_id + i;
+    CHECK_HIP(hipMemcpy(&qps[i], &backend->gpu_qps[offset], sizeof(QueuePair), hipMemcpyDefault));
+    /* qps comes from hipMalloc, so the host must not store through it.
+     * Copy the pointer value with hipMemcpy instead; forming the member
+     * address does not access device memory. */
+    CHECK_HIP(hipMemcpy(&qps[i].base_heap, &base_heap, sizeof(base_heap), hipMemcpyDefault));
+  }
+  ctx_id_ = ctx_id;
+}
+
+/** Frees the QueuePair array allocated by the constructor. */
+__host__ GDAContext::~GDAContext() {
+  CHECK_HIP(hipFree(qps));
+}
+
+/** No device-side set-up is needed. */
+__device__ void GDAContext::ctx_create() {
+}
+
+/** No device-side tear-down is needed. */
+__device__ void GDAContext::ctx_destroy(){
+}
+
+/**
+ * Blocking put of `nelems` bytes from `source` to the location on `pe` that
+ * has the same heap offset as `dest` has locally.
+ *
+ * Lanes are processed in rounds. Each round takes the `pe` of the lowest
+ * lane still waiting; every waiting lane with that same `pe` posts its put
+ * and calls quiet() on that queue pair in this round.
+ */
+__device__ void GDAContext::putmem(void *dest, const void *source, size_t nelems,
+                                   int pe) {
+  uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe];
+  bool need_turn {true};
+  uint64_t turns = __ballot(need_turn);
+  while (turns) {
+    uint8_t lane = __ffsll((unsigned long long)turns) - 1;
+    int pe_turn = __shfl(pe, lane);
+    if (pe_turn == pe) {
+      qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe);
+      qps[pe].quiet();
+      need_turn = false;
+    }
+    turns = __ballot(need_turn);
+  }
+}
+
+/** Not implemented: prints a message and aborts. */
+__device__ void GDAContext::getmem(void *dest, const void *source, size_t nelems,
+                                   int pe) {
+  printf("rocshmem::gda:getmem not implemented\n");
+  abort();
+}
+
+/** Same as putmem() but without the quiet() after posting. */
+__device__ void GDAContext::putmem_nbi(void *dest, const void *source,
+                                       size_t nelems, int pe) {
+  uint64_t L_offset = reinterpret_cast(dest) -
base_heap[my_pe];
+  bool need_turn {true};
+  uint64_t turns = __ballot(need_turn);
+  /* Rounds: each round serves the lanes whose `pe` matches that of the
+   * lowest lane still waiting. */
+  while (turns) {
+    uint8_t lane = __ffsll((unsigned long long)turns) - 1;
+    int pe_turn = __shfl(pe, lane);
+    if (pe_turn == pe) {
+      qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe);
+      need_turn = false;
+    }
+    turns = __ballot(need_turn);
+  }
+}
+
+/** Not implemented: prints a message and aborts. */
+__device__ void GDAContext::getmem_nbi(void *dest, const void *source,
+                                       size_t nelems, int pe) {
+  printf("rocshmem::gda:getmem_nbi not implemented\n");
+  abort();
+}
+
+/** Implemented as a full quiet() on every queue pair followed by a system-scope fence. */
+__device__ void GDAContext::fence() { //TODO: optimize
+  for (int i = 0; i < num_pes; i++) {
+    qps[i].quiet();
+  }
+  __threadfence_system();
+}
+
+/** Per-PE fence; currently forwards to the all-PE fence() and ignores `pe`. */
+__device__ void GDAContext::fence(int pe) {
+  fence(); //TODO: optimize
+}
+
+/** Calls quiet() on the queue pair of every PE. */
+__device__ void GDAContext::quiet() {
+  for (int i = 0; i < num_pes; i++) {
+    qps[i].quiet();
+  }
+}
+
+/** Always returns nullptr. */
+__device__ void *GDAContext::shmem_ptr(const void *dest, int pe) {
+  return nullptr;
+}
+
+/** Not implemented: thread zero of the block prints a message and aborts. */
+__device__ void GDAContext::putmem_wg(void *dest, const void *source,
+                                      size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {
+    printf("rocshmem::gda:putmem_wg not implemented\n");
+    abort();
+  }
+}
+
+/** Not implemented: thread zero of the block prints a message and aborts. */
+__device__ void GDAContext::getmem_wg(void *dest, const void *source,
+                                      size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {
+    printf("rocshmem::gda:getmem_wg not implemented\n");
+    abort();
+  }
+}
+
+/** Not implemented: thread zero of the block prints a message and aborts. */
+__device__ void GDAContext::putmem_nbi_wg(void *dest, const void *source,
+                                          size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {
+    printf("rocshmem::gda:putmem_nbi_wg not implemented\n");
+    abort();
+  }
+}
+
+/** Not implemented: thread zero of the block prints a message and aborts. */
+__device__ void GDAContext::getmem_nbi_wg(void *dest, const void *source,
+                                          size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {
+    printf("rocshmem::gda:getmem_nbi_wg not implemented\n");
+    abort();
+  }
+}
+
+/** Wave-level blocking put: thread zero of the wave posts the put and then waits with quiet(). */
+__device__ void GDAContext::putmem_wave(void *dest, const void *source,
+                                        size_t nelems, int pe) {
+  uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe];
+  if (is_thread_zero_in_wave()) {
+
qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe);
+    qps[pe].quiet();
+  }
+}
+
+/** Not implemented: thread zero of the wave prints a message and aborts. */
+__device__ void GDAContext::getmem_wave(void *dest, const void *source,
+                                        size_t nelems, int pe) {
+  if (is_thread_zero_in_wave()) {
+    printf("rocshmem::gda:getmem_wave not implemented\n");
+    abort();
+  }
+}
+
+/** Wave-level non-blocking put: thread zero of the wave posts the put without waiting. */
+__device__ void GDAContext::putmem_nbi_wave(void *dest, const void *source,
+                                            size_t nelems, int pe) {
+  uint64_t L_offset = reinterpret_cast(dest) - base_heap[my_pe];
+  if (is_thread_zero_in_wave()) {
+    qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe);
+  }
+}
+
+/** Not implemented: thread zero of the wave prints a message and aborts. */
+__device__ void GDAContext::getmem_nbi_wave(void *dest, const void *source,
+                                            size_t nelems, int pe) {
+  if (is_thread_zero_in_wave()) {
+    printf("rocshmem::gda:getmem_nbi_wave not implemented\n");
+    abort();
+  }
+}
+
+
+//TODO: copied from IPC, needs review
+/**
+ * Put followed by a signal update on `pe`.
+ *
+ * Runs putmem(), then fence(), then applies `signal` to `sig_addr` with
+ * amo_set (ROCSHMEM_SIGNAL_SET) or amo_add (ROCSHMEM_SIGNAL_ADD). Any other
+ * sig_op is only reported through DPRINTF; no signal is written.
+ */
+__device__ void GDAContext::putmem_signal(void *dest, const void *source, size_t nelems,
+                                          uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                          int pe) {
+  putmem(dest, source, nelems, pe);
+  fence();
+
+  switch (sig_op) {
+    case ROCSHMEM_SIGNAL_SET:
+      amo_set(static_cast(sig_addr), signal, pe);
+      break;
+    case ROCSHMEM_SIGNAL_ADD:
+      amo_add(static_cast(sig_addr), signal, pe);
+      break;
+    default:
+      DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op);
+      break;
+  }
+  //TODO: missing quiet_pe?
+}
+
+/**
+ * Work-group variant: putmem_wg(), fence(), then thread zero of the block
+ * updates the signal. NOTE(review): putmem_wg() is a stub that aborts, so
+ * this path cannot complete yet.
+ */
+__device__ void GDAContext::putmem_signal_wg(void *dest, const void *source, size_t nelems,
+                                             uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                             int pe) {
+  putmem_wg(dest, source, nelems, pe);
+  fence();
+
+  if (is_thread_zero_in_block()) {
+    switch (sig_op) {
+      case ROCSHMEM_SIGNAL_SET:
+        amo_set(static_cast(sig_addr), signal, pe);
+        break;
+      case ROCSHMEM_SIGNAL_ADD:
+        amo_add(static_cast(sig_addr), signal, pe);
+        break;
+      default:
+        DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op);
+        break;
+    }
+    //TODO: missing quiet_pe?
+ } +} + +__device__ void GDAContext::putmem_signal_wave(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_wave(dest, source, nelems, pe); + fence(); + + if (is_thread_zero_in_wave()) { + switch (sig_op) { + case ROCSHMEM_SIGNAL_SET: + amo_set(static_cast(sig_addr), signal, pe); + break; + case ROCSHMEM_SIGNAL_ADD: + amo_add(static_cast(sig_addr), signal, pe); + break; + default: + DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op); + break; + } + //TODO: missing quiet_pe? + } +} + +__device__ void GDAContext::putmem_signal_nbi(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_signal(dest, source, nelems, sig_addr, signal, sig_op, pe); //TODO: optimize +} + +__device__ void GDAContext::putmem_signal_nbi_wg(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_signal_wg(dest, source, nelems, sig_addr, signal, sig_op, pe); //TODO: optimize +} + +__device__ void GDAContext::putmem_signal_nbi_wave(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_signal_wave(dest, source, nelems, sig_addr, signal, sig_op, pe); //TODO: optimize +} + +__device__ uint64_t GDAContext::signal_fetch(const uint64_t *sig_addr) { + uint64_t *dst = const_cast(sig_addr); + return amo_fetch_add(static_cast(dst), 0, my_pe); +} + +__device__ uint64_t GDAContext::signal_fetch_wg(const uint64_t *sig_addr) { + __shared__ uint64_t value; + if (is_thread_zero_in_block()) { + uint64_t *dst = const_cast(sig_addr); + value = amo_fetch_add(static_cast(dst), 0, my_pe); + } + __threadfence_block(); + return value; +} + +__device__ uint64_t GDAContext::signal_fetch_wave(const uint64_t *sig_addr) { + uint64_t value; + if (is_thread_zero_in_wave()) { + uint64_t *dst = const_cast(sig_addr); + value = amo_fetch_add(static_cast(dst), 0, 
my_pe); + } + __threadfence_block(); + value = __shfl(value, 0); + return value; +} + +} // namespace rocshmem diff --git a/src/gda/context_gda_device.hpp b/src/gda/context_gda_device.hpp new file mode 100644 index 0000000000..b555e84a58 --- /dev/null +++ b/src/gda/context_gda_device.hpp @@ -0,0 +1,309 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_CONTEXT_DEVICE_HPP_ +#define LIBRARY_SRC_GDA_CONTEXT_DEVICE_HPP_ + +#include "context.hpp" +#include "team.hpp" + +namespace rocshmem { + +class QueuePair; + +class GDAContext : public Context { + public: + __host__ GDAContext(Backend *b, unsigned int ctx_id); + + __host__ ~GDAContext(); + + __device__ GDAContext(Backend *b, unsigned int ctx_id); //TODO is this used? + + __device__ void ctx_create(); + + __device__ void ctx_destroy(); + + __device__ void putmem(void *dest, const void *source, size_t nelems, int pe); + + __device__ void getmem(void *dest, const void *source, size_t nelems, int pe); + + __device__ void putmem_nbi(void *dest, const void *source, size_t nelems, + int pe); + + __device__ void getmem_nbi(void *dest, const void *source, size_t size, + int pe); + + __device__ void fence(); + + __device__ void fence(int pe); + + __device__ void quiet(); + + __device__ void *shmem_ptr(const void *dest, int pe); + + __device__ void barrier_all(); + + __device__ void barrier_all_wave(); + + __device__ void barrier_all_wg(); + + __device__ void barrier(rocshmem_team_t team); + + __device__ void barrier_wave(rocshmem_team_t team); + + __device__ void barrier_wg(rocshmem_team_t team); + + __device__ void sync_all(); + + __device__ void sync_all_wave(); + + __device__ void sync_all_wg(); + + __device__ void sync(rocshmem_team_t team); + + __device__ void sync_wave(rocshmem_team_t team); + + __device__ void sync_wg(rocshmem_team_t team); + + template + __device__ void p(T *dest, T value, int pe); + + template + __device__ void put(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void put_nbi(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ T g(const T *source, int pe); + + template + __device__ void get(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void get_nbi(T *dest, const 
T *source, size_t nelems, int pe); + + // Atomic operations + template + __device__ void amo_add(void *dst, T value, int pe); + + template + __device__ void amo_set(void *dst, T value, int pe); + + template + __device__ T amo_swap(void *dst, T value, int pe); + + template + __device__ T amo_fetch_and(void *dst, T value, int pe); + + template + __device__ void amo_and(void *dst, T value, int pe); + + template + __device__ T amo_fetch_or(void *dst, T value, int pe); + + template + __device__ void amo_or(void *dst, T value, int pe); + + template + __device__ T amo_fetch_xor(void *dst, T value, int pe); + + template + __device__ void amo_xor(void *dst, T value, int pe); + + template + __device__ void amo_cas(void *dst, T value, T cond, int pe); + + template + __device__ T amo_fetch_add(void *dst, T value, int pe); + + template + __device__ T amo_fetch_cas(void *dst, T value, T cond, int pe); + + // Collectives + template + __device__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce); + + template + __device__ void broadcast(rocshmem_team_t team, T *dest, const T *source, + int nelems, int pe_root); + + template + __device__ void alltoall(rocshmem_team_t team, T *dest, const T *source, + int nelems); + template + __device__ void fcollect(rocshmem_team_t team, T *dest, const T *source, + int nelems); + + + // Block/wave functions + __device__ void putmem_wg(void *dest, const void *source, size_t nelems, + int pe); + + __device__ void getmem_wg(void *dest, const void *source, size_t nelems, + int pe); + + __device__ void putmem_nbi_wg(void *dest, const void *source, size_t nelems, + int pe); + + __device__ void getmem_nbi_wg(void *dest, const void *source, size_t size, + int pe); + + __device__ void putmem_wave(void *dest, const void *source, size_t nelems, + int pe); + + __device__ void getmem_wave(void *dest, const void *source, size_t nelems, + int pe); + + __device__ void putmem_nbi_wave(void *dest, const void *source, size_t nelems, + int pe); + 
+ __device__ void getmem_nbi_wave(void *dest, const void *source, size_t size, + int pe); + + template + __device__ void put_wg(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void put_nbi_wg(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void put_wave(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void put_nbi_wave(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void get_wg(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void get_nbi_wg(T *dest, const T *source, size_t nelems, int pe); + + + template + __device__ void get_wave(T *dest, const T *source, size_t nelems, int pe); + + template + __device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe); + +#define GDA_CONTEXT_PUT_SIGNAL_DEC(SUFFIX) \ + template \ + __device__ void put_signal##SUFFIX(T *dest, const T *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe); \ + \ + __device__ void putmem_signal##SUFFIX(void *dest, const void *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe); + + GDA_CONTEXT_PUT_SIGNAL_DEC() + GDA_CONTEXT_PUT_SIGNAL_DEC(_wg) + GDA_CONTEXT_PUT_SIGNAL_DEC(_wave) + GDA_CONTEXT_PUT_SIGNAL_DEC(_nbi) + GDA_CONTEXT_PUT_SIGNAL_DEC(_nbi_wg) + GDA_CONTEXT_PUT_SIGNAL_DEC(_nbi_wave) + + __device__ uint64_t signal_fetch(const uint64_t *sig_addr); + __device__ uint64_t signal_fetch_wg(const uint64_t *sig_addr); + __device__ uint64_t signal_fetch_wave(const uint64_t *sig_addr); + + private: + + //internal functions used by collective operations + template + __device__ void internal_broadcast(T *dest, const T *source, int nelems, int pe_root, + int pe_start, int stride, int pe_size, + long *p_sync); // NOLINT(runtime/int) + + template + __device__ void internal_put_broadcast(T *dst, const T *src, int nelems, + int pe_root, int PE_start, + int logPE_stride, int PE_size); // 
NOLINT(runtime/int) + + template + __device__ void internal_get_broadcast(T *dst, const T *src, int nelems, + int pe_root); // NOLINT(runtime/int) + + template + __device__ void fcollect_linear(rocshmem_team_t team, T *dest, + const T *source, int nelems); + + template + __device__ void alltoall_linear(rocshmem_team_t team, T *dest, + const T *source, int nelems); + + __device__ void internal_sync(int pe, int PE_start, int stride, int PE_size, + int64_t *pSync); + + __device__ void internal_sync_wave(int pe, int PE_start, int stride, int PE_size, + int64_t *pSync); + + __device__ void internal_sync_wg(int pe, int PE_start, int stride, int PE_size, + int64_t *pSync); + + __device__ void internal_direct_barrier(int pe, int PE_start, int stride, + int n_pes, int64_t *pSync); + + __device__ void internal_atomic_barrier(int pe, int PE_start, int stride, + int n_pes, int64_t *pSync); + + template + __device__ void internal_direct_allreduce(T *dst, const T *src, + int nelems, GDATeam *team_obj); + template + __device__ void internal_ring_allreduce(T *dst, const T *src, + int nelems, GDATeam *team_obj, + int n_seg, int seg_size, int chunk_size); + + + //Temporary scratchpad memory used by internal barrier algorithms. 
+ int64_t *barrier_sync{nullptr}; + + /** + * @brief Array containing the addresses of the work/sync buffer bases + * of other PEs + */ + char **wrk_sync_pool_bases_{nullptr}; + + /** + * @brief Device context Id + */ + unsigned int ctx_id_{}; + + public: + QueuePair *qps{nullptr}; + + char *const *base_heap{nullptr}; + + //TODO(Avinash): + //Make tinfo private variable, it requires changes to the context + //creation API in backend + + //Team information for the team associated with the context + TeamInfo *tinfo{nullptr}; +}; + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_CONTEXT_DEVICE_HPP_ diff --git a/src/gda/context_gda_device_coll.cpp b/src/gda/context_gda_device_coll.cpp new file mode 100644 index 0000000000..be224fb302 --- /dev/null +++ b/src/gda/context_gda_device_coll.cpp @@ -0,0 +1,242 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "rocshmem/rocshmem.hpp" +#include "context_incl.hpp" +#include "context_gda_tmpl_device.hpp" +#include "util.hpp" +#include "gda_team.hpp" + +namespace rocshmem { + +__device__ void GDAContext::internal_direct_barrier(int pe, int PE_start, + int stride, int n_pes, + int64_t *pSync) { + int64_t flag_val{1}; + if (pe == PE_start) { + // Go through all PE offsets (except current offset = 0) + // and wait until they all reach +#if defined(__gfx90a__) + __threadfence_system(); +#endif /* __gfx90a__ */ + for (int i = 1; i < n_pes; i++) { + wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val); + pSync[i] = ROCSHMEM_SYNC_VALUE; + } + __threadfence_system(); + + // Announce to other PEs that all have reached + for (int i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) { + pSync[0] = flag_val; + put(&pSync[0], &pSync[0], 1, j); +#if defined(__gfx90a__) + __threadfence_system(); +#endif /* __gfx90a__ */ + } + pSync[0] = ROCSHMEM_SYNC_VALUE; + } else { + // Mark current PE offset as reached + size_t pe_offset = (pe - PE_start) / stride; + pSync[pe_offset] = flag_val; + put(&pSync[pe_offset], &pSync[pe_offset], 1, PE_start); +#if defined(__gfx90a__) + __threadfence_system(); +#endif /* __gfx90a__ */ + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); + pSync[0] = ROCSHMEM_SYNC_VALUE; + pSync[pe_offset] = ROCSHMEM_SYNC_VALUE; + __threadfence_system(); + } +} + +__device__ void GDAContext::internal_atomic_barrier(int pe, int PE_start, + int stride, int n_pes, + int64_t *pSync) { + int64_t flag_val{1}; + if (pe == PE_start) { + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1)); + pSync[0] = 
ROCSHMEM_SYNC_VALUE; + __threadfence_system(); + + pSync[0] = flag_val; + for (int i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) { + put_nbi(&pSync[0], &pSync[0], 1, j); + } + quiet(); + pSync[0] = ROCSHMEM_SYNC_VALUE; + } else { + amo_add(&pSync[0], flag_val, PE_start); + wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val); + pSync[0] = ROCSHMEM_SYNC_VALUE; + __threadfence_system(); + } +} + +__device__ void GDAContext::internal_sync(int pe, int PE_start, int stride, + int PE_size, int64_t *pSync) { + if (PE_size < 64) { + internal_direct_barrier(pe, PE_start, stride, PE_size, pSync); + } else { + internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync); + } +} + +__device__ void GDAContext::internal_sync_wave(int pe, int PE_start, int stride, + int PE_size, int64_t *pSync) { + if (is_thread_zero_in_wave()) { + if (PE_size < 64) { + internal_direct_barrier(pe, PE_start, stride, PE_size, pSync); + } else { + internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync); + } + } +} + +__device__ void GDAContext::internal_sync_wg(int pe, int PE_start, int stride, + int PE_size, int64_t *pSync) { + __syncthreads(); + if (is_thread_zero_in_block()) { + if (PE_size < 64) { + internal_direct_barrier(pe, PE_start, stride, PE_size, pSync); + } else { + internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync); + } + } + __threadfence_system(); + __syncthreads(); +} + +__device__ void GDAContext::sync(rocshmem_team_t team) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe = team_obj->my_pe_in_world; + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_stride = team_obj->tinfo_wrt_world->stride; + int pe_size = team_obj->num_pes; + long *p_sync = team_obj->barrier_pSync; + + internal_sync(pe, pe_start, pe_stride, pe_size, p_sync); +} + +__device__ void GDAContext::sync_wave(rocshmem_team_t team) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe = team_obj->my_pe_in_world; + int pe_start = team_obj->tinfo_wrt_world->pe_start; + 
int pe_stride = team_obj->tinfo_wrt_world->stride; + int pe_size = team_obj->num_pes; + long *p_sync = team_obj->barrier_pSync; + + internal_sync_wave(pe, pe_start, pe_stride, pe_size, p_sync); +} + +__device__ void GDAContext::sync_wg(rocshmem_team_t team) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe = team_obj->my_pe_in_world; + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_stride = team_obj->tinfo_wrt_world->stride; + int pe_size = team_obj->num_pes; + long *p_sync = team_obj->barrier_pSync; + + internal_sync_wg(pe, pe_start, pe_stride, pe_size, p_sync); +} + +__device__ void GDAContext::sync_all() { + internal_sync(my_pe, 0, 1, num_pes, barrier_sync); +} + +__device__ void GDAContext::sync_all_wave() { + internal_sync_wave(my_pe, 0, 1, num_pes, barrier_sync); +} + +__device__ void GDAContext::sync_all_wg() { + internal_sync_wg(my_pe, 0, 1, num_pes, barrier_sync); +} + +__device__ void GDAContext::barrier_all() { + quiet(); + sync_all(); +} + +__device__ void GDAContext::barrier_all_wave() { + if (is_thread_zero_in_wave()) { + quiet(); + } + sync_all_wave(); +} + +__device__ void GDAContext::barrier_all_wg() { + if (is_thread_zero_in_block()) { + quiet(); + } + sync_all_wg(); + __syncthreads(); +} + +__device__ void GDAContext::barrier(rocshmem_team_t team) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe = team_obj->my_pe_in_world; + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_stride = team_obj->tinfo_wrt_world->stride; + int pe_size = team_obj->num_pes; + long *p_sync = team_obj->barrier_pSync; + + quiet(); + internal_sync(pe, pe_start, pe_stride, pe_size, p_sync); +} + +__device__ void GDAContext::barrier_wave(rocshmem_team_t team) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe = team_obj->my_pe_in_world; + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_stride = team_obj->tinfo_wrt_world->stride; + int pe_size = team_obj->num_pes; + long *p_sync = team_obj->barrier_pSync; + + if 
(is_thread_zero_in_wave()) { + quiet(); + } + internal_sync_wave(pe, pe_start, pe_stride, pe_size, p_sync); +} + +__device__ void GDAContext::barrier_wg(rocshmem_team_t team) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe = team_obj->my_pe_in_world; + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_stride = team_obj->tinfo_wrt_world->stride; + int pe_size = team_obj->num_pes; + long *p_sync = team_obj->barrier_pSync; + + if (is_thread_zero_in_block()) { + quiet(); + } + internal_sync_wg(pe, pe_start, pe_stride, pe_size, p_sync); + __syncthreads(); +} + +} // namespace rocshmem diff --git a/src/gda/context_gda_host.cpp b/src/gda/context_gda_host.cpp new file mode 100644 index 0000000000..5190e8637f --- /dev/null +++ b/src/gda/context_gda_host.cpp @@ -0,0 +1,94 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "context_gda_host.hpp" + +#include + +#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) +#include "backend_type.hpp" +#include "context_incl.hpp" +#include "backend_gda.hpp" +#include "host/host.hpp" + +namespace rocshmem { + +__host__ GDAHostContext::GDAHostContext(Backend *backend, + [[maybe_unused]] int64_t options) + : Context(backend, true) { + GDABackend *b{static_cast(backend)}; + + host_interface = b->host_interface; + + context_window_info = host_interface->acquire_window_context(); +} + +__host__ GDAHostContext::~GDAHostContext() { + host_interface->release_window_context(context_window_info); +} + +__host__ void GDAHostContext::putmem_nbi(void *dest, const void *source, + size_t nelems, int pe) { + host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info); +} + +__host__ void GDAHostContext::getmem_nbi(void *dest, const void *source, + size_t nelems, int pe) { + host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info); +} + +__host__ void GDAHostContext::putmem(void *dest, const void *source, + size_t nelems, int pe) { + host_interface->putmem(dest, source, nelems, pe, context_window_info); +} + +__host__ void GDAHostContext::getmem(void *dest, const void *source, + size_t nelems, int pe) { + host_interface->getmem(dest, source, nelems, pe, context_window_info); +} + +__host__ void GDAHostContext::fence() { + host_interface->fence(context_window_info); +} + +__host__ void GDAHostContext::quiet() { + host_interface->quiet(context_window_info); +} + +__host__ void *GDAHostContext::shmem_ptr(const void *dest, int pe) { + void *ret = nullptr; + 
//not implemented, returning nullptr is spec-valid + //TODO: copy ipc handover from RO when IPC+GDA is implemented + return ret; +} + +__host__ void GDAHostContext::sync_all() { + host_interface->sync_all(context_window_info); +} + +__host__ void GDAHostContext::barrier_all() { + host_interface->barrier_all(context_window_info); +} + +} // namespace rocshmem diff --git a/src/gda/context_gda_host.hpp b/src/gda/context_gda_host.hpp new file mode 100644 index 0000000000..7f7f86b4d6 --- /dev/null +++ b/src/gda/context_gda_host.hpp @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_CONTEXT_HOST_HPP_ +#define LIBRARY_SRC_GDA_CONTEXT_HOST_HPP_ + +#include "context.hpp" + +namespace rocshmem { + +class GDAHostContext : public Context { + public: + __host__ GDAHostContext(Backend *b, int64_t options); + + __host__ ~GDAHostContext(); + + template + __host__ void p(T *dest, T value, int pe); + + template + __host__ T g(const T *source, int pe); + + template + __host__ void put(T *dest, const T *source, size_t nelems, int pe); + + template + __host__ void get(T *dest, const T *source, size_t nelems, int pe); + + template + __host__ void put_nbi(T *dest, const T *source, size_t nelems, int pe); + + template + __host__ void get_nbi(T *dest, const T *source, size_t nelems, int pe); + + __host__ void putmem(void *dest, const void *source, size_t nelems, int pe); + + __host__ void getmem(void *dest, const void *source, size_t nelems, int pe); + + __host__ void putmem_nbi(void *dest, const void *source, size_t nelems, + int pe); + + __host__ void getmem_nbi(void *dest, const void *source, size_t size, int pe); + + template + __host__ void amo_add(void *dst, T value, int pe); + + template + __host__ void amo_cas(void *dst, T value, T cond, int pe); + + template + __host__ T amo_fetch_add(void *dst, T value, int pe); + + template + __host__ T amo_fetch_cas(void *dst, T value, T cond, int pe); + + __host__ void fence(); + + __host__ void quiet(); + + __host__ void *shmem_ptr(const void *dest, int pe); + + __host__ void barrier_all(); + + __host__ void sync_all(); + + template + __host__ void broadcast(T *dest, const T *source, int nelems, int pe_root, + int pe_start, int log_pe_stride, int pe_size, + long *p_sync); + + template + __host__ void broadcast(rocshmem_team_t team, T *dest, const T *source, + int nelems, int pe_root); + + template + __host__ void to_all(T *dest, const T *source, int nreduce, int pe_start, + int log_pe_stride, int pe_size, T 
*p_wrk, + long *p_sync); + + template + __host__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce); + + template + __host__ void wait_until(T *ivars, int cmp, T val); + + template + __host__ size_t wait_until_any(T *ivars, size_t nelems, + const int *status, + int cmp, T val); + + template + __host__ void wait_until_all(T *ivars, size_t nelems, + const int *status, + int cmp, T val); + + template + __host__ size_t wait_until_some(T *ivars, size_t nelems, + size_t* indices, + const int *status, + int cmp, T val); + + template + __host__ void wait_until_all_vector(T *ivars, size_t nelems, + const int *status, + int cmp, T* vals); + + template + __host__ size_t wait_until_any_vector(T *ivars, size_t nelems, + const int *status, + int cmp, T* vals); + + template + __host__ size_t wait_until_some_vector(T *ivars, size_t nelems, + size_t* indices, + const int *status, + int cmp, T* vals); + + template + __host__ int test(T *ivars, int cmp, T val); + + public: + /* Shared pointer to the backend's host interface */ + std::shared_ptr host_interface{nullptr}; + + /* An MPI Window implements a context */ + WindowInfo *context_window_info{nullptr}; +}; + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_CONTEXT_HOST_HPP_ diff --git a/src/gda/context_gda_tmpl_device.hpp b/src/gda/context_gda_tmpl_device.hpp new file mode 100644 index 0000000000..265afb232d --- /dev/null +++ b/src/gda/context_gda_tmpl_device.hpp @@ -0,0 +1,636 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_CONTEXT_TMPL_DEVICE_HPP_ +#define LIBRARY_SRC_GDA_CONTEXT_TMPL_DEVICE_HPP_ + +#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) +#include "rocshmem/rocshmem.hpp" +#include "util.hpp" +#include "context_gda_device.hpp" +#include "gda_team.hpp" +#include "queue_pair.hpp" +#include "rocshmem_calc.hpp" + +#include + +namespace rocshmem { + +/****************************************************************************** + ************************** TEMPLATE SPECIALIZATIONS ************************** + *****************************************************************************/ +template +__device__ void GDAContext::p(T *dest, T value, int pe) { + printf("rocshmem::gda:p not implemented\n"); + abort(); + //TODO the following is incorrect because value is not ibv registered memory + //putmem_nbi(dest, &value, sizeof(T), pe); +} + +template +__device__ void GDAContext::put(T *dest, const T *source, size_t nelems, int pe) { + putmem(dest, source, nelems * sizeof(T), pe); +} + +template +__device__ void GDAContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) { + putmem_nbi(dest, source, sizeof(T) * nelems, pe); +} + +template +__device__ T GDAContext::g(const T *source, int pe) { + T ret; + printf("rocshmem::gda:g not implemented\n"); + abort(); + //TODO the following is incorrect because ret is not ibv registered memory + //getmem(&ret, source, sizeof(T), pe); + return ret; +} + +template +__device__ void GDAContext::get(T *dest, const T *source, size_t nelems, int pe) { + getmem(dest, source, sizeof(T) * nelems, pe); +} + +template +__device__ void GDAContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) { + getmem_nbi(dest, source, sizeof(T) * nelems, pe); +} + +// Atomics +template +__device__ void GDAContext::amo_add(void *dst, T value, int pe) { + if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_add not 
implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t + uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; + bool need_turn {true}; + uint64_t turns = __ballot(need_turn); + while (turns) { + uint8_t lane = __ffsll((unsigned long long)turns) - 1; + int pe_turn = __shfl(pe, lane); + if (pe_turn == pe) { + qps[pe].atomic_nofetch(base_heap[pe] + L_offset, value, 0, pe, GDA_OP_ATOMIC_FA); + need_turn = false; + } + turns = __ballot(need_turn); + } +} + +template +__device__ void GDAContext::amo_set(void *dst, T value, int pe) { + if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_set not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t + uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; + T ret_val; + T cond = 0; + for (int i = 0; i < WF_SIZE; i++) { //TODO: this looks wrong + while ((ret_val = qps[pe].atomic_fetch(base_heap[pe] + L_offset, value, cond, pe, GDA_OP_ATOMIC_CS))) { + if (ret_val == cond) { break; } + cond = ret_val; + } + } +} + +template +__device__ T GDAContext::amo_swap(void *dst, T value, int pe) { + printf("rocshmem::gda:amo_swap not implemented\n"); + abort(); + return 0; +} + +template +__device__ T GDAContext::amo_fetch_and(void *dst, T value, int pe) { + printf("rocshmem::gda:amo_fetch_and not implemented\n"); + abort(); + return 0; +} + +template +__device__ void GDAContext::amo_and(void *dst, T value, int pe) { + printf("rocshmem::gda:amo_and not implemented\n"); + abort(); +} + +template +__device__ T GDAContext::amo_fetch_or(void *dst, T value, int pe) { + printf("rocshmem::gda:amo_fetch_or not implemented\n"); + abort(); + return 0; +} + +template +__device__ void GDAContext::amo_or(void *dst, T value, int pe) { + printf("rocshmem::gda:amo_or not implemented\n"); + abort(); +} + +template +__device__ T GDAContext::amo_fetch_xor(void *dst, T value, int pe) { + printf("rocshmem::gda:amo_fetch_xor not implemented\n"); + abort(); + return 0; +} + +template 
+__device__ void GDAContext::amo_xor(void *dst, T value, int pe) { + printf("rocshmem::gda:amo_xor not implemented\n"); + abort(); +} + +template +__device__ void GDAContext::amo_cas(void *dst, T value, T cond, int pe) { + if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_cas not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t + uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; + for (int i = 0; i < WF_SIZE; i++) { //TODO: this looks wrong + qps[pe].atomic_nofetch(base_heap[pe] + L_offset, value, cond, pe, GDA_OP_ATOMIC_CS); + } +} + +template +__device__ T GDAContext::amo_fetch_add(void *dst, T value, int pe) { + if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_fadd not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t + uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; + T ret_val = 0; + bool need_turn {true}; + uint64_t turns = __ballot(need_turn); + while (turns) { + uint8_t lane = __ffsll((unsigned long long)turns) - 1; + int pe_turn = __shfl(pe, lane); + if (pe_turn == pe) { + ret_val = qps[pe].atomic_fetch(base_heap[pe] + L_offset, value, 0, pe, GDA_OP_ATOMIC_FA); + need_turn = false; + } + turns = __ballot(need_turn); + } + return ret_val; +} + +template +__device__ T GDAContext::amo_fetch_cas(void *dst, T value, T cond, int pe) { + if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_fcas not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t + uint64_t L_offset = reinterpret_cast(dst) - base_heap[my_pe]; + T ret_val; + for (int i = 0; i < WF_SIZE; i++) { + ret_val = qps[pe].atomic_fetch(base_heap[pe] + L_offset, value, cond, pe, GDA_OP_ATOMIC_CS); + } + return ret_val; +} + +// Collectives TODO: loosely adapted from IPC, needs review +template +__device__ void compute_reduce(T *src, T *dst, int size, int wg_id, int wg_size) { + for (int i = wg_id; i < size; i += wg_size) { + OpWrap::Calc(src, dst, i); + } + __syncthreads(); 
+} + +template +__device__ void GDAContext::internal_direct_allreduce( + T *dst, const T *src, int nelems, GDATeam *team_obj) { // NOLINT(runtime/int) + + int stride = team_obj->tinfo_wrt_world->stride; + int PE_start = team_obj->tinfo_wrt_world->pe_start; + int PE_size = team_obj->tinfo_wrt_world->size; + long *pSync = team_obj->barrier_pSync; + T *pWrk = reinterpret_cast(team_obj->pWrk); + + int finish = PE_start + stride * PE_size; + int pe = my_pe; + + int wg_id = get_flat_block_id(); + int wg_size = get_flat_block_size(); + int64_t flag_val = 1; + + for (int i = wg_id; i < nelems; i += wg_size) { + dst[i] = src[i]; + } + __syncthreads(); + + for (int i = PE_start; i < finish; i += stride) { + if (i != pe) { + putmem_wg(&pWrk[pe * nelems], reinterpret_cast(src), + nelems * sizeof(T), i); + + if (is_thread_zero_in_block()) { + fence(); + putmem(&pSync[pe], &flag_val, sizeof(*pSync), i); + } + } + } + threadfence_system(); + __syncthreads(); + + // Do the compute and pSync reset in parallel. + for (int i = PE_start; i < finish; i += stride) { + if (i != pe) { + // Wait for leader thread to see that the buffer is ready. + if (is_thread_zero_in_block()) { + wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val); + } + __syncthreads(); + + T *ptr = &pWrk[i * nelems]; + compute_reduce(ptr, dst, nelems, wg_id, wg_size); + threadfence_system(); + } + } + + __syncthreads(); + + for (int i = wg_id; i < num_pes; i += wg_size) { + pSync[i] = ROCSHMEM_SYNC_VALUE; + } + threadfence_system(); + __syncthreads(); +} + +/* + * Visual representation of the ring_allreduce algorithm below + * assuming 4 PEs and a single segment. 
+ * + * Initial state + * PE# 0 1 2 3 + * [00] [10] [20] [30] + * [01] [11] [21] [31] + * [02] [12] [22] [32] + * [03] [13] [23] [33] + * + * Loop 1: + * iter 0 + * PE# 0 1 2 3 + * [00+30] [10] [20] [30] + * [01] [01+11] [21] [31] + * [02] [12] [12+22] [32] + * [03] [13] [23] [23+33] + * + * iter 1 + * PE# 0 1 2 3 + * [00+30] [00+10+30] [20] [30] + * [01] [01+11] [01+11+21] [31] + * [02] [12] [12+22] [12+22+32] + * [03+23+33] [13] [23] [23+33] + * + * iter 2 + * PE# 0 1 2 3 + * [00+30] [00+10+30] [00+10+20+30] [30] + * [01] [01+11] [01+11+21] [01+11+21+31] + * [02+12+22+32] [12] [12+22] [12+22+32] + * [03+23+33] [03+13+23+33] [23] [23+33] + * + * Loop 2: + * + * iter 3 + * PE# 0 1 2 3 + * [00+30] [00+10+30] [00+10+20+30] [00+10+20+30] + * [01+11+21+31] [01+11] [01+11+21] [01+11+21+31] + * [02+12+22+32] [02+12+22+32] [12+22] [12+22+32] + * [03+23+33] [03+13+23+33] [03+13+23+33] [23+33] + * + * iter 4 + * PE# 0 1 2 3 + * [00+10+20+30] [00+10+30] [00+10+20+30] [00+10+20+30] + * [01+11+21+31] [01+11+21+31] [01+11+21] [01+11+21+31] + * [02+12+22+32] [02+12+22+32] [02+12+22+32] [12+22+32] + * [03+23+33] [03+13+23+33] [03+13+23+33] [03+13+23+33] + * + * iter 5 + * PE# 0 1 2 3 + * [00+10+20+30] [00+10+20+30] [00+10+20+30] [00+10+20+30] + * [01+11+21+31] [01+11+21+31] [01+11+21+31] [01+11+21+31] + * [02+12+22+32] [02+12+22+32] [02+12+22+32] [02+12+22+32] + * [03+13+23+33] [03+13+23+33] [03+13+23+33] [03+13+23+33] + */ +template +__device__ void GDAContext::internal_ring_allreduce( + T *dst, const T *src, int nelems, GDATeam *team_obj, // NOLINT(runtime/int) + int n_seg, int seg_size, int chunk_size) { + + int stride = team_obj->tinfo_wrt_world->stride; + int PE_start = team_obj->tinfo_wrt_world->pe_start; + int PE_size = team_obj->tinfo_wrt_world->size; + long *pSync = team_obj->barrier_pSync; + T *pWrk = reinterpret_cast(team_obj->pWrk); + int my_pe_in_team = team_obj->my_pe; + + int off_seg, off_send, off_recv; + int send_pe = (my_pe_in_team + 1) % PE_size; + // send_pe 
is relative to team, convert it relative to team world + send_pe = team_obj->get_pe_in_world(send_pe); + long wait_val; // NOLINT(runtime/int) + + int wg_size = get_flat_block_size(); + int wg_id = get_flat_block_id(); + + for (int i = wg_id; i < nelems; i += wg_size) { + dst[i] = src[i]; + } + __syncthreads(); + + for (int seg = 0; seg < n_seg; seg++) { + off_seg = seg * seg_size; + // Loop 2 in the algorithm above + for (int iter = 0; iter < PE_size - 1; iter++) { + off_send = (((my_pe_in_team + 1 - iter + 2 * PE_size) % PE_size) * chunk_size); + off_recv = (((my_pe_in_team - iter + 2 * PE_size) % PE_size) * chunk_size); + + putmem_wg(reinterpret_cast(&pWrk[off_send]), + reinterpret_cast(&dst[off_send + off_seg]), + chunk_size * sizeof(T), send_pe); + + if (is_thread_zero_in_block()) { + fence(); + + wait_val = seg + 100; + putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe); +#if defined(__gfx90a__) + __threadfence_system(); +#endif /* __gfx90a__ */ + wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val); + } + __syncthreads(); + compute_reduce(&pWrk[off_recv], &dst[off_seg + off_recv], + chunk_size, wg_id, wg_size); + } + + // Loop 2 in the example above + for (int iter = PE_size - 1; iter < 2 * PE_size - 2; iter++) { + off_send = (((my_pe_in_team + 1 - iter + 2 * PE_size) % PE_size) * chunk_size); + putmem_nbi_wg(reinterpret_cast(&dst[off_send + off_seg]), + reinterpret_cast(&dst[off_send + off_seg]), + chunk_size * sizeof(T), send_pe); + + if (is_thread_zero_in_block()) { + fence(); + wait_val = seg + 100; + putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe); +#if defined(__gfx90a__) + __threadfence_system(); +#endif /* __gfx90a__ */ + wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val); + } + __syncthreads(); + } + } + __syncthreads(); + + for (int i = wg_id; i < 2 * num_pes - 2; i += wg_size) { + pSync[i] = ROCSHMEM_SYNC_VALUE; + } + __syncthreads(); +} + +template +__device__ int GDAContext::reduce(rocshmem_team_t team, T *dest, + const T *source, 
int nreduce) { + GDATeam *team_obj = reinterpret_cast(team); + + int PE_size = team_obj->tinfo_wrt_world->size; + + size_t direct_pWrk = PE_size * nreduce; + size_t direct_pSync = PE_size; + size_t ring_pSync = 2 * PE_size; + size_t provided_pWrk = max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE); + size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE; + + if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) { + internal_direct_allreduce(dest, source, nreduce, team_obj); + } else { + if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) { + size_t ring_pWrk = ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE; + // integer division truncating value + int chunk_size = ring_pWrk / PE_size; + int seg_size = chunk_size * PE_size; + + // integer division truncating value + int n_seg = nreduce / seg_size; + // integer division rounding up + int n_seg_up = (nreduce - 1) / seg_size + 1; + // recalculate chunk_size + chunk_size = seg_size / PE_size; + if (n_seg == 0) { + n_seg = 1; + } + internal_ring_allreduce(dest, source, nreduce, team_obj, n_seg, + seg_size, chunk_size); + if (n_seg_up > n_seg) { + T *p_dst = (dest + (n_seg * seg_size)); + const T *p_src = (source + (n_seg * seg_size)); + int p_count = nreduce - (n_seg * seg_size); + int p_chunk = p_count / PE_size; + + internal_ring_allreduce(p_dst, p_src, p_count, team_obj, 1, + (p_chunk * PE_size), p_chunk); + + if ((p_chunk * PE_size) < p_count) { + // Final elements need to use direct_allreduce + p_count -= (p_chunk * PE_size); + p_dst += (p_chunk * PE_size); + const T *p_src2 = p_src + (p_chunk * PE_size); + + internal_direct_allreduce(p_dst, p_src2, p_count, team_obj); + } + } + } else { + GPU_DPRINTF("Unsupported reduction size for GDA conduit.\n"); + return ROCSHMEM_ERROR; + } + } + return ROCSHMEM_SUCCESS; +} + +template +__device__ void GDAContext::internal_put_broadcast( + T *dst, const T *src, int nelems, int pe_root, int pe_start, + int stride, int pe_size) { // NOLINT(runtime/int) + if (my_pe == pe_root) { + int 
finish = pe_start + stride * pe_size; + for (int i = pe_start; i < finish; i += stride) { + if (i != my_pe) { + put_nbi_wg(dst, src, nelems, i); + } + } + } +} + +template +__device__ void GDAContext::internal_get_broadcast( + T *dst, const T *src, int nelems, int pe_root) { // NOLINT(runtime/int) + if (my_pe != pe_root) { + get_wg(dst, src, nelems, pe_root); + } +} + +template +__device__ void GDAContext::broadcast(rocshmem_team_t team, T *dst, + const T *src, int nelems, int pe_root) { + GDATeam *team_obj = reinterpret_cast(team); + + int stride = team_obj->tinfo_wrt_world->stride; + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_size = team_obj->tinfo_wrt_world->size; + long *p_sync = team_obj->bcast_pSync; + + // Passed pe_root is relative to team, convert to world root + int pe_root_world = team_obj->get_pe_in_world(pe_root); + internal_broadcast(dst, src, nelems, pe_root_world, pe_start, stride, + pe_size, p_sync); +} + +template +__device__ void GDAContext::internal_broadcast(T *dst, const T *src, int nelems, + int pe_root, int pe_start, + int stride, int pe_size, + long *p_sync) { // NOLINT(runtime/int) + if (num_pes < 4) { //TODO: optimized for IPC + internal_put_broadcast(dst, src, nelems, pe_root, pe_start, stride, + pe_size); + } else { + internal_get_broadcast(dst, src, nelems, pe_root); + } + + // Synchronize on completion of broadcast + internal_sync_wg(my_pe, pe_start, stride, pe_size, p_sync); +} + +template +__device__ void GDAContext::alltoall(rocshmem_team_t team, T *dst, + const T *src, int nelems) { + alltoall_linear(team, dst, src, nelems); +} + +template +__device__ void GDAContext::alltoall_linear(rocshmem_team_t team, T *dst, + const T *src, int nelems) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_size = team_obj->num_pes; + int stride = team_obj->tinfo_wrt_world->stride; + long *pSync = team_obj->alltoall_pSync; + int my_pe_in_team = team_obj->my_pe; + + // 
Have each PE put their designated data to the other PEs + for (int j = 0; j < pe_size; j++) { + int dest_pe = team_obj->get_pe_in_world(j); + put_nbi_wg(&dst[my_pe_in_team * nelems], &src[j * nelems], nelems, dest_pe); + } + if (is_thread_zero_in_block()) { + quiet(); + } + // wait until everyone has obtained their designated data + internal_sync_wg(my_pe, pe_start, stride, pe_size, pSync); +} + +template +__device__ void GDAContext::fcollect(rocshmem_team_t team, T *dst, + const T *src, int nelems) { + fcollect_linear(team, dst, src, nelems); +} + +template +__device__ void GDAContext::fcollect_linear(rocshmem_team_t team, T *dst, + const T *src, int nelems) { + GDATeam *team_obj = reinterpret_cast(team); + + int pe_start = team_obj->tinfo_wrt_world->pe_start; + int pe_size = team_obj->num_pes; + int stride = team_obj->tinfo_wrt_world->stride; + long *pSync = team_obj->alltoall_pSync; + int my_pe_in_team = team_obj->my_pe; + + // Have each PE put their designated data to the other PEs + for (int j = 0; j < pe_size; j++) { + int dest_pe = team_obj->get_pe_in_world(j); + put_nbi_wg(&dst[my_pe_in_team * nelems], src, nelems, dest_pe); + } + + if (is_thread_zero_in_block()) { + quiet(); + } + // wait until everyone has obtained their designated data + internal_sync_wg(my_pe, pe_start, stride, pe_size, pSync); +} + +// Block/wave functions +template +__device__ void GDAContext::put_wg(T *dest, const T *source, size_t nelems, int pe) { + putmem_wg(dest, source, nelems * sizeof(T), pe); +} + +template +__device__ void GDAContext::put_nbi_wg(T *dest, const T *source, size_t nelems, int pe) { + putmem_nbi_wg(dest, source, nelems * sizeof(T), pe); +} + + template +__device__ void GDAContext::put_wave(T *dest, const T *source, size_t nelems, int pe) { + putmem_wave(dest, source, nelems * sizeof(T), pe); +} + +template +__device__ void GDAContext::put_nbi_wave(T *dest, const T *source, size_t nelems, int pe) { + putmem_nbi_wave(dest, source, nelems * sizeof(T), pe); +} + 
+template +__device__ void GDAContext::get_wg(T *dest, const T *source, size_t nelems, int pe) { + getmem_wg(dest, source, nelems * sizeof(T), pe); +} + +template +__device__ void GDAContext::get_nbi_wg(T *dest, const T *source, size_t nelems, int pe) { + getmem_nbi_wg(dest, source, nelems * sizeof(T), pe); +} + +template +__device__ void GDAContext::get_wave(T *dest, const T *source, size_t nelems, int pe) { + getmem_wave(dest, source, nelems * sizeof(T), pe); +} + +template +__device__ void GDAContext::get_nbi_wave(T *dest, const T *source, size_t nelems, int pe) { + getmem_nbi_wave(dest, source, nelems * sizeof(T), pe); +} + +#define GDA_CONTEXT_PUT_SIGNAL_DEF(SUFFIX) \ + template \ + __device__ void GDAContext::put_signal##SUFFIX(T *dest, const T *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe) { \ + putmem_signal##SUFFIX(dest, source, nelems * sizeof(T), sig_addr, signal, sig_op, pe); \ + } \ + \ + template \ + __device__ void GDAContext::put_signal_nbi##SUFFIX(T *dest, const T *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe) { \ + putmem_signal##SUFFIX(dest, source, nelems * sizeof(T), sig_addr, signal, sig_op, pe); \ + } + +GDA_CONTEXT_PUT_SIGNAL_DEF() +GDA_CONTEXT_PUT_SIGNAL_DEF(_wg) +GDA_CONTEXT_PUT_SIGNAL_DEF(_wave) + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_CONTEXT_TMPL_DEVICE_HPP_ diff --git a/src/gda/context_gda_tmpl_host.hpp b/src/gda/context_gda_tmpl_host.hpp new file mode 100644 index 0000000000..b4006331b4 --- /dev/null +++ b/src/gda/context_gda_tmpl_host.hpp @@ -0,0 +1,169 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_CONTEXT_TMPL_HOST_HPP_ +#define LIBRARY_SRC_GDA_CONTEXT_TMPL_HOST_HPP_ + +#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) +#include "host/host_templates.hpp" + +namespace rocshmem { + +template +__host__ void GDAHostContext::p(T *dest, T value, int pe) { + host_interface->p(dest, value, pe, context_window_info); +} + +template +__host__ T GDAHostContext::g(const T *source, int pe) { + return host_interface->g(source, pe, context_window_info); +} + +template +__host__ void GDAHostContext::put(T *dest, const T *source, size_t nelems, int pe) { + host_interface->put(dest, source, nelems, pe, context_window_info); +} + +template +__host__ void GDAHostContext::get(T *dest, const T *source, size_t nelems, int pe) { + host_interface->get(dest, source, nelems, pe, context_window_info); +} + +template +__host__ void GDAHostContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) { + host_interface->put_nbi(dest, source, nelems, pe, context_window_info); +} + +template +__host__ void GDAHostContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) { + host_interface->get_nbi(dest, source, nelems, pe, context_window_info); +} + +template +__host__ void GDAHostContext::amo_add(void *dst, T value, int pe) { + host_interface->amo_add(dst, value, pe, context_window_info); +} + +template +__host__ void GDAHostContext::amo_cas(void *dst, T value, T cond, int pe) { + host_interface->amo_cas(dst, value, cond, pe, context_window_info); +} + +template +__host__ T GDAHostContext::amo_fetch_add(void *dst, T value, int pe) { + return host_interface->amo_fetch_add(dst, value, pe, context_window_info); +} + +template +__host__ T GDAHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) { + return host_interface->amo_fetch_cas(dst, value, cond, pe, context_window_info); +} + +template +__host__ void GDAHostContext::broadcast( + T *dest, 
const T *source, int nelems, int pe_root, int pe_start, + int log_pe_stride, int pe_size, + long *p_sync) { // NOLINT(runtime/int) + host_interface->broadcast(dest, source, nelems, pe_root, pe_start, + log_pe_stride, pe_size, p_sync); +} + +template +__host__ void GDAHostContext::broadcast(rocshmem_team_t team, T *dest, + const T *source, int nelems, + int pe_root) { + host_interface->broadcast(team, dest, source, nelems, pe_root); +} + +template +__host__ void GDAHostContext::to_all(T *dest, const T *source, int nreduce, + int pe_start, int log_pe_stride, + int pe_size, T *p_wrk, + long *p_sync) { // NOLINT(runtime/int) + host_interface->to_all(dest, source, nreduce, pe_start, log_pe_stride, + pe_size, p_wrk, p_sync); +} + +template +__host__ int GDAHostContext::reduce(rocshmem_team_t team, T *dest, + const T *source, int nreduce) { + return host_interface->reduce(team, dest, source, nreduce); +} + +template +__host__ void GDAHostContext::wait_until(T *ivars, int cmp, T val) { + host_interface->wait_until(ivars, cmp, val, context_window_info); +} + +template +__host__ void GDAHostContext::wait_until_all(T *ivars, size_t nelems, + const int* status, + int cmp, T val) { + host_interface->wait_until_all(ivars, nelems, status, cmp, val, context_window_info); +} + +template +__host__ size_t GDAHostContext::wait_until_any(T *ivars, size_t nelems, + const int* status, + int cmp, T val) { + return host_interface->wait_until_any(ivars, nelems, status, cmp, val, context_window_info); +} + +template +__host__ size_t GDAHostContext::wait_until_some(T *ivars, size_t nelems, size_t* indices, + const int* status, + int cmp, T val) { + return host_interface->wait_until_some(ivars, nelems, indices, status, cmp, val, context_window_info); +} + +template +__host__ void GDAHostContext::wait_until_all_vector(T *ivars, size_t nelems, + const int* status, + int cmp, T* vals) { + host_interface->wait_until_all_vector(ivars, nelems, status, cmp, vals, context_window_info); +} + +template 
+__host__ size_t GDAHostContext::wait_until_any_vector(T *ivars, size_t nelems, + const int* status, + int cmp, T* vals) { + return host_interface->wait_until_any_vector(ivars, nelems, status, cmp, vals, context_window_info); +} + +template +__host__ size_t GDAHostContext::wait_until_some_vector(T *ivars, size_t nelems, + size_t* indices, + const int* status, + int cmp, T* vals) { + return host_interface->wait_until_some_vector(ivars, nelems, indices, status, cmp, vals, context_window_info); +} + +template +__host__ int GDAHostContext::test(T *ivars, int cmp, T val) { + return host_interface->test(ivars, cmp, val, context_window_info); +} + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_CONTEXT_TMPL_HOST_HPP_ diff --git a/src/gda/endian.cpp b/src/gda/endian.cpp new file mode 100644 index 0000000000..c8baeb18d5 --- /dev/null +++ b/src/gda/endian.cpp @@ -0,0 +1,81 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "endian.hpp" + +namespace rocshmem { + +template +__device__ void swap_endian_store(T *dst, const T val) { + typedef union U { + T val; + uint8_t bytes[sizeof(T)]; + } union_type; + union_type src; + union_type dst_tmp; + + src.val = val; + std::reverse_copy(src.bytes, src.bytes + sizeof(T), dst_tmp.bytes); + *dst = dst_tmp.val; +} + +template <> +__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) { + uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) | + ((val >> 8) & 0x00FF00FF00FF00FFULL); + + new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) | + ((new_val >> 16) & 0x0000FFFF0000FFFFULL); + + *dst = (new_val << 32) | (new_val >> 32); +} + +template <> +__device__ void swap_endian_store(int64_t *dst, const int64_t val) { + swap_endian_store(reinterpret_cast(dst), (const uint64_t)val); +} + +template <> +__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) { + uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); + + *dst = (new_val << 16) | (new_val >> 16); +} + +template <> +__device__ void swap_endian_store(int32_t *dst, const int32_t val) { + swap_endian_store(reinterpret_cast(dst), (const uint32_t)val); +} + +template <> +__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) { + *dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF); +} + +template <> +__device__ void swap_endian_store(int16_t *dst, const int16_t val) { + swap_endian_store(reinterpret_cast(dst), (const uint16_t)val); +} + +} // namespace rocshmem diff --git a/src/gda/endian.hpp b/src/gda/endian.hpp new file mode 100644 index 0000000000..61663cdbcb --- /dev/null 
+++ b/src/gda/endian.hpp @@ -0,0 +1,62 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_ENDIAN_HPP_ +#define LIBRARY_SRC_GDA_ENDIAN_HPP_ + +#include + +namespace rocshmem { + +template +__device__ void swap_endian_store(T *dst, const T val); + +template <> +__device__ void swap_endian_store(uint64_t *dst, const uint64_t val); + +template <> +__device__ void swap_endian_store(int64_t *dst, const int64_t val); + +template <> +__device__ void swap_endian_store(uint32_t *dst, const uint32_t val); + +template <> +__device__ void swap_endian_store(int32_t *dst, const int32_t val); + +template <> +__device__ void swap_endian_store(uint16_t *dst, const uint16_t val); + +template <> +__device__ void swap_endian_store(int16_t *dst, const int16_t val); + +template +__device__ T swap_endian_val(const T val) { + T dst; + swap_endian_store(&dst, val); + return dst; +} + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_ENDIAN_HPP_ diff --git a/src/gda/gda_context_proxy.hpp b/src/gda/gda_context_proxy.hpp new file mode 100644 index 0000000000..14cac518f5 --- /dev/null +++ b/src/gda/gda_context_proxy.hpp @@ -0,0 +1,102 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_CONTEXT_PROXY_HPP_ +#define LIBRARY_SRC_GDA_CONTEXT_PROXY_HPP_ + + +#include "device_proxy.hpp" +#include "backend_gda.hpp" + +namespace rocshmem { + +class GDABackend; + +template +class GDADefaultContextProxy { + using ProxyT = DeviceProxy; + + public: + GDADefaultContextProxy() = default; + + /* + * Placement new the memory which is allocated by proxy_ + */ + explicit GDADefaultContextProxy(GDABackend* backend, TeamInfo *tinfo, + size_t num_elems = 1) + : constructed_{true}, proxy_{num_elems} { + auto ctx{proxy_.get()}; + new (ctx) GDAContext(reinterpret_cast(backend), 0); + ctx->tinfo = tinfo; + rocshmem_ctx_t local{ctx, tinfo}; + set_internal_ctx(&local); + } + + /* + * Since placement new is used in the constructor, the + * destructor must be called manually. 
+ */ + ~GDADefaultContextProxy() { + if (constructed_) { + proxy_.get()->~GDAContext(); + } + } + + GDADefaultContextProxy(const GDADefaultContextProxy& other) = delete; + + GDADefaultContextProxy& operator=(const GDADefaultContextProxy& other) = delete; + + GDADefaultContextProxy(GDADefaultContextProxy&& other) = default; + + GDADefaultContextProxy& operator=(GDADefaultContextProxy&& other) { + if (this != &other) { + proxy_ = std::move(other.proxy_); + constructed_ = true; + other.constructed_ = false; + } + return *this; + } + + /* + * @brief Provide access to the memory referenced by the proxy + */ + __host__ __device__ Context* get() { return proxy_.get(); } + + private: + /* + * @brief Memory managed by the lifetime of this object + */ + ProxyT proxy_{}; + + /* + * @brief denotes if an object was constructed in proxy + */ + bool constructed_{false}; +}; + +using GDADefaultContextProxyT = GDADefaultContextProxy; + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_CONTEXT_PROXY_HPP_ diff --git a/src/gda/gda_team.cpp b/src/gda/gda_team.cpp new file mode 100644 index 0000000000..d419ada7a7 --- /dev/null +++ b/src/gda/gda_team.cpp @@ -0,0 +1,54 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "gda_team.hpp" + +#include "constants.hpp" +#include "backend_type.hpp" +#include "backend_gda.hpp" + +namespace rocshmem { + +GDATeam::GDATeam(Backend *backend, TeamInfo *team_info_parent, + TeamInfo *team_info_world, int num_pes, int my_pe, + MPI_Comm mpi_comm, int pool_index) + : Team(backend, team_info_parent, team_info_world, num_pes, my_pe, + mpi_comm) { + type = BackendType::GDA_BACKEND; + const GDABackend *b = static_cast(backend); + + pool_index_ = pool_index; + + barrier_pSync = &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]); + reduce_pSync = &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]); + bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]); + alltoall_pSync = &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]); + + pWrk = reinterpret_cast(b->pWrk_pool) + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; + pAta = reinterpret_cast(b->pAta_pool) + ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; +} + +GDATeam::~GDATeam() {} + +} // namespace rocshmem diff --git a/src/gda/gda_team.hpp b/src/gda/gda_team.hpp new file mode 100644 index 0000000000..4d4a4e54b0 --- /dev/null +++ b/src/gda/gda_team.hpp @@ -0,0 +1,52 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_TEAM_HPP_ +#define LIBRARY_SRC_GDA_TEAM_HPP_ + +#include "team.hpp" + +namespace rocshmem { + +class GDATeam : public Team { + public: + GDATeam(Backend* handle, TeamInfo* team_info_wrt_parent, + TeamInfo* team_info_wrt_world, int num_pes, int my_pe, + MPI_Comm team_comm, int pool_index); + + virtual ~GDATeam(); + + long* barrier_pSync{nullptr}; + long* reduce_pSync{nullptr}; + long* bcast_pSync{nullptr}; + long* alltoall_pSync{nullptr}; + void* pWrk{nullptr}; + void* pAta{nullptr}; + + int pool_index_{-1}; +}; + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_TEAM_HPP_ diff --git a/src/gda/queue_pair.cpp b/src/gda/queue_pair.cpp new file mode 100644 index 0000000000..e44292d364 --- /dev/null +++ b/src/gda/queue_pair.cpp @@ -0,0 +1,623 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "queue_pair.hpp" + +#include + +#include "backend_gda.hpp" +#include "endian.hpp" +#if !defined(GDA_IONIC) && !defined(GDA_BNXT) +#include "segment_builder.hpp" +#endif +#include "util.hpp" +#include "constants.hpp" + +namespace rocshmem { + +QueuePair::QueuePair(struct ibv_pd* pd) { + allocator.allocate((void**)&nonfetching_atomic, 8); + CHECK_HIP(hipMemset(nonfetching_atomic, 0, 8)); + int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; + + ibv_mr *mr = ibv_reg_mr(pd, nonfetching_atomic, 8, access); + CHECK_NNULL(mr, "ibv_reg_mr"); + +#if defined(GDA_IONIC) || defined(GDA_BNXT) + nonfetching_atomic_lkey = mr->lkey; +#else + nonfetching_atomic_lkey = htobe32(mr->lkey); +#endif + + allocator.allocate((void**)&fetching_atomic, 8 * FETCHING_ATOMIC_CNT); + CHECK_HIP(hipMemset(fetching_atomic, 0, 8 * FETCHING_ATOMIC_CNT)); + access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; + mr = ibv_reg_mr(pd, fetching_atomic, 8 * FETCHING_ATOMIC_CNT, access); + CHECK_NNULL(mr, "ibv_reg_mr"); +#if defined(GDA_IONIC) || defined(GDA_BNXT) + fetching_atomic_lkey = mr->lkey; +#else + fetching_atomic_lkey = htobe32(mr->lkey); +#endif + + allocator.allocate((void**)&fetching_atomic_freelist, sizeof(FreeListT*)); + new (fetching_atomic_freelist) FreeListT(); + for(int i{0}; i < FETCHING_ATOMIC_CNT; i+=WF_SIZE) { + fetching_atomic_freelist->push_back(fetching_atomic + i); + } +} + + +/****************************************************************************** + ************************ PROVIDER-SPECIFIC 
HELPERS *************************** + *****************************************************************************/ +#ifdef GDA_IONIC +__device__ uint64_t QueuePair::get_same_qp_lane_mask() { + uint64_t lane_mask = get_active_lane_mask(); + uintptr_t this_val = reinterpret_cast(this); + + // exclude threads operating on a different qp from this thread lane mask + #pragma unroll + for (int i = 0; i < 64; ++i) { + uint64_t bit_i = 1ull << i; + if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) { + lane_mask &= ~bit_i; + } + } + + return lane_mask; +} + +__device__ bool QueuePair::cq_lock_try_acquire(uint64_t activemask) { + uint32_t cq_lock_val = SPIN_LOCK_INVALID; + + if (is_first_active_lane(activemask)) { + cq_lock_val = SPIN_LOCK_UNLOCKED; + __hip_atomic_compare_exchange_strong(&cq_lock, &cq_lock_val, SPIN_LOCK_LOCKED, + __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT); + } + cq_lock_val = __shfl(cq_lock_val, get_first_active_lane_id(activemask)); + + return (cq_lock_val == SPIN_LOCK_UNLOCKED); +} + +__device__ void QueuePair::cq_lock_release(uint64_t activemask) { + if (is_first_active_lane(activemask)) { + __hip_atomic_store(&cq_lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); + } +} + +__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) { + uint32_t my_sq_prod = 0; + + // reserve space for wqes in sq + if (is_first_active_lane(activemask)) { + my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask)); + + // wait for that space to be available + quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask); + + return my_sq_prod; +} + +__device__ uint32_t QueuePair::commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe) { + uint32_t dbprod = my_sq_prod + num_wqes; + + if (last) { + // signal last wqe before the doorbell + wqe->base.flags |= 
swap_endian_val(IONIC_V1_FLAG_SIG); + + while (__hip_atomic_load(&sq_dbprod, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT) != my_sq_prod) { + // spin + } + + ring_doorbell(dbprod); + + __hip_atomic_exchange(&sq_dbprod, dbprod, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); + } + + return dbprod; +} + +__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { + uint32_t my_logical_lane_id = get_active_lane_num(activemask); + uint32_t my_cq_pos = cq_pos + my_logical_lane_id; + + /* Look at the cqe at the current position in the cq buffer */ + struct ionic_v1_cqe *cqe = &cq_buf[my_cq_pos & cq_mask]; + + /* Determine expected color based on cq wrap count */ + uint32_t qtf_color_bit = swap_endian_val(IONIC_V1_CQE_COLOR); + uint32_t qtf_color_exp = qtf_color_bit; + if (my_cq_pos & (cq_mask + 1)) { + qtf_color_exp = 0; + } + + /* Wait for at least one thread cqe color == expected color */ + uint32_t qtf_be; + bool ready; + uint64_t ballot_ready; + do { + qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags); + ready = (qtf_be & qtf_color_bit) == qtf_color_exp; + ballot_ready = __ballot(ready); + } while (!ballot_ready); + + /* Other threads saw a ready cqe, but not this thread */ + if (!ready) { + return; + } + + uint32_t msn = swap_endian_val(cqe->send.msg_msn); + + /* Report if the completion indicates an error. */ + if (!!(qtf_be & swap_endian_val(IONIC_V1_CQE_ERROR))) { +#ifdef DEBUG + uint32_t qtf = swap_endian_val(qtf_be); + uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT; + uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; + uint32_t flag = qtf & 0xf; + uint32_t status = swap_endian_val(cqe->status_length); + uint64_t npg = swap_endian_val(cqe->send.npg_wqe_id); + + printf("QUIET ERROR: qid %u type %u flag %#x status %u msn %u npg %lu\n", + qid, type, flag, status, msn, npg); +#endif + /* No other way to signal an error, so just crash. 
*/ + abort(); + } + + /* Only proceed with the furthest ahead cqe to update the sq state */ + uint64_t my_lane_mask = 1ull << __lane_id(); + uint64_t lesser_lane_mask = my_lane_mask - 1; + if (my_lane_mask != (ballot_ready & ~lesser_lane_mask)) { + return; + } + + /* update position in the cq */ + cq_pos = my_cq_pos + 1; + + /* + * Ring cq doorbell frequently enough to avoid cq full. + * + * NB: IONIC_CQ_GRACE is 100 + */ + if (((cq_pos - cq_dbpos) & cq_mask) >= 100) { + cq_dbpos = cq_pos; + __atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed? + } + + sq_msn = msn; +} + +__device__ void QueuePair::quiet_internal(uint64_t activemask, uint32_t cons) { + /* wait for sq_msn to catch up or pass cons. */ + /* 0x800000 - sign bit for 24-bit fields */ + while ((sq_msn - cons) & 0x800000) { + if (!cq_lock_try_acquire(activemask)) { + continue; + } + + /* with lock acquired, this wave polls cqes until caught up */ + while ((sq_msn - cons) & 0x800000) { + poll_wave_cqes(activemask); + } + + cq_lock_release(activemask); + break; + } +} +#endif // GDA_IONIC + +#ifndef GDA_BNXT +#ifdef GDA_IONIC +__device__ void QueuePair::ring_doorbell(uint32_t pos) { + // TODO When threads write at once to the same address, not all writes reach the bus. 
+ for (int i = 0; i < 64; ++i) { + if (__lane_id() == i) { + __threadfence(); + __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST); + } + } + __threadfence(); +} +#else // !GDA_IONIC +__device__ void QueuePair::ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) { + swap_endian_store(const_cast(dbrec), (uint32_t)my_sq_counter); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + __hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); + uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + db_uint ^= 0x100; + __hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} +#endif // !GDA_IONIC +#endif // !GDA_BNXT + +#ifndef GDA_BNXT +#ifdef GDA_IONIC +__device__ void QueuePair::quiet() { + quiet_internal(get_same_qp_lane_mask(), sq_prod); +} +#else // !GDA_IONIC +__device__ void QueuePair::quiet() { + constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE; + __shared__ uint64_t wqe_broadcast[BROADCAST_SIZE]; + uint8_t wavefront_id = get_flat_block_id() / WF_SIZE; + wqe_broadcast[wavefront_id] = 0; + + uint64_t activemask = get_active_lane_mask(); + uint8_t num_active_lanes = get_active_lane_count(activemask); + uint8_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + + while (true) { + bool done{false}; + uint64_t quiet_amount{0}; + uint64_t wave_cq_consumer{0}; + while (!done) { + uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + if (!(posted - completed)) { + return; + } + int64_t quiet_val = posted - active; + if (quiet_val <= 0) { + continue; + } + quiet_amount = min(num_active_lanes, 
quiet_val); + if (is_leader) { + done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + if (done) { + wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + } + done = __shfl(done, leader_phys_lane_id); + } + wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id); + uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id; + uint64_t my_cq_index = my_cq_consumer % cq_cnt; + + if (my_logical_lane_id < quiet_amount) { + volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index]; + uint16_t be_wqe_counter{0}; + uint8_t op_own{0}; + uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1; + bool vote_failed{true}; + + while (vote_failed) { + op_own = *((volatile uint8_t*)&cqe_entry->op_own); + bool my_ownership_vote = (op_own & 1) == owner_bit; + bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID; + uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote); + vote_failed = __popcll(votes) < quiet_amount; + if (!vote_failed) { + be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter); + } + } + + uint16_t wqe_counter; + swap_endian_store(const_cast(&wqe_counter), reinterpret_cast(be_wqe_counter)); + uint64_t wqe_id = outstanding_wqes[wqe_counter]; + __hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit; + *((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits; + __atomic_signal_fence(__ATOMIC_SEQ_CST); + } + if (is_leader) { + uint64_t completed {0}; + do { + completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (completed != wave_cq_consumer); + + swap_endian_store(const_cast(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount)); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + uint64_t sunk_wqe_id = 
wqe_broadcast[wavefront_id]; + __hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + } +} +#endif // !GDA_IONIC +#endif // !GDA_BNXT + +#ifndef GDA_BNXT +#ifdef GDA_IONIC +__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { + uint64_t activemask = get_same_qp_lane_mask(); + uint32_t num_wqes = get_active_lane_count(activemask); + uint32_t my_logical_lane_id = get_active_lane_num(activemask); + uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); + uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; + struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask]; + + // TODO why is this needed? + if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) { + size = 1; + } + + wqe->base.wqe_id = my_sq_pos; + wqe->base.op = opcode; + wqe->base.num_sge_key = size ? 1 : 0; + wqe->base.flags = swap_endian_val(0); + wqe->base.imm_data_key = swap_endian_val(0); + + wqe->common.rdma.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); + wqe->common.rdma.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); + wqe->common.rdma.remote_rkey = swap_endian_val(rkey); + wqe->common.length = swap_endian_val(size); + + if (size) { + if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) { + wqe->base.flags |= swap_endian_val(IONIC_V1_FLAG_INL); + wqe->base.num_sge_key = 0; + if (!laddr) { + // TODO why is this needed? 
+ wqe->common.pld.data[0] = 1; + } else { + memcpy(wqe->common.pld.data, laddr, size); + } + } else { + wqe->common.pld.sgl[0].va = swap_endian_val(reinterpret_cast(laddr)); + wqe->common.pld.sgl[0].len = swap_endian_val(size); + wqe->common.pld.sgl[0].lkey = swap_endian_val(lkey); + } + } + + commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe); +} +#else // !GDA_IONIC +__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { + uint64_t activemask = get_active_lane_mask(); + uint8_t num_active_lanes = get_active_lane_count(activemask); + uint8_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + uint8_t num_wqes{num_active_lanes}; + uint64_t wave_sq_counter{0}; + + if (is_leader) { + wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); + } + wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id); + uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id; + uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt; + + while (true) { + uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + int64_t num_active_sq_entries = db_touched - sunk; + if (num_active_sq_entries < 0) { + continue; + } + uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries; + uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched; + if (num_free_entries > num_entries_until_wave_last_entry) { + break; + } + quiet(); + } + + outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; + + SegmentBuilder seg_build(my_sq_index, sq_buf); + seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0); 
+ seg_build.update_raddr_seg(raddr, rkey); + seg_build.update_data_seg(laddr, size, lkey); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + if (is_leader) { + uint64_t db_touched {0}; + do { + db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (db_touched != wave_sq_counter); + + uint8_t *base_ptr = reinterpret_cast(sq_buf); + uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); + ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); + + __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } +} +#endif // !GDA_IONIC +#endif // !GDA_BNXT + +#ifndef GDA_BNXT +#ifdef GDA_IONIC +__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { + uint64_t activemask = get_same_qp_lane_mask(); + uint32_t num_wqes = get_active_lane_count(activemask); + uint32_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); + uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; + struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask]; + uint32_t cons; + + uint64_t* wave_fetch_atomic{nullptr}; + if (fetching) { + if (is_leader) { + auto res = fetching_atomic_freelist->pop_front(); + while (!res.success) { + res = fetching_atomic_freelist->pop_front(); + } + wave_fetch_atomic = res.value; + } + wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); + } + + wqe->base.wqe_id = my_sq_pos; + wqe->base.op = opcode; + wqe->base.num_sge_key = 1; + wqe->base.flags = swap_endian_val(0); + wqe->base.imm_data_key = 
swap_endian_val(0); + + wqe->atomic_v2.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); + wqe->atomic_v2.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); + wqe->atomic_v2.remote_rkey = swap_endian_val(rkey); + wqe->atomic_v2.swap_add_high = swap_endian_val(atomic_data >> 32); + wqe->atomic_v2.swap_add_low = swap_endian_val(atomic_data); + wqe->atomic_v2.compare_high = swap_endian_val(atomic_cmp >> 32); + wqe->atomic_v2.compare_low = swap_endian_val(atomic_cmp); + + if (fetching) { + wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(wave_fetch_atomic + my_logical_lane_id)); + wqe->atomic_v2.lkey = swap_endian_val(fetching_atomic_lkey); + } else { + wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(nonfetching_atomic)); + wqe->atomic_v2.lkey = swap_endian_val(nonfetching_atomic_lkey); + } + + cons = commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe); + + uint64_t ret{0}; + if (fetching) { + quiet_internal(activemask, cons); + ret = wave_fetch_atomic[my_logical_lane_id]; + __atomic_signal_fence(__ATOMIC_SEQ_CST); + if (is_leader) { + fetching_atomic_freelist->push_back(wave_fetch_atomic); + } + } + return ret; +} +#else // !GDA_IONIC || !GDA_BNXT +__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { + uint64_t activemask = get_active_lane_mask(); + uint8_t num_active_lanes = get_active_lane_count(activemask); + uint8_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + uint8_t num_wqes{num_active_lanes}; + uint64_t wave_sq_counter{0}; + + if (is_leader) { + wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id); + uint64_t my_sq_counter = wave_sq_counter + 
my_logical_lane_id; + uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt; + + while (true) { + uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + int64_t num_active_sq_entries = db_touched - sunk; + if (num_active_sq_entries < 0) { + continue; + } + uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries; + uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched; + if (num_free_entries > num_entries_until_wave_last_entry) { + break; + } + quiet(); + } + + uint64_t* wave_fetch_atomic{nullptr}; + if (fetching) { + if (is_leader) { + uint64_t db_touched {0}; + do { + db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (db_touched != wave_sq_counter); + + auto res = fetching_atomic_freelist->pop_front(); + while (!res.success) { + res = fetching_atomic_freelist->pop_front(); + } + wave_fetch_atomic = res.value; + } + wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); + } + + outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; + + SegmentBuilder seg_build(my_sq_index, sq_buf); + seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0); + seg_build.update_raddr_seg(raddr, rkey); + seg_build.update_atomic_seg(atomic_data, atomic_cmp); + if (fetching) { + seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey); + } else { + seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey); + } + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + if (is_leader) { + uint64_t db_touched {0}; + do { + db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (db_touched != wave_sq_counter); + + uint8_t *base_ptr = reinterpret_cast(sq_buf); + uint64_t* 
ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); + ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); + + __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + + uint64_t ret{0}; + if (fetching) { + quiet(); + ret = wave_fetch_atomic[my_logical_lane_id]; + __atomic_signal_fence(__ATOMIC_SEQ_CST); + if (is_leader) { + fetching_atomic_freelist->push_back(wave_fetch_atomic); + } + } + return ret; +} +#endif // !GDA_IONIC +#endif // !GDA_BNXT + +/****************************************************************************** + ****************************** SHMEM INTERFACE ******************************* + *****************************************************************************/ +__device__ void QueuePair::put_nbi(void *dest, const void *source, size_t nelems, int pe) { + uintptr_t *src = reinterpret_cast(const_cast(source)); + uintptr_t *dst = reinterpret_cast(dest); + post_wqe_rma(pe, nelems, src, dst, GDA_OP_RDMA_WRITE); +} + +__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe, uint8_t atomic_op) { + uintptr_t *dst = reinterpret_cast(dest); + return post_wqe_amo(pe, sizeof(int64_t), dst, atomic_op, atomic_data, atomic_cmp, true); +} + +__device__ void QueuePair::atomic_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe, uint8_t atomic_op) { + uintptr_t *dst = reinterpret_cast(dest); + post_wqe_amo(pe, sizeof(int64_t), dst, atomic_op, atomic_data, atomic_cmp, false); +} + +} // namespace rocshmem diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp new file mode 100644 index 0000000000..eac8107648 --- /dev/null +++ b/src/gda/queue_pair.hpp @@ -0,0 +1,310 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro 
Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_ +#define LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_ + +/** + * @file queue_pair.hpp + * + * @section DESCRIPTION + * An IB QueuePair (SQ and CQ) that the device can use to perform network + * operations. Most important rocSHMEM operations are performed by this + * class. 
+ */ + +#include "rocshmem_config.h" +#include "endian.h" +#include "constants.hpp" +#ifdef GDA_IONIC +extern "C" { +#include +#include +} +#elif defined(GDA_BNXT) +#include "bnxt/provider_gda_bnxt.hpp" +#else +#include +#endif + +#include "containers/free_list.hpp" +#include "memory/hip_allocator.hpp" + +#ifdef GDA_IONIC +#define GDA_MAX_ATOMIC 15 +#define GDA_OP_RDMA_WRITE IONIC_V2_OP_RDMA_WRITE +#define GDA_OP_ATOMIC_FA IONIC_V2_OP_ATOMIC_FA +#define GDA_OP_ATOMIC_CS IONIC_V2_OP_ATOMIC_CS +#elif !defined(GDA_BNXT) +#define GDA_MAX_ATOMIC 1 +#define GDA_OP_RDMA_WRITE MLX5_OPCODE_RDMA_WRITE +#define GDA_OP_ATOMIC_FA MLX5_OPCODE_ATOMIC_FA +#define GDA_OP_ATOMIC_CS MLX5_OPCODE_ATOMIC_CS +#endif + +namespace rocshmem { + +class GDABackend; + +typedef union db_reg { + uint64_t *ptr; + uintptr_t uint; +} db_reg_t; + +#define SPIN_LOCK_INVALID 0xdead +#define SPIN_LOCK_UNLOCKED 0x1234 +#define SPIN_LOCK_LOCKED 0xabcd + +class QueuePair { + public: + friend GDABackend; + + /** + * @brief Constructor. + */ + explicit QueuePair(struct ibv_pd* pd); + + /** + * @brief Create and enqueue a non-blocking put work queue entry (wqe). + * + * @param[in] dest Destination address for data transmission. + * @param[in] source Source address for data transmission. + * @param[in] nelems Size in bytes of data transmission. + * @param[in] pe Destination processing element of data transmission. + */ + __device__ void put_nbi(void *dest, const void *source, size_t nelems, int pe); + + /** + * @brief Empty all completions from the completion queue. + */ + __device__ void quiet(); + + /** + * @brief Create and enqueue an atomic fetch work queue entry (wqe). + * + * @param[in] dest Destination address for data transmission. + * @param[in] value Data value for the atomic operation. + * @param[in] cond Used in atomic comparisons. + * @param[in] pe Destination processing element of data transmission. + * @param[in] atomic_op The atomic operation to perform. 
+ * + * @return An atomic value + */ + __device__ int64_t atomic_fetch(void *dest, int64_t value, int64_t cond, int pe, uint8_t atomic_op); + + /** + * @brief Create and enqueue an atomic fetch work queue entry (wqe). + * + * @param[in] dest Destination address for data transmission. + * @param[in] value Data value for the atomic operation. + * @param[in] cond Used in atomic comparisons. + * @param[in] pe Destination processing element of data transmission. + * @param[in] atomic_op The atomic operation to perform. + */ + __device__ void atomic_nofetch(void *dest, int64_t value, int64_t cond, int pe, uint8_t atomic_op); + + char *const *base_heap{nullptr}; + + private: + /** + * @brief Helper method to build work requests for the send queue. + * + * @param[in] pe Destination processing element of data transmission. + * @param[in] size Size in bytes of data transmission. + * @param[in] raddr Remote address. + * @param[in] opcode Operation to be performed. + * @param[in] atomic_data An atomic data value to be used. + * @param[in] atomic_cmp An atomic comparison operation to be performed. + * @param[in] fetching True if the operation returns a value. + */ + __device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); + + /** + * @brief Helper method to build work requests for the send queue. + * + * @param[in] pe Destination processing element of data transmission. + * @param[in] size Size in bytes of data transmission. + * @param[in] laddr Local address. + * @param[in] raddr Remote address. + * @param[in] opcode Operation to be performed. + */ + __device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); + + /** + * @brief Helper method to ring the doorbell + * + * @param[in] db_val Doorbell value is written by method. 
+ */ +#if defined(GDA_IONIC) + __device__ void ring_doorbell(uint32_t pos); +#elif defined(GDA_BNXT) + __device__ void ring_sq_doorbell(uint32_t slot_idx); + __device__ void ring_cq_doorbell(uint32_t slot_idx); +#else + __device__ void ring_doorbell(uint64_t db_val, uint64_t my_sq_counter); +#endif + +#ifdef GDA_IONIC + __device__ uint64_t get_same_qp_lane_mask(); + + __device__ bool cq_lock_try_acquire(uint64_t active_lane_mask); + __device__ void cq_lock_release(uint64_t active_lane_mask); + + /** + * @brief Reserve space in the sq to post this many wqes. + * @param my_tid my logical thread id. + * @param num_wqes number of sq wqes to reserve for this wave. + * @return position of my_tid=0's wqe. + */ + __device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes); + + /** + * @brief Ring the sq doorbell maintaining order between waves. + * @param last this is the last wqe posted in this wave. + * @param my_sq_prod position of my_tid=0's wqe. + * @param num_wqes number of sq wqes posted in this wave. + * @param wqe this thread's wqe. + * @return doorbell producer index. + */ + __device__ uint32_t commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe); + + /** + * @brief Helper method to poll the next completion queue entry. + */ + __device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask); + + /** + * @brief Helper method to drain completion queue entries. + * @param cons wait for sq_msn to catch up to this position. 
+ */ + __device__ __attribute__((noinline)) void quiet_internal(uint64_t active_lane_mask, uint32_t cons); + + uint64_t *cq_dbreg{nullptr}; + uint64_t cq_dbval{0}; + uint64_t cq_mask{0}; + struct ionic_v1_cqe *cq_buf{nullptr}; + uint32_t cq_lock{SPIN_LOCK_UNLOCKED}; + uint32_t cq_pos{0}; + uint32_t cq_dbpos{0}; + + uint64_t *sq_dbreg{nullptr}; + uint64_t sq_dbval{0}; + uint64_t sq_mask{0}; + struct ionic_v1_wqe *sq_buf{nullptr}; + uint32_t sq_dbprod{0}; + uint32_t sq_prod{0}; + uint32_t sq_msn{0}; + + uint32_t inline_threshold{0}; + +#elif defined(GDA_BNXT) + uint64_t *dbr; + struct bnxt_device_cq cq; + struct bnxt_device_sq sq; + + __device__ int poll_cq(); +#else // !GDA_IONIC && !GDA_BNXT + + db_reg_t db{}; + + uint64_t cq_consumer{0}; + uint64_t quiet_posted{0}; + uint64_t quiet_active{0}; + uint64_t quiet_completed{0}; + + /* + * struct mlx5dv_cq { + * void *buf; + * __be32 *dbrec; + * uint32_t cqe_cnt; + * uint32_t cqe_size; + * void *cq_uar; + * uint32_t cqn; + * uint64_t comp_mask; + * }; + */ + mlx5_cqe64 *cq_buf{nullptr}; + volatile uint32_t *cq_dbrec{nullptr}; + uint32_t cq_cnt{0}; + uint32_t cq_log_cnt{0}; + + /* + * struct mlx5dv_qp { + * __be32 *dbrec; + * struct { + * void *buf; + * uint32_t wqe_cnt; + * uint32_t stride; + * } sq; + * struct { + * void *buf; + * uint32_t wqe_cnt; + * uint32_t stride; + * } rq; + * struct { + * void *reg; + * uint32_t size; + * } bf; + * uint64_t comp_mask; + * off_t uar_mmap_offset; + * uint32_t tirn; + * uint32_t tisn; + * uint32_t rqn; + * uint32_t sqn; + * uint64_t tir_icm_addr; + * }; + */ + volatile uint32_t *dbrec{nullptr}; + uint64_t *sq_buf{nullptr}; + uint16_t sq_wqe_cnt{0}; + uint64_t sq_posted{0}; + uint64_t sq_db_touched{0}; + uint64_t sq_sunk{0}; + + static constexpr size_t OUTSTANDING_TABLE_SIZE = 65536; + uint64_t outstanding_wqes[OUTSTANDING_TABLE_SIZE]{0}; + +#endif // GDA_IONIC + + uint32_t qp_num{0}; + uint32_t rkey{0}; + uint32_t lkey{0}; + + uint64_t* nonfetching_atomic{nullptr}; + uint32_t 
nonfetching_atomic_lkey{0}; + + uint64_t* fetching_atomic{nullptr}; + uint32_t fetching_atomic_lkey{0}; + + static const uint32_t FETCHING_ATOMIC_CNT{1024}; + static_assert(FETCHING_ATOMIC_CNT % WF_SIZE == 0); + using FreeListT = FreeList; + FreeListT* fetching_atomic_freelist{nullptr}; + + HIPAllocator allocator{}; +}; + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_ diff --git a/src/gda/segment_builder.cpp b/src/gda/segment_builder.cpp new file mode 100644 index 0000000000..ca3033d103 --- /dev/null +++ b/src/gda/segment_builder.cpp @@ -0,0 +1,118 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#include "segment_builder.hpp" + +#include "util.hpp" +#include "endian.hpp" + +namespace rocshmem { + +__device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) { + mlx5_segment *base_ptr = static_cast(base); + size_t segment_offset = wqe_idx * SEGMENTS_PER_WQE; + segp = &base_ptr[segment_offset]; +} + +/* + * Control segment - contains some control information for the current WQE. + * + * Output: + * seg - control segment to be filled + * Input: + * pi - WQEBB number of the first block of this WQE. + * This number should wrap at 0xffff, regardless of + * size of the WQ. + * opcode - Opcode of this WQE. Encodes the type of operation + * to be executed on the QP. + * opmod - Opcode modifier. + * qp_num - QP/SQ number this WQE is posted to. + * fm_ce_se - FM (fence mode), CE (completion and event mode) + * and SE (solicited event). + * ds - WQE size in octowords (16-byte units). DS accounts for all + * the segments in the WQE as summarized in WQE construction. + * signature - WQE signature. + * imm - Immediate data/Invalidation key/UMR mkey. + */ +/* + * static MLX5DV_ALWAYS_INLINE + * void mlx5dv_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *seg, uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm) + * { + * seg->opmod_idx_opcode = htobe32(((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode); + * seg->qpn_ds = htobe32((qp_num << 8) | ds); + * seg->fm_ce_se = fm_ce_se; + * seg->signature = signature; + * // The caller should prepare "imm" in advance based on WR opcode. + * // For IBV_WR_SEND_WITH_IMM and IBV_WR_RDMA_WRITE_WITH_IMM, + * // the "imm" should be assigned as is. + * // For the IBV_WR_SEND_WITH_INV, it should be htobe32(imm). 
+ * seg->imm = imm; + * } + */ +__device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm) { + segp->ctrl_seg = {0}; + swap_endian_store(&segp->ctrl_seg.opmod_idx_opcode, ((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode); + swap_endian_store(&segp->ctrl_seg.qpn_ds, qp_num << 8 | ds); + segp->ctrl_seg.fm_ce_se = fm_ce_se; + segp->ctrl_seg.signature = signature; + segp->ctrl_seg.imm = imm; + segp++; +} + +__device__ void SegmentBuilder::update_raddr_seg(uintptr_t *raddr, uint32_t rkey) { + segp->raddr_seg = {0}; + swap_endian_store(reinterpret_cast(&segp->raddr_seg.raddr), reinterpret_cast(raddr)); + segp->raddr_seg.rkey = rkey; + segp++; +} + +/* + * Data Segments - contain pointers and a byte count for the scatter/gather list. + * They can optionally contain data, which will save a memory read access for + * gather Work Requests. + */ +/* + * static MLX5DV_ALWAYS_INLINE + * void mlx5dv_set_data_seg(struct mlx5_wqe_data_seg *seg, uint32_t length, uint32_t lkey, uintptr_t address) { + * seg->byte_count = htobe32(length); + * seg->lkey = htobe32(lkey); + * seg->addr = htobe64(address); + * } + */ +__device__ void SegmentBuilder::update_data_seg(uintptr_t *address, uint32_t length, uint32_t lkey) { + segp->data_seg = {0}; + swap_endian_store(&segp->data_seg.byte_count, length); + segp->data_seg.lkey = lkey; + swap_endian_store(reinterpret_cast(&segp->data_seg.addr), reinterpret_cast(address)); + segp++; +} + +__device__ void SegmentBuilder::update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp) { + segp->atomic_seg = {0}; + swap_endian_store(reinterpret_cast(&segp->atomic_seg.swap_add), atomic_data); + swap_endian_store(reinterpret_cast(&segp->atomic_seg.compare), atomic_cmp); + segp++; +} + +} // namespace rocshmem diff --git a/src/gda/segment_builder.hpp b/src/gda/segment_builder.hpp new file mode 100644 index 0000000000..130a9604f3 --- 
/dev/null +++ b/src/gda/segment_builder.hpp @@ -0,0 +1,91 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *****************************************************************************/ + +#ifndef LIBRARY_SRC_GDA_SEGMENT_BUILDER_HPP_ +#define LIBRARY_SRC_GDA_SEGMENT_BUILDER_HPP_ + +#include + +#include "util.hpp" + +namespace rocshmem { + +class SegmentBuilder { + public: + __device__ SegmentBuilder(uint64_t wqe_idx, void *base); + + /* + * struct mlx5_wqe_ctrl_seg { + * __be32 opmod_idx_opcode; + * __be32 qpn_ds; + * uint8_t signature; + * __be16 dci_stream_channel_id; + * uint8_t fm_ce_se; + * __be32 imm; + * } __attribute__((__packed__)) __attribute__((__aligned__(4))); + */ + __device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm); + + /* + * struct mlx5_wqe_raddr_seg { + * __be64 raddr; + * __be32 rkey; + * __be32 reserved; + * }; + */ + __device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey); + + /* + * struct mlx5_wqe_data_seg { + * __be32 byte_count; + * __be32 lkey; + * __be64 addr; + * }; + */ + __device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey); + + /* + * struct mlx5_wqe_atomic_seg { + * __be64 swap_add; + * __be64 compare; + * }; + */ + __device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp); + + private: + const int SEGMENTS_PER_WQE = 4; + + union mlx5_segment { + mlx5_wqe_ctrl_seg ctrl_seg; + mlx5_wqe_raddr_seg raddr_seg; + mlx5_wqe_data_seg data_seg; + mlx5_wqe_atomic_seg atomic_seg; + }; + + mlx5_segment *segp; +}; + +} // namespace rocshmem + +#endif // LIBRARY_SRC_GDA_SEGMENT_BUILDER_HPP_ diff --git a/src/gda/topology.cpp b/src/gda/topology.cpp new file mode 100644 index 0000000000..a49312bb99 --- /dev/null +++ b/src/gda/topology.cpp @@ -0,0 +1,884 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *****************************************************************************/ + +#include "topology.hpp" + +using namespace rocshmem; + +namespace rocshmem +{ + + const char* GidPriorityStr[] = { + "RoCEv1 Link-local", + "RoCEv2 Link-local", + "RoCEv1 IPv6", + "RoCEv2 IPv6", + "RoCEv1 IPv4-mapped IPv6", + "RoCEv2 IPv4-mapped IPv6" + }; + + // Check that CPU memory array of numBytes has been allocated on targetId NUMA node + static int CheckPages(char* array, size_t numBytes, int targetId) + { + size_t const pageSize = getpagesize(); + size_t const numPages = (numBytes + pageSize - 1) / pageSize; + + std::vector pages(numPages); + std::vector status(numPages); + + pages[0] = array; + for (int i = 1; i < numPages; i++) { + pages[i] = (char*)pages[i-1] + pageSize; + } + + long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0); + if (retCode) { + fprintf(stderr,"Unable to collect page table information for allocated memory. " + "Ensure NUMA library is installed properly"); + return -1; + } + + size_t mistakeCount = 0; + for (size_t i = 0; i < numPages; i++) { + if (status[i] < 0) { + fprintf(stderr, "Unexpected page status (%d) for page %zu", status[i], i); + return -1; + } + if (status[i] != targetId) mistakeCount++; + } + if (mistakeCount > 0) { + fprintf(stderr, "%lu out of %lu pages for memory allocation were not on NUMA node %d." 
+ " This could be due to hardware memory issues, or the use of numa-rebalancing daemons such as numad", + mistakeCount, numPages, targetId); + return -1; + } + return ROCSHMEM_SUCCESS; + } + + // Allocate memory + static int AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr) + { + if (numBytes == 0) { + fprintf(stderr, "Unable to allocate 0 bytes"); + return -1; + } + *memPtr = nullptr; + + MemType const& memType = memDevice.memType; + + if (IsCpuMemType(memType)) { + // Set numa policy prior to call to hipHostMalloc + numa_set_preferred(memDevice.memIndex); + + // Allocate host-pinned memory (should respect NUMA mem policy) + CHECK_HIP(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent)); + + // Check that the allocated pages are actually on the correct NUMA node + memset(*memPtr, 0, numBytes); + ERR_CHECK(CheckPages((char*)*memPtr, numBytes, memDevice.memIndex)); + // Reset to default numa mem policy + numa_set_preferred(-1); + } else if (IsGpuMemType(memType)) { + int prev_dev; + CHECK_HIP(hipGetDevice(&prev_dev)); + + // Switch to the appropriate GPU + CHECK_HIP(hipSetDevice(memDevice.memIndex)); + + // Allocate GPU memory on appropriate device + CHECK_HIP(hipMalloc((void**)memPtr, numBytes)); + + // Clear the memory + CHECK_HIP(hipMemset(*memPtr, 0, numBytes)); + CHECK_HIP(hipDeviceSynchronize()); + + // Reset to original GPU + CHECK_HIP(hipSetDevice(prev_dev)); + } else { + printf("Unsupported memory type (%d)", memType); + return -1; + } + return ROCSHMEM_SUCCESS; + } + + // Deallocate memory + static int DeallocateMemory(MemType memType, void *memPtr, size_t const bytes) + { + // Avoid deallocating nullptr + if (memPtr == nullptr) { + fprintf(stderr, "Attempted to free null pointer for %lu bytes", bytes); + return -1; + } + + switch (memType) { + case MEM_CPU: + { + CHECK_HIP(hipHostFree(memPtr)); + break; + } + case MEM_GPU: + { + CHECK_HIP(hipFree(memPtr)); + break; + } + default: + fprintf(stderr, 
"Attempting to deallocate unrecognized memory type (%d)", memType); + return -1; + } + return ROCSHMEM_SUCCESS; + } + + + // HSA-related functions + //======================================================================================== + + static int GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent) + { + static bool isInitialized = false; + static std::vector cpuAgents; + static std::vector gpuAgents; + + int const& exeIndex = exeDevice.exeIndex; + int const numCpus = GetNumDevices(EXE_CPU); + int const numGpus = GetNumDevices(EXE_GPU); + + // Initialize results on first use + if (!isInitialized) { + hsa_amd_pointer_info_t info; + info.size = sizeof(info); + + int err; + int32_t* tempBuffer; + + // Index CPU agents + cpuAgents.clear(); + for (int i = 0; i < numCpus; i++) { + ERR_CHECK(AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer)); + CHECK_HSA(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL)); + cpuAgents.push_back(info.agentOwner); + ERR_CHECK(DeallocateMemory(MEM_CPU, tempBuffer, 1024)); + } + + // Index GPU agents + gpuAgents.clear(); + for (int i = 0; i < numGpus; i++) { + ERR_CHECK(AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer)); + CHECK_HSA(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL)); + gpuAgents.push_back(info.agentOwner); + ERR_CHECK(DeallocateMemory(MEM_GPU, tempBuffer, 1024)); + } + isInitialized = true; + } + + switch (exeDevice.exeType) { + case EXE_CPU: + if (exeIndex < 0 || exeIndex >= numCpus) { + fprintf(stderr, "CPU index must be between 0 and %d inclusively", numCpus - 1); + return -1; + } + agent = cpuAgents[exeDevice.exeIndex]; + break; + case EXE_GPU: + if (exeIndex < 0 || exeIndex >= numGpus) { + fprintf(stderr, "GPU index must be between 0 and %d inclusively", numGpus - 1); + return -1; + } + agent = gpuAgents[exeIndex]; + break; + default: + fprintf(stderr, "Attempting to get HSA agent of unknown or unsupported executor type (%d)", + exeDevice.exeType); + return -1; + } + return 
ROCSHMEM_SUCCESS; + } + + // Get the hsa_agent_t associated with a MemDevice + static int GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent) + { + if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent); + if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU, memDevice.memIndex}, agent); + + fprintf(stderr, "Unable to get HSA agent for memDevice (%d,%d)", + memDevice.memType, memDevice.memIndex); + return -1; + } + + // Structure to track PCIe topology + struct PCIeNode + { + std::string address; ///< PCIe address for this PCIe node + std::string description; ///< Description for this PCIe node + std::set children; ///< Children PCIe nodes + + // Default constructor + PCIeNode() : address(""), description("") {} + + // Constructor + PCIeNode(std::string const& addr) : address(addr) {} + + // Constructor + PCIeNode(std::string const& addr, std::string const& desc) + :address(addr), description(desc) {} + + // Comparison operator for std::set + bool operator<(PCIeNode const& other) const { + return address < other.address; + } + }; + + // Structure to track information about IBV devices + struct IbvDevice + { + ibv_device* devicePtr; + std::string name; + std::string busId; + bool hasActivePort; + int numaNode; + int gidIndex; + std::string gidDescriptor; + bool isRoce; + }; + + // Function to collect information about IBV devices + //======================================================================================== + static bool IsConfiguredGid(union ibv_gid const& gid) + { + const struct in6_addr *a = (struct in6_addr *) gid.raw; + int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]); + if (((a->s6_addr32[0] | trailer) == 0UL) || + ((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) { + return false; + } + return true; + } + + static bool LinkLocalGid(union ibv_gid const& gid) + { + const struct in6_addr *a = (struct in6_addr *) gid.raw; + if (a->s6_addr32[0] == 
htonl(0xfe800000) && a->s6_addr32[1] == 0UL) { + return true; + } + return false; + } + + static int GetRoceVersionNumber(struct ibv_context* const& context, + int const& portNum, + int const& gidIndex, + int& version) + { + char const* deviceName = ibv_get_device_name(context->device); + char gidRoceVerStr[16] = {}; + char roceTypePath[PATH_MAX] = {}; + sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", + deviceName, portNum, gidIndex); + + int fd = open(roceTypePath, O_RDONLY); + if (fd == -1) { + fprintf(stderr, "Failed while opening RoCE file path (%s)", roceTypePath); + return -1; + } + + int ret = read(fd, gidRoceVerStr, 15); + close(fd); + + if (ret == -1) { + fprintf(stderr, "Failed while reading RoCE version"); + return -1; + } + + if (strlen(gidRoceVerStr)) { + if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0 + || strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) { + version = 1; + } + else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) { + version = 2; + } + } + return ROCSHMEM_SUCCESS; + } + + static bool IsIPv4MappedIPv6(const union ibv_gid &gid) + { + // look for ::ffff:x.x.x.x format + // From Broadcom documentation + // https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html + // "The IPv4 address is really an IPv4 address mapped into the IPv6 address space. + // This can be identified by 80 “0” bits, followed by 16 “1” bits (“FFFF” in hexadecimal) + // followed by the original 32-bit IPv4 address." 
+ return (gid.global.subnet_prefix == 0 && + gid.raw[8] == 0 && + gid.raw[9] == 0 && + gid.raw[10] == 0xff && + gid.raw[11] == 0xff); + } + + static int GetGidIndex(struct ibv_context* context, + int const& gidTblLen, + int const& portNum, + std::pair& gidInfo) + { + if(gidInfo.first >= 0) return ROCSHMEM_SUCCESS; // honor user choice + union ibv_gid gid; + + GidPriority highestPriority = GidPriority::UNKNOWN; + int gidIndex = -1; + + for (int i = 0; i < gidTblLen; ++i) { + IBV_CALL(ibv_query_gid, context, portNum, i, &gid); + if (!IsConfiguredGid(gid)) continue; + int gidCurrRoceVersion; + if(GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion) != ROCSHMEM_SUCCESS) continue; + GidPriority currPriority; + if (IsIPv4MappedIPv6(gid)) { + currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4 : GidPriority::ROCEV1_IPV4; + } else if (!LinkLocalGid(gid)) { + currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6 : GidPriority::ROCEV1_IPV6; + } else { + currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL : GidPriority::ROCEV1_LINK_LOCAL; + } + if(currPriority > highestPriority) { + highestPriority = currPriority; + gidIndex = i; + } + } + + if (highestPriority == GidPriority::UNKNOWN) { + gidInfo.first = -1; + fprintf(stderr, "Failed to auto-detect a valid GID index. 
Try setting it manually through IB_GID_INDEX"); + return -1; + } + gidInfo.first = gidIndex; + gidInfo.second = GidPriorityStr[highestPriority]; + return ROCSHMEM_SUCCESS; + } + + static vector& GetIbvDeviceList() + { + static bool isInitialized = false; + static vector ibvDeviceList = {}; + + // Build list on first use + if (!isInitialized) { + + // Query the number of IBV devices + int numIbvDevices = 0; + ibv_device** deviceList = ibv_get_device_list(&numIbvDevices); + + if (deviceList && numIbvDevices > 0) { + // Loop over each device to collect information + for (int i = 0; i < numIbvDevices; i++) { + IbvDevice ibvDevice; + ibvDevice.devicePtr = deviceList[i]; + ibvDevice.name = deviceList[i]->name; + ibvDevice.hasActivePort = false; + { + struct ibv_context *context = ibv_open_device(ibvDevice.devicePtr); + if (context) { + struct ibv_device_attr deviceAttr; + if (!ibv_query_device(context, &deviceAttr)) { + int activePort; + ibvDevice.gidIndex = -1; + for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) { + struct ibv_port_attr portAttr; + if (ibv_query_port(context, port, &portAttr)) continue; + if (portAttr.state == IBV_PORT_ACTIVE) { + activePort = port; + ibvDevice.hasActivePort = true; + if(portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) { + ibvDevice.isRoce = true; + std::pair gidInfo (-1, ""); + auto res = GetGidIndex(context, portAttr.gid_tbl_len, activePort, gidInfo); + if (res == ROCSHMEM_SUCCESS) { + ibvDevice.gidIndex = gidInfo.first; + ibvDevice.gidDescriptor = gidInfo.second; + } + } + break; + } + } + } + ibv_close_device(context); + } + } + ibvDevice.busId = ""; + { + std::string device_path(ibvDevice.devicePtr->dev_path); + if (std::filesystem::exists(device_path)) { + std::string pciPath = std::filesystem::canonical(device_path + "/device").string(); + std::size_t pos = pciPath.find_last_of('/'); + if (pos != std::string::npos) { + ibvDevice.busId = pciPath.substr(pos + 1); + } + } + } + + // Get nearest numa node for this device + 
ibvDevice.numaNode = -1; + std::filesystem::path devicePath = "/sys/bus/pci/devices/" + ibvDevice.busId + "/numa_node"; + std::string canonicalPath = std::filesystem::canonical(devicePath).string(); + + if (std::filesystem::exists(canonicalPath)) { + std::ifstream file(canonicalPath); + if (file.is_open()) { + std::string numaNodeStr; + std::getline(file, numaNodeStr); + int numaNodeVal; + if (sscanf(numaNodeStr.c_str(), "%d", &numaNodeVal) == 1) + ibvDevice.numaNode = numaNodeVal; + file.close(); + } + } + ibvDeviceList.push_back(ibvDevice); + } + } + ibv_free_device_list(deviceList); + isInitialized = true; + } + return ibvDeviceList; + } + + // PCIe-related functions + //======================================================================================== + + // Prints off PCIe tree + static void PrintPCIeTree(PCIeNode const& node, + std::string const& prefix = "", + bool isLast = true) + { + if (!node.address.empty()) { + printf("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str()); + if (!node.description.empty()) { + printf("(%s)", node.description.c_str()); + } + printf("\n"); + } + auto const& children = node.children; + for (auto it = children.begin(); it != children.end(); ++it) { + PrintPCIeTree(*it, prefix + (isLast ? 
" " : "│ "), std::next(it) == children.end()); + } + } + + // Inserts nodes along pcieAddress down a tree starting from root + static int InsertPCIePathToTree(std::string const& pcieAddress, + std::string const& description, + PCIeNode& root) + { + std::filesystem::path devicePath = "/sys/bus/pci/devices/" + pcieAddress; + std::string canonicalPath = std::filesystem::canonical(devicePath).string(); + + if (!std::filesystem::exists(devicePath)) { + fprintf(stderr, "Device path %s does not exist", devicePath.c_str()); + return -1; + } + + std::istringstream iss(canonicalPath); + std::string token; + + PCIeNode* currNode = &root; + while (std::getline(iss, token, '/')) { + auto it = (currNode->children.insert(PCIeNode(token))).first; + currNode = const_cast(&(*it)); + } + currNode->description = description; + + return ROCSHMEM_SUCCESS; + } + + // Returns root node for PCIe tree. Constructed on first use + static PCIeNode* GetPCIeTreeRoot() + { + static bool isInitialized = false; + static PCIeNode pcieRoot; + + // Build PCIe tree on first use + if (!isInitialized) { + // Add NICs to the tree + int numNics = rocshmem::GetNumDevices(rocshmem::EXE_NIC); + auto const& ibvDeviceList = rocshmem::GetIbvDeviceList(); + for (IbvDevice const& ibvDevice : ibvDeviceList) { + if (!ibvDevice.hasActivePort || ibvDevice.busId == "") continue; + InsertPCIePathToTree(ibvDevice.busId, ibvDevice.name, pcieRoot); + } + + // Add GPUs to the tree + int numGpus = rocshmem::GetNumDevices(rocshmem::EXE_GPU); + for (int i = 0; i < numGpus; ++i) { + char hipPciBusId[64]; + if (hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i) == hipSuccess) { + InsertPCIePathToTree(hipPciBusId, "GPU " + std::to_string(i), pcieRoot); + } + } +#ifdef VERBS_DEBUG + PrintPCIeTree(pcieRoot); +#endif + isInitialized = true; + } + return &pcieRoot; + } + + // Finds the lowest common ancestor in PCIe tree between two nodes + static PCIeNode const* GetLcaBetweenNodes(PCIeNode const* root, + std::string const& 
node1Address, + std::string const& node2Address) + { + if (!root || root->address == node1Address || root->address == node2Address) + return root; + + PCIeNode const* lcaFound1 = nullptr; + PCIeNode const* lcaFound2 = nullptr; + + // Recursively iterate over children + for (auto const& child : root->children) { + PCIeNode const* lca = GetLcaBetweenNodes(&child, node1Address, node2Address); + if (!lca) continue; + if (!lcaFound1) { + // First time found + lcaFound1 = lca; + } else { + // Second time found + lcaFound2 = lca; + break; + } + } + + // If two children were found, then current node is the lowest common ancestor + return (lcaFound1 && lcaFound2) ? root : lcaFound1; + } + + // Gets the depth of an node in the PCIe tree + static int GetLcaDepth(std::string const& targetBusID, + PCIeNode const* const& node, + int depth = 0) + { + if (!node) return -1; + if (targetBusID == node->address) return depth; + + for (auto const& child : node->children) { + int distance = GetLcaDepth(targetBusID, &child, depth + 1); + if (distance != -1) + return distance; + } + return -1; + } + + // Function to extract the bus number from a PCIe address (domain:bus:device.function) + static int ExtractBusNumber(std::string const& pcieAddress) + { + int domain, bus, device, function; + char delimiter; + + std::istringstream iss(pcieAddress); + iss >> std::hex >> domain >> delimiter >> bus >> delimiter >> device >> delimiter >> function; + if (iss.fail()) { +#ifdef VERBS_DEBUG + printf("Invalid PCIe address format: %s\n", pcieAddress.c_str()); +#endif + return -1; + } + return bus; + } + + // Function to compute the distance between two bus IDs + static int GetBusIdDistance(std::string const& pcieAddress1, + std::string const& pcieAddress2) + { + int bus1 = ExtractBusNumber(pcieAddress1); + int bus2 = ExtractBusNumber(pcieAddress2); + return (bus1 < 0 || bus2 < 0) ? 
-1 : std::abs(bus1 - bus2); + } + + // Given a target busID and a set of candidate devices, returns a set of indices + // that is "closest" to the target + static std::set GetNearestDevicesInTree(std::string const& targetBusId, + std::vector const& candidateBusIdList) + { + int maxDepth = -1; + int minDistance = std::numeric_limits::max(); + std::set matches = {}; + + // Loop over the candidates to find the ones with the lowest common ancestor (LCA) + for (int i = 0; i < candidateBusIdList.size(); i++) { + std::string const& candidateBusId = candidateBusIdList[i]; + if (candidateBusId == "") continue; + PCIeNode const* lca = GetLcaBetweenNodes(GetPCIeTreeRoot(), targetBusId, candidateBusId); + if (!lca) continue; + + int depth = GetLcaDepth(lca->address, GetPCIeTreeRoot()); + int currDistance = GetBusIdDistance(targetBusId, candidateBusId); + + // When more than one LCA match is found, choose the one with smallest busId difference + // NOTE: currDistance could be -1, which signals problem with parsing, however still + // remains a valid "closest" candidate, so is included + if (depth > maxDepth || (depth == maxDepth && depth >= 0 && currDistance < minDistance)) { + maxDepth = depth; + matches.clear(); + matches.insert(i); + minDistance = currDistance; + } else if (depth == maxDepth && depth >= 0 && currDistance == minDistance) { + matches.insert(i); + } + } + return matches; + } + + int GetNumDevices(DeviceType exeType) + { + switch (exeType) { + case rocshmem::EXE_CPU: + return numa_num_configured_nodes(); + case rocshmem::EXE_GPU: + { + int numDetectedGpus = 0; + hipError_t status = hipGetDeviceCount(&numDetectedGpus); + if (status != hipSuccess) numDetectedGpus = 0; + return numDetectedGpus; + } + case rocshmem::EXE_NIC: + { + return GetIbvDeviceList().size(); + } + default: + return 0; + } + } + + int GetClosestCpuNumaToGpu(int gpuIndex) + { + hsa_agent_t gpuAgent; + ERR_CHECK(GetHsaAgent({EXE_GPU, gpuIndex}, gpuAgent)); + + hsa_agent_t closestCpuAgent; + if 
(hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, &closestCpuAgent) + == HSA_STATUS_SUCCESS) { + int numCpus = GetNumDevices(EXE_CPU); + for (int i = 0; i < numCpus; i++) { + hsa_agent_t cpuAgent; + ERR_CHECK(GetHsaAgent({EXE_CPU, i}, cpuAgent)); + if (cpuAgent.handle == closestCpuAgent.handle) return i; + } + } + return -1; + } + + int GetClosestCpuNumaToNic(int nicIndex) + { + int numNics = GetNumDevices(rocshmem::EXE_NIC); + if (nicIndex < 0 || nicIndex >= numNics) return -1; + return GetIbvDeviceList()[nicIndex].numaNode; + } + + + int GetClosestNicToGpu(int gpuIndex, char** dev_name) + { + static bool isInitialized = false; + static std::vector closestNicId; + static auto const& ibvDeviceList = GetIbvDeviceList(); + + int numGpus = GetNumDevices(rocshmem::EXE_GPU); + if (gpuIndex < 0 || gpuIndex >= numGpus) return -1; + + // Build closest NICs per GPU on first use + if (!isInitialized) { + closestNicId.resize(numGpus, -1); + + // Build up list of NIC bus addresses + std::vector ibvAddressList; + for (auto const& ibvDevice : ibvDeviceList) + ibvAddressList.push_back(ibvDevice.hasActivePort ? 
ibvDevice.busId : ""); + + // Track how many times a device has been assigned as "closest" + // This allows distributed work across devices using multiple ports (sharing the same busID) + // NOTE: This isn't necessarily optimal, but likely to work in most cases involving multi-port + // Counter example: + // + // G0 prefers (N0,N1), picks N0 + // G1 prefers (N1,N2), picks N1 + // G2 prefers N0, picks N0 + // + // instead of G0->N1, G1->N2, G2->N0 + + std::vector assignedCount(ibvDeviceList.size(), 0); + + // Loop over each GPU to find the closest NIC(s) based on PCIe address + for (int i = 0; i < numGpus; i++) { + // Collect PCIe address for the GPU + char hipPciBusId[64]; + hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i); + if (err != hipSuccess) { +#ifdef VERBS_DEBUG + printf("Failed to get PCI Bus ID for HIP device %d: %s\n", i, hipGetErrorString(err)); +#endif + closestNicId[i] = -1; + continue; + } + + // Find closest NICs + std::set closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList); + + // Pick the least-used NIC to assign as closest + int closestIdx = -1; + for (auto idx : closestNicIdxs) { + if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx]) + closestIdx = idx; + } + + // The following will only use distance between bus IDs + // to determine the closest NIC to GPU if the PCIe tree approach fails + if (closestIdx < 0) { +#ifdef VERBS_DEBUG + printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n"); +#endif + + int minDistance = std::numeric_limits::max(); + for (int j = 0; j < ibvDeviceList.size(); j++) { + if (ibvDeviceList[j].busId != "") { + int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[j].busId); + if (distance < minDistance && distance >= 0) { + minDistance = distance; + closestIdx = j; + } + } + } + } + closestNicId[i] = closestIdx; + if (closestIdx != -1) assignedCount[closestIdx]++; + } + isInitialized = true; + } + + DPRINTF("GPU Device id: %d 
closest NIC id : %d name: %s\n", gpuIndex, closestNicId[gpuIndex], + ibvDeviceList[closestNicId[gpuIndex]].name.c_str()); + if (dev_name != NULL) { + *dev_name = strdup(ibvDeviceList[closestNicId[gpuIndex]].name.c_str()); + } + + return closestNicId[gpuIndex]; + } + + static int RemappedCpuIndex(int origIdx) + { + static std::vector remappingCpu; + + // Build CPU remapping on first use + // Skip numa nodes that are not configured + if (remappingCpu.empty()) { + for (int node = 0; node <= numa_max_node(); node++) + if (numa_bitmask_isbitset(numa_get_mems_allowed(), node)) + remappingCpu.push_back(node); + } + return remappingCpu[origIdx]; + } + + static void PrintNicToGPUTopo(bool outputToCsv) + { + printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n"); + if(!outputToCsv) + printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n"); + + int numGpus = rocshmem::GetNumDevices(rocshmem::EXE_GPU); + auto const& ibvDeviceList = rocshmem::GetIbvDeviceList(); + for (int i = 0; i < ibvDeviceList.size(); i++) { + + std::string closestGpusStr = ""; + for (int j = 0; j < numGpus; j++) { + if (rocshmem::GetClosestNicToGpu(j, nullptr) == i) { + if (closestGpusStr != "") closestGpusStr += ","; + closestGpusStr += std::to_string(j); + } + } + + printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n", + i, ibvDeviceList[i].name.c_str(), + ibvDeviceList[i].hasActivePort ? "Yes" : "No", + ibvDeviceList[i].busId.c_str(), + ibvDeviceList[i].numaNode, + closestGpusStr.c_str(), + ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort? std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A", + ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort? 
ibvDeviceList[i].gidDescriptor.c_str() : "N/A" + ); + } + printf("\n"); + } + + void DisplayTopology(bool outputToCsv) + { + int numCpus = rocshmem::GetNumDevices(rocshmem::EXE_CPU); + int numGpus = rocshmem::GetNumDevices(rocshmem::EXE_GPU); + int numNics = rocshmem::GetNumDevices(rocshmem::EXE_NIC); + char sep = (outputToCsv ? ',' : '|'); + + if (outputToCsv) { + printf("NumCpus,%d\n", numCpus); + printf("NumGpus,%d\n", numGpus); + printf("NumNics,%d\n", numNics); + } else { + printf("\nDetected Topology:\n"); + printf("==================\n"); + printf(" %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1); + printf(" %d GPU device(s)\n", numGpus); + printf(" %d Supported NIC device(s)\n", numNics); + } + + // Print out detected CPU topology + printf("\n %c", sep); + for (int j = 0; j < numCpus; j++) + printf("NUMA %02d%c", j, sep); + printf(" #Cpus %c Closest GPU(s)\n", sep); + + if (!outputToCsv) { + printf("------------+"); + for (int j = 0; j <= numCpus; j++) + printf("-------+"); + printf("---------------\n"); + } + + for (int i = 0; i < numCpus; i++) { + int nodeI = RemappedCpuIndex(i); + printf("NUMA %02d (%02d)%c", i, nodeI, sep); + for (int j = 0; j < numCpus; j++) { + int nodeJ = RemappedCpuIndex(j); + int numaDist = numa_distance(nodeI, nodeJ); + printf(" %5d %c", numaDist, sep); + } + + int numCpuCores = 0; + for (int j = 0; j < numa_num_configured_cpus(); j++) + if (numa_node_of_cpu(j) == nodeI) numCpuCores++; + printf(" %5d %c", numCpuCores, sep); + + for (int j = 0; j < numGpus; j++) { + if (rocshmem::GetClosestCpuNumaToGpu(j) == nodeI) { + printf(" %d", j); + } + } + printf("\n"); + } + printf("\n"); + + // Print out detected NIC topology + PrintNicToGPUTopo(outputToCsv); + } +} diff --git a/src/gda/topology.hpp b/src/gda/topology.hpp new file mode 100644 index 0000000000..2dfbfbbce9 --- /dev/null +++ b/src/gda/topology.hpp @@ -0,0 +1,247 @@ +/****************************************************************************** + * 
Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *****************************************************************************/ + +#pragma once +#include +#include +#include +#include +#include // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev) +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "util.hpp" + +namespace rocshmem +{ + using std::map; + using std::pair; + using std::set; + using std::vector; + + /** + * Enumeration of GID priority + * + * @note These are the GID types ordered in priority from lowest (0) to highest + */ + enum GidPriority + { + UNKNOWN = -1, ///< Default + ROCEV1_LINK_LOCAL = 0, ///< RoCEv1 Link-local + ROCEV2_LINK_LOCAL = 1, ///< RoCEv2 Link-local fe80::/10 + ROCEV1_IPV6 = 2, ///< RoCEv1 IPv6 + ROCEV2_IPV6 = 3, ///< RoCEv2 IPv6 + ROCEV1_IPV4 = 4, ///< RoCEv1 IPv4-mapped IPv6 + ROCEV2_IPV4 = 5, ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x + }; + + + /** + * Enumeration of supported memory types + * + * @note These are possible types of memory to be used as sources/destinations + */ + enum MemType + { + MEM_CPU = 0, ///< Coarse-grained pinned CPU memory + MEM_GPU = 1, ///< Coarse-grained global GPU memory + }; + + /** + * Enumeration of supported Executor types + * + * @note The Executor is the device used to perform a Transfer + * @note IBVerbs executor is currently not implemented yet + */ + + enum DeviceType + { + EXE_CPU = 0, + EXE_GPU = 1, + EXE_NIC = 2 + }; + + inline bool IsCpuExeType(DeviceType e){ return e == EXE_CPU; } + inline bool IsGpuExeType(DeviceType e){ return e == EXE_GPU; } + inline bool IsNicExeType(DeviceType e){ return e == EXE_NIC; } + + /** + * A ExeDevice defines a specific Executor + */ + struct ExeDevice + { + DeviceType exeType; ///< Device type + int32_t exeIndex; ///< Device index + + bool operator<(ExeDevice const& other) const { + 
return (exeType < other.exeType) || (exeType == other.exeType && exeIndex < other.exeIndex); + } + }; + + + /** + * A MemDevice indicates a memory type on a specific device + */ + struct MemDevice + { + MemType memType; ///< Memory type + int32_t memIndex; ///< Device index + + bool operator<(MemDevice const& other) const { + return (memType < other.memType) || (memType == other.memType && memIndex < other.memIndex); + } + }; + + inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU); } + inline bool IsGpuMemType(MemType m) { return (m == MEM_GPU); } + + /** + * Returns the index of the NUMA node closest to the given GPU + * + * @param[in] gpuIndex Index of the GPU to query + * @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect + */ + int GetClosestCpuNumaToGpu(int gpuIndex); + + /** + * Returns the index of the NUMA node closest to the given NIC + * + * @param[in] nicIndex Index of the NIC to query + * @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect + */ + int GetClosestCpuNumaToNic(int nicIndex); + + /** + * Returns the index of the NIC closest to the given GPU + * + * @param[in] gpuIndex Index of the GPU to query + * @param[out] dev_name Name of the IB Verbs capable NIC closest to GPU gpuIndex + * @returns Index of the IB Verbs capable NIC closest to GPU gpuIndex, or -1 if unable to detect + */ + int GetClosestNicToGpu(int gpuIndex, char **dev_name); + + /** + * Returns the number of available Devices of the given type + * + * @param[in] Type Hardware Device type to query + * @returns Number of detected Devices of type Type + */ + int GetNumDevices(DeviceType Type); + + void DisplayTopology(bool outputToCsv); + +}; + +//========================================================================================== +// End of rocshmem API +//========================================================================================== + +// Error check macros +#define ROCSHMEM_SUCCESS 0 + +#define 
ERR_CHECK(cmd) \ + do { \ + int error = cmd; \ + if (error != 0) { \ + fprintf(stderr, "error: %d at %s:%d\n", error, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CHECK_HSA(cmd) \ + do { \ + hsa_status_t error = cmd; \ + if (error != HSA_STATUS_SUCCESS) { \ + fprintf(stderr, "error: %d at %s:%d\n", error, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + + +// Helper macros for calling RDMA functions and reporting errors +#ifdef VERBS_DEBUG +#define IBV_CALL(__func__, ...) \ + do { \ + int error = __func__(__VA_ARGS__); \ + if (error != 0) { \ + fprintf(stderr,"Encountered IbVerbs error (%d) at line (%d) " \ + "and function (%s)", (error), __LINE__, #__func__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define IBV_PTR_CALL(__ptr__, __func__, ...) \ + do { \ + __ptr__ = __func__(__VA_ARGS__); \ + if (__ptr__ == nullptr) { \ + fprintf(stderr, "Encountered IbVerbs nullptr error at line (%d) " \ + "and function (%s)", __LINE__, #__func__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#else +#define IBV_CALL(__func__, ...) \ + do { \ + int error = __func__(__VA_ARGS__); \ + if (error != 0) { \ + fprintf(stderr, "Encountered IbVerbs error (%d) in func (%s) " \ + , error, #__func__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define IBV_PTR_CALL(__ptr__, __func__, ...) 
\ + do { \ + __ptr__ = __func__(__VA_ARGS__); \ + if (__ptr__ == nullptr) { \ + fprintf(stderr, "Encountered IbVerbs nullptr error in func (%s) ", \ + #__func__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif + diff --git a/src/host/host.cpp b/src/host/host.cpp index 87e8f351af..4cdc5f33a8 100644 --- a/src/host/host.cpp +++ b/src/host/host.cpp @@ -28,8 +28,8 @@ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "host_helpers.hpp" -#include "../memory/window_info.hpp" -#include "../util.hpp" +#include "memory/window_info.hpp" +#include "util.hpp" #include diff --git a/src/host/host.hpp b/src/host/host.hpp index ef1ef32563..9c777d2e0b 100644 --- a/src/host/host.hpp +++ b/src/host/host.hpp @@ -39,10 +39,10 @@ #include #include "rocshmem/rocshmem.hpp" -#include "../hdp_policy.hpp" -#include "../memory/symmetric_heap.hpp" -#include "../memory/window_info.hpp" -#include "../bootstrap/bootstrap.hpp" +#include "hdp_policy.hpp" +#include "memory/symmetric_heap.hpp" +#include "memory/window_info.hpp" +#include "bootstrap/bootstrap.hpp" namespace rocshmem { diff --git a/src/host/host_helpers.hpp b/src/host/host_helpers.hpp index d6d450a38c..4490c7a9da 100644 --- a/src/host/host_helpers.hpp +++ b/src/host/host_helpers.hpp @@ -26,7 +26,7 @@ #define LIBRARY_SRC_HOST_HOST_HELPERS_HPP_ #include "host.hpp" -#include "../memory/window_info.hpp" +#include "memory/window_info.hpp" #include diff --git a/src/host/host_templates.hpp b/src/host/host_templates.hpp index 79c837fa52..f95ef62e57 100644 --- a/src/host/host_templates.hpp +++ b/src/host/host_templates.hpp @@ -27,8 +27,8 @@ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "host_helpers.hpp" -#include "../memory/window_info.hpp" -#include "../team.hpp" +#include "memory/window_info.hpp" +#include "team.hpp" #include #include diff --git a/src/ipc/backend_ipc.cpp b/src/ipc/backend_ipc.cpp index 6aea27b784..895bf924d1 100644 --- a/src/ipc/backend_ipc.cpp +++ 
b/src/ipc/backend_ipc.cpp @@ -61,8 +61,7 @@ int get_ls_non_zero_bit(char *bitmask, int mask_length) { return position; } -IPCBackend::IPCBackend(MPI_Comm comm) - : Backend(comm) { +IPCBackend::IPCBackend(MPI_Comm comm): Backend(comm) { type = BackendType::IPC_BACKEND; initIPC(); @@ -83,8 +82,7 @@ IPCBackend::IPCBackend(MPI_Comm comm) init(); } -IPCBackend::IPCBackend(TcpBootstrap *bootstrap) - : Backend(bootstrap) { +IPCBackend::IPCBackend(TcpBootstrap *bootstrap): Backend(bootstrap) { type = BackendType::IPC_BACKEND; initIPC(bootstrap); // no MPI involved @@ -115,7 +113,7 @@ void IPCBackend::init() { setup_team_world(); - init_wrk_sync_buffer(); + setup_wrk_sync_buffers(); rocshmem_collective_init(); @@ -203,14 +201,14 @@ void IPCBackend::team_destroy(rocshmem_team_t team) { /* Mark the pool as available */ int bit = team_obj->pool_index_; int byte_i = bit / CHAR_BIT; - pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT); + team_pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT); team_obj->~IPCTeam(); CHECK_HIP(hipFree(team_obj)); } void IPCBackend::Allreduce_char_BAND (char* inbuf, char *outbuf, size_t num_bytes, - Team *team) { + Team *team) { // Implement an Allreduce outside of MPI. This is specialized for the scenario // required for the team creation, i.e. assuming bytes and using BAND operation. @@ -251,16 +249,16 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team, * the pool of available work arrays. 
*/ if (team_comm != MPI_COMM_NULL) { - NET_CHECK(MPI_Allreduce(pool_bitmask_, reduced_bitmask_, bitmask_size_, - MPI_CHAR, MPI_BAND, team_comm)); + NET_CHECK(MPI_Allreduce(team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_, + MPI_CHAR, MPI_BAND, team_comm)); } else { - Allreduce_char_BAND (pool_bitmask_, reduced_bitmask_, bitmask_size_, parent_team); + Allreduce_char_BAND (team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_, parent_team); } /* Pick the least significant non-zero bit (logical layout) in the reduced * bitmask */ auto max_num_teams{team_tracker.get_max_num_teams()}; - int common_index = get_ls_non_zero_bit(reduced_bitmask_, max_num_teams); + int common_index = get_ls_non_zero_bit(team_reduced_bitmask_, max_num_teams); if (common_index < 0) { /* No team available */ printf("Could not create team, all bits in use. Aborting.\n"); @@ -269,7 +267,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team, /* Mark the team as taken (by unsetting the bit in the pool bitmask) */ int byte = common_index / CHAR_BIT; - pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT)); + team_pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT)); /** * Allocate device-side memory for team_world and @@ -329,11 +327,11 @@ void IPCBackend::global_exit(int status) { } void IPCBackend::teams_destroy() { - free(pool_bitmask_); - free(reduced_bitmask_); + free(team_pool_bitmask_); + free(team_reduced_bitmask_); } -void IPCBackend::init_wrk_sync_buffer() { +void IPCBackend::setup_wrk_sync_buffers() { /** * calcualte work/sync buffer size */ @@ -342,12 +340,12 @@ void IPCBackend::init_wrk_sync_buffer() { /** * size of barrier sync */ - Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE; + wrk_sync_pool_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE; /** * Size of sync arrays for the teams */ - Wrk_Sync_buffer_size_ += sizeof(long) * max_num_teams * + wrk_sync_pool_size_ += sizeof(long) * max_num_teams * 
(ROCSHMEM_BARRIER_SYNC_SIZE + ROCSHMEM_REDUCE_SYNC_SIZE + ROCSHMEM_BCAST_SYNC_SIZE + @@ -357,23 +355,23 @@ void IPCBackend::init_wrk_sync_buffer() { * Size of work arrays for the teams * Accommodate largest possible data type for pWrk */ - Wrk_Sync_buffer_size_ += sizeof(double) * max_num_teams * + wrk_sync_pool_size_ += sizeof(double) * max_num_teams * (ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE + ROCSHMEM_ATA_MAX_WRKDATA_SIZE); /** * Size of fence array */ - Wrk_Sync_buffer_size_ += sizeof(int) * num_pes; + wrk_sync_pool_size_ += sizeof(int) * num_pes; /** - * Allocate a buffer of size Wrk_Sync_buffer_size_, using fine-grained + * Allocate a buffer of size wrk_sync_pool_size_, using fine-grained * memory allocator */ - fine_grained_allocator_.allocate((void**)&Wrk_Sync_buffer_ptr_, - Wrk_Sync_buffer_size_); - assert(Wrk_Sync_buffer_ptr_); - temp_Wrk_Sync_buff_ptr_ = Wrk_Sync_buffer_ptr_; + fine_grained_allocator_.allocate((void**)&wrk_sync_pool_, + wrk_sync_pool_size_); + assert(wrk_sync_pool_); + wrk_sync_pool_top_ = wrk_sync_pool_; /* * Allocate a c-array to hold the IPC handles @@ -383,16 +381,16 @@ void IPCBackend::init_wrk_sync_buffer() { /* * Call into the hip runtime to get an IPC handle for the allocated - * Wrk_Sync_buffer_ and store that IPC handle + * wrk_sync_pool_ buffer and store that IPC handle */ - CHECK_HIP(hipIpcGetMemHandle(&ipc_handle[my_pe], Wrk_Sync_buffer_ptr_)); + CHECK_HIP(hipIpcGetMemHandle(&ipc_handle[my_pe], wrk_sync_pool_)); /* * all-to-all exchange with each PE to share the IPC handles. 
*/ if (backend_comm != MPI_COMM_NULL) { MPI_Allgather(MPI_IN_PLACE, sizeof(hipIpcMemHandle_t), MPI_CHAR, - ipc_handle, sizeof(hipIpcMemHandle_t), MPI_CHAR, backend_comm); + ipc_handle, sizeof(hipIpcMemHandle_t), MPI_CHAR, backend_comm); } else { assert (backend_bootstr != nullptr); backend_bootstr->allGather(ipc_handle, sizeof(hipIpcMemHandle_t)); @@ -403,9 +401,9 @@ void IPCBackend::init_wrk_sync_buffer() { * work/sync buffers */ fine_grained_allocator_.allocate( - reinterpret_cast(&Wrk_Sync_buffer_bases_), + reinterpret_cast(&wrk_sync_pool_bases_), num_pes * sizeof(char*)); - assert(Wrk_Sync_buffer_bases_); + assert(wrk_sync_pool_bases_); /* * For all local processing elements, initialize the device-side array @@ -414,11 +412,11 @@ void IPCBackend::init_wrk_sync_buffer() { for (int i = 0; i < num_pes; i++) { if (i != my_pe) { CHECK_HIP(hipIpcOpenMemHandle( - reinterpret_cast(&Wrk_Sync_buffer_bases_[i]), + reinterpret_cast(&wrk_sync_pool_bases_[i]), ipc_handle[i], hipIpcMemLazyEnablePeerAccess)); } else { - Wrk_Sync_buffer_bases_[i] = Wrk_Sync_buffer_ptr_; + wrk_sync_pool_bases_[i] = wrk_sync_pool_; } } } @@ -426,19 +424,19 @@ void IPCBackend::init_wrk_sync_buffer() { void IPCBackend::cleanup_wrk_sync_buffer() { for (int i = 0; i < num_pes; i++) { if (i != my_pe) { - CHECK_HIP(hipIpcCloseMemHandle(Wrk_Sync_buffer_bases_[i])); + CHECK_HIP(hipIpcCloseMemHandle(wrk_sync_pool_bases_[i])); } } - fine_grained_allocator_.deallocate(Wrk_Sync_buffer_bases_); - fine_grained_allocator_.deallocate(Wrk_Sync_buffer_ptr_); + fine_grained_allocator_.deallocate(wrk_sync_pool_bases_); + fine_grained_allocator_.deallocate(wrk_sync_pool_); } void IPCBackend::setup_fence_buffer() { /* * Allocate memory for fence */ - fence_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(int) * num_pes; + fence_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(int) * num_pes; } void IPCBackend::rocshmem_collective_init() { @@ -448,8 +446,8 
@@ void IPCBackend::rocshmem_collective_init() { size_t one_sync_size_bytes {sizeof(*barrier_sync)}; size_t sync_size_bytes {one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE}; - barrier_sync = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sync_size_bytes; + barrier_sync = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sync_size_bytes; /* * Initialize the barrier synchronization array with default values. @@ -475,30 +473,30 @@ void IPCBackend::teams_init() { */ auto max_num_teams{team_tracker.get_max_num_teams()}; - barrier_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE + barrier_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams; - reduce_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE + reduce_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams; - bcast_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE + bcast_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams; - alltoall_pSync_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE + alltoall_pSync_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams; /* Accommodating for largest possible data type for pWrk */ - pWrk_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE + pWrk_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * 
max_num_teams; - pAta_pool = reinterpret_cast(temp_Wrk_Sync_buff_ptr_); - temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE + pAta_pool = reinterpret_cast(wrk_sync_pool_top_); + wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE * max_num_teams; /** @@ -540,18 +538,18 @@ void IPCBackend::teams_init() { * Description shows only a 2-byte long mask but idea extends to any * arbitrary size. */ - bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1) + team_bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1) : (max_num_teams / CHAR_BIT); - pool_bitmask_ = reinterpret_cast(malloc(bitmask_size_)); - reduced_bitmask_ = reinterpret_cast(malloc(bitmask_size_)); + team_pool_bitmask_ = reinterpret_cast(malloc(team_bitmask_size_)); + team_reduced_bitmask_ = reinterpret_cast(malloc(team_bitmask_size_)); - memset(pool_bitmask_, 0, bitmask_size_); - memset(reduced_bitmask_, 0, bitmask_size_); + memset(team_pool_bitmask_, 0, team_bitmask_size_); + memset(team_reduced_bitmask_, 0, team_bitmask_size_); /* Set all to available except the 0th one (reserved for TEAM_WORLD) */ for (int bit_i = 1; bit_i < max_num_teams; bit_i++) { int byte_i = bit_i / CHAR_BIT; - pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT); + team_pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT); } /** diff --git a/src/ipc/backend_ipc.hpp b/src/ipc/backend_ipc.hpp index b783fc6dc4..3036fcfa28 100644 --- a/src/ipc/backend_ipc.hpp +++ b/src/ipc/backend_ipc.hpp @@ -25,14 +25,14 @@ #ifndef LIBRARY_SRC_IPC_BACKEND_HPP_ #define LIBRARY_SRC_IPC_BACKEND_HPP_ -#include "../backend_bc.hpp" -#include "../containers/free_list_impl.hpp" -#include "../hdp_proxy.hpp" -#include "../memory/hip_allocator.hpp" -#include "../context_incl.hpp" +#include "backend_bc.hpp" +#include "containers/free_list_impl.hpp" +#include "hdp_proxy.hpp" +#include "memory/hip_allocator.hpp" +#include "context_incl.hpp" #include "ipc_context_proxy.hpp" -#include 
"../ipc_policy.hpp" -#include "../bootstrap/bootstrap.hpp" +#include "ipc_policy.hpp" +#include "bootstrap/bootstrap.hpp" namespace rocshmem { @@ -113,7 +113,7 @@ class IPCBackend : public Backend { * * @return Vector containing the addresses of the work/sync bases */ - char** get_wrk_sync_bases() { return Wrk_Sync_buffer_bases_; } + char** get_wrk_sync_bases() { return wrk_sync_pool_bases_; } /** * @brief The host-facing interface that will be used @@ -244,7 +244,7 @@ class IPCBackend : public Backend { /** * @brief The bitmask representing the availability of teams in the pool */ - char *pool_bitmask_{nullptr}; + char *team_pool_bitmask_{nullptr}; /** * @brief Bitmask to store the reduced result of bitmasks on pariticipating @@ -253,12 +253,12 @@ class IPCBackend : public Backend { * With no thread-safety for this bitmask, multithreaded creation of teams is * not supported. */ - char *reduced_bitmask_{nullptr}; + char *team_reduced_bitmask_{nullptr}; /** * @brief Size of the bitmask */ - int bitmask_size_{-1}; + int team_bitmask_size_{-1}; /** * Fine grained memory allocator for buffers used in collectives Routines @@ -268,31 +268,31 @@ class IPCBackend : public Backend { /** * @brief Collective routines work/sync buffer size */ - size_t Wrk_Sync_buffer_size_{}; + size_t wrk_sync_pool_size_{}; /** * @brief Collective routines work/sync buffer base ptr */ - char* const Wrk_Sync_buffer_ptr_{nullptr}; + char* const wrk_sync_pool_{nullptr}; /** * @brief Temporary buffer pointer pointing to the same address as - * Wrk_Sync_buffer_ptr_, used to calculate the starting addresses of + * wrk_sync_pool_, used to calculate the starting addresses of * different work and sync buffers. 
*/ - char *temp_Wrk_Sync_buff_ptr_{nullptr}; + char *wrk_sync_pool_top_{nullptr}; /** * @brief Array containing the addresses of the work/sync buffer bases * of other PEs */ - char** Wrk_Sync_buffer_bases_{nullptr}; + char** wrk_sync_pool_bases_{nullptr}; /** * @brief Initialize memory required for work/sync buffers and open IPC - * handle on PE's Wrk_Sync_buffer_ptr. + * handle on PE's wrk_sync_pool. */ - void init_wrk_sync_buffer(); + void setup_wrk_sync_buffers(); /** * @brief Close IPC memory handles for work/sync buffers and deallocate diff --git a/src/ipc/context_ipc_device.cpp b/src/ipc/context_ipc_device.cpp index f68144ceae..d6454c58ce 100644 --- a/src/ipc/context_ipc_device.cpp +++ b/src/ipc/context_ipc_device.cpp @@ -22,19 +22,14 @@ * IN THE SOFTWARE. *****************************************************************************/ -#include "context_ipc_device.hpp" -#include "context_ipc_tmpl_device.hpp" - #include #include -#include - -#include -#include #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "backend_ipc.hpp" +#include "context_ipc_device.hpp" +#include "context_ipc_tmpl_device.hpp" namespace rocshmem { @@ -46,7 +41,7 @@ __host__ IPCContext::IPCContext(Backend *b, unsigned int ctx_id) barrier_sync = backend->barrier_sync; fence_pool = backend->fence_pool; - Wrk_Sync_buffer_bases_ = backend->get_wrk_sync_bases(); + wrk_sync_pool_bases_ = backend->get_wrk_sync_bases(); ctx_id_ = ctx_id; orders_.store = detail::atomic::rocshmem_memory_order::memory_order_seq_cst; @@ -64,18 +59,15 @@ __device__ void IPCContext::ctx_destroy(){ __device__ void IPCContext::putmem(void *dest, const void *source, size_t nelems, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[pe] + L_offset, - const_cast(source), nelems); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + 
ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[pe] + L_offset, const_cast(source), nelems); ipcImpl_.ipcFence(); } __device__ void IPCContext::getmem(void *dest, const void *source, size_t nelems, int pe) { const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = - const_cast(src_typed) - ipcImpl_.ipc_bases[my_pe]; + uint64_t L_offset = const_cast(src_typed) - ipcImpl_.ipc_bases[my_pe]; ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[pe] + L_offset, nelems); ipcImpl_.ipcFence(); } @@ -107,26 +99,22 @@ __device__ void IPCContext::quiet() { __device__ void *IPCContext::shmem_ptr(const void *dest, int pe) { void *ret = nullptr; void *dst = const_cast(dest); - uint64_t L_offset = - reinterpret_cast(dst) - ipcImpl_.ipc_bases[my_pe]; - ret = ipcImpl_.ipc_bases[pe] + L_offset; + uint64_t L_offset = reinterpret_cast(dst) - ipcImpl_.ipc_bases[my_pe]; + ret = ipcImpl_.ipc_bases[pe] + L_offset; return ret; } __device__ void IPCContext::putmem_wg(void *dest, const void *source, size_t nelems, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[pe] + L_offset, - const_cast(source), nelems); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[pe] + L_offset, const_cast(source), nelems); __syncthreads(); } __device__ void IPCContext::getmem_wg(void *dest, const void *source, size_t nelems, int pe) { const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = - const_cast(src_typed) - ipcImpl_.ipc_bases[my_pe]; + uint64_t L_offset = const_cast(src_typed) - ipcImpl_.ipc_bases[my_pe]; ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[pe] + L_offset, nelems); __syncthreads(); } @@ -143,20 +131,16 @@ __device__ void IPCContext::getmem_nbi_wg(void *dest, const void *source, __device__ void IPCContext::putmem_wave(void *dest, const void *source, size_t nelems, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - 
ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[pe] + L_offset, - const_cast(source), nelems); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[pe] + L_offset, const_cast(source), nelems); ipcImpl_.ipcFence(); } __device__ void IPCContext::getmem_wave(void *dest, const void *source, size_t nelems, int pe) { const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = - const_cast(src_typed) - ipcImpl_.ipc_bases[my_pe]; - ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[pe] + L_offset, - nelems); + uint64_t L_offset = const_cast(src_typed) - ipcImpl_.ipc_bases[my_pe]; + ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[pe] + L_offset, nelems); ipcImpl_.ipcFence(); } @@ -172,56 +156,46 @@ __device__ void IPCContext::getmem_nbi_wave(void *dest, const void *source, __device__ void IPCContext::internal_putmem(void *dest, const void *source, size_t nelems, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - Wrk_Sync_buffer_bases_[my_pe]; - memcpy(Wrk_Sync_buffer_bases_[pe] + L_offset, - const_cast(source), nelems); + uint64_t L_offset = reinterpret_cast(dest) - wrk_sync_pool_bases_[my_pe]; + memcpy(wrk_sync_pool_bases_[pe] + L_offset, const_cast(source), nelems); ipcImpl_.ipcFence(); } __device__ void IPCContext::internal_getmem(void *dest, const void *source, size_t nelems, int pe) { const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = - const_cast(src_typed) - Wrk_Sync_buffer_bases_[my_pe]; - memcpy(dest, Wrk_Sync_buffer_bases_[pe] + L_offset, nelems); + uint64_t L_offset = const_cast(src_typed) - wrk_sync_pool_bases_[my_pe]; + memcpy(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems); ipcImpl_.ipcFence(); } __device__ void IPCContext::internal_putmem_wg(void *dest, const void *source, size_t nelems, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - Wrk_Sync_buffer_bases_[my_pe]; - memcpy_wg(Wrk_Sync_buffer_bases_[pe] + L_offset, - const_cast(source), nelems); + 
uint64_t L_offset = reinterpret_cast(dest) - wrk_sync_pool_bases_[my_pe]; + memcpy_wg(wrk_sync_pool_bases_[pe] + L_offset, const_cast(source), nelems); __syncthreads(); } __device__ void IPCContext::internal_getmem_wg(void *dest, const void *source, size_t nelems, int pe) { const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = - const_cast(src_typed) - Wrk_Sync_buffer_bases_[my_pe]; - memcpy_wg(dest, Wrk_Sync_buffer_bases_[pe] + L_offset, nelems); + uint64_t L_offset = const_cast(src_typed) - wrk_sync_pool_bases_[my_pe]; + memcpy_wg(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems); __syncthreads(); } __device__ void IPCContext::internal_putmem_wave(void *dest, const void *source, size_t nelems, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - Wrk_Sync_buffer_bases_[my_pe]; - memcpy_wave(Wrk_Sync_buffer_bases_[pe] + L_offset, - const_cast(source), nelems); + uint64_t L_offset = reinterpret_cast(dest) - wrk_sync_pool_bases_[my_pe]; + memcpy_wave(wrk_sync_pool_bases_[pe] + L_offset, const_cast(source), nelems); ipcImpl_.ipcFence(); } __device__ void IPCContext::internal_getmem_wave(void *dest, const void *source, size_t nelems, int pe) { const char *src_typed = reinterpret_cast(source); - uint64_t L_offset = - const_cast(src_typed) - Wrk_Sync_buffer_bases_[my_pe]; - memcpy_wave(dest, Wrk_Sync_buffer_bases_[pe] + L_offset, - nelems); + uint64_t L_offset = const_cast(src_typed) - wrk_sync_pool_bases_[my_pe]; + memcpy_wave(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems); ipcImpl_.ipcFence(); } diff --git a/src/ipc/context_ipc_device.hpp b/src/ipc/context_ipc_device.hpp index 03450a4924..046bc1d31e 100644 --- a/src/ipc/context_ipc_device.hpp +++ b/src/ipc/context_ipc_device.hpp @@ -25,9 +25,9 @@ #ifndef LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_ #define LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_ -#include "../context.hpp" -#include "../atomic.hpp" -#include "../team.hpp" +#include "context.hpp" +#include "atomic.hpp" +#include "team.hpp" namespace 
rocshmem { @@ -235,8 +235,8 @@ class IPCContext : public Context { //internal functions used by collective operations template __device__ void internal_broadcast(T *dest, const T *source, int nelems, int pe_root, - int pe_start, int stride, int pe_size, - long *p_sync); // NOLINT(runtime/int) + int pe_start, int stride, int pe_size, + long *p_sync); // NOLINT(runtime/int) template __device__ void internal_put_broadcast(T *dst, const T *src, int nelems, @@ -311,7 +311,7 @@ class IPCContext : public Context { * @brief Array containing the addresses of the work/sync buffer bases * of other PEs */ - char **Wrk_Sync_buffer_bases_{nullptr}; + char **wrk_sync_pool_bases_{nullptr}; /** * @brief Decive context Id diff --git a/src/ipc/context_ipc_device_coll.cpp b/src/ipc/context_ipc_device_coll.cpp index 72223aa0d5..78547f701c 100644 --- a/src/ipc/context_ipc_device_coll.cpp +++ b/src/ipc/context_ipc_device_coll.cpp @@ -23,16 +23,16 @@ *****************************************************************************/ #include "rocshmem/rocshmem.hpp" -#include "../context_incl.hpp" +#include "context_incl.hpp" #include "context_ipc_tmpl_device.hpp" -#include "../util.hpp" +#include "util.hpp" #include "ipc_team.hpp" namespace rocshmem { __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start, - int stride, int n_pes, - int64_t *pSync) { + int stride, int n_pes, + int64_t *pSync) { int64_t flag_val = 1; if (pe == PE_start) { // Go through all PE offsets (except current offset = 0) @@ -67,8 +67,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start, } __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start, - int stride, int n_pes, - int64_t *pSync) { + int stride, int n_pes, + int64_t *pSync) { int64_t flag_val = 1; if (pe == PE_start) { wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1)); @@ -96,7 +96,7 @@ __device__ void IPCContext::internal_sync(int pe, int PE_start, int stride, } __device__ void 
IPCContext::internal_sync_wave(int pe, int PE_start, int stride, - int PE_size, int64_t *pSync) { + int PE_size, int64_t *pSync) { if (is_thread_zero_in_wave()) { if (PE_size < 64) { internal_direct_barrier(pe, PE_start, stride, PE_size, pSync); @@ -108,7 +108,7 @@ __device__ void IPCContext::internal_sync_wave(int pe, int PE_start, int stride, // Uses PE values that are relative to world __device__ void IPCContext::internal_sync_wg(int pe, int PE_start, int stride, - int PE_size, int64_t *pSync) { + int PE_size, int64_t *pSync) { __syncthreads(); if (is_thread_zero_in_block()) { if (PE_size < 64) { diff --git a/src/ipc/context_ipc_host.cpp b/src/ipc/context_ipc_host.cpp index 7c459df1ba..e6e630355d 100644 --- a/src/ipc/context_ipc_host.cpp +++ b/src/ipc/context_ipc_host.cpp @@ -27,15 +27,15 @@ #include #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../backend_type.hpp" -#include "../context_incl.hpp" +#include "backend_type.hpp" +#include "context_incl.hpp" #include "backend_ipc.hpp" -#include "../host/host.hpp" +#include "host/host.hpp" namespace rocshmem { __host__ IPCHostContext::IPCHostContext(Backend *backend, - [[maybe_unused]] int64_t options) + [[maybe_unused]] int64_t options) : Context(backend, true) { IPCBackend *b{static_cast(backend)}; @@ -60,22 +60,22 @@ __host__ IPCHostContext::~IPCHostContext() { } __host__ void IPCHostContext::putmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { + size_t nelems, int pe) { host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info); } __host__ void IPCHostContext::getmem_nbi(void *dest, const void *source, - size_t nelems, int pe) { + size_t nelems, int pe) { host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info); } __host__ void IPCHostContext::putmem(void *dest, const void *source, - size_t nelems, int pe) { + size_t nelems, int pe) { host_interface->putmem(dest, source, nelems, pe, context_window_info); } __host__ void 
IPCHostContext::getmem(void *dest, const void *source, - size_t nelems, int pe) { + size_t nelems, int pe) { host_interface->getmem(dest, source, nelems, pe, context_window_info); } diff --git a/src/ipc/context_ipc_host.hpp b/src/ipc/context_ipc_host.hpp index ddec120299..e14f905035 100644 --- a/src/ipc/context_ipc_host.hpp +++ b/src/ipc/context_ipc_host.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_IPC_CONTEXT_HOST_HPP_ #define LIBRARY_SRC_IPC_CONTEXT_HOST_HPP_ -#include "../context.hpp" +#include "context.hpp" namespace rocshmem { @@ -116,9 +116,9 @@ class IPCHostContext : public Context { template __host__ size_t wait_until_some(T *ivars, size_t nelems, - size_t* indices, - const int *status, - int cmp, T val); + size_t* indices, + const int *status, + int cmp, T val); template __host__ void wait_until_all_vector(T *ivars, size_t nelems, diff --git a/src/ipc/context_ipc_tmpl_device.hpp b/src/ipc/context_ipc_tmpl_device.hpp index 13094d8091..d35cec7f44 100644 --- a/src/ipc/context_ipc_tmpl_device.hpp +++ b/src/ipc/context_ipc_tmpl_device.hpp @@ -28,9 +28,9 @@ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" #include "context_ipc_device.hpp" -#include "../util.hpp" +#include "util.hpp" #include "ipc_team.hpp" -#include "../rocshmem_calc.hpp" +#include "rocshmem_calc.hpp" #include @@ -45,14 +45,12 @@ __device__ void IPCContext::p(T *dest, T value, int pe) { } template -__device__ void IPCContext::put(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::put(T *dest, const T *source, size_t nelems, int pe) { putmem(dest, source, nelems * sizeof(T), pe); } template -__device__ void IPCContext::put_nbi(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) { putmem_nbi(dest, source, sizeof(T) * nelems, pe); } @@ -64,32 +62,26 @@ __device__ T IPCContext::g(const T *source, int pe) { } template -__device__ void 
IPCContext::get(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::get(T *dest, const T *source, size_t nelems, int pe) { getmem(dest, source, sizeof(T) * nelems, pe); } template -__device__ void IPCContext::get_nbi(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) { getmem_nbi(dest, source, sizeof(T) * nelems, pe); } // Atomics template __device__ void IPCContext::amo_add(void *dest, T value, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - ipcImpl_.ipcAMOAdd( - reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), value); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + ipcImpl_.ipcAMOAdd(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), value); } template __device__ void IPCContext::amo_set(void *dest, T value, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - ipcImpl_.ipcAMOSet( - reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), value); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + ipcImpl_.ipcAMOSet(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), value); } template @@ -140,34 +132,25 @@ __device__ void IPCContext::amo_xor(void *dst, T value, int pe) { template __device__ void IPCContext::amo_cas(void *dest, T value, T cond, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - ipcImpl_.ipcAMOCas( - reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), cond, - value); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + ipcImpl_.ipcAMOCas(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), cond, value); } template __device__ T IPCContext::amo_fetch_add(void *dest, T value, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - return ipcImpl_.ipcAMOFetchAdd( - reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), 
value); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + return ipcImpl_.ipcAMOFetchAdd(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), value); } template __device__ T IPCContext::amo_fetch_cas(void *dest, T value, T cond, int pe) { - uint64_t L_offset = - reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; - return ipcImpl_.ipcAMOFetchCas( - reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), cond, - value); + uint64_t L_offset = reinterpret_cast(dest) - ipcImpl_.ipc_bases[my_pe]; + return ipcImpl_.ipcAMOFetchCas(reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), cond, value); } // Collectives template -__device__ void compute_reduce(T *src, T *dst, int size, int wg_id, - int wg_size) { +__device__ void compute_reduce(T *src, T *dst, int size, int wg_id, int wg_size) { for (int i = wg_id; i < size; i += wg_size) { OpWrap::Calc(src, dst, i); } @@ -543,50 +526,42 @@ __device__ void IPCContext::fcollect_linear(rocshmem_team_t team, T *dst, // Block/wave functions template -__device__ void IPCContext::put_wg(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::put_wg(T *dest, const T *source, size_t nelems, int pe) { putmem_wg(dest, source, nelems * sizeof(T), pe); } template -__device__ void IPCContext::put_nbi_wg(T *dest, const T *source, - size_t nelems, int pe) { +__device__ void IPCContext::put_nbi_wg(T *dest, const T *source, size_t nelems, int pe) { putmem_nbi_wg(dest, source, nelems * sizeof(T), pe); } template -__device__ void IPCContext::put_wave(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::put_wave(T *dest, const T *source, size_t nelems, int pe) { putmem_wave(dest, source, nelems * sizeof(T), pe); } template -__device__ void IPCContext::put_nbi_wave(T *dest, const T *source, - size_t nelems, int pe) { +__device__ void IPCContext::put_nbi_wave(T *dest, const T *source, size_t nelems, int pe) { putmem_nbi_wave(dest, source, nelems * sizeof(T), pe); } template 
-__device__ void IPCContext::get_wg(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::get_wg(T *dest, const T *source, size_t nelems, int pe) { getmem_wg(dest, source, nelems * sizeof(T), pe); } template -__device__ void IPCContext::get_nbi_wg(T *dest, const T *source, - size_t nelems, int pe) { +__device__ void IPCContext::get_nbi_wg(T *dest, const T *source, size_t nelems, int pe) { getmem_nbi_wg(dest, source, nelems * sizeof(T), pe); } template -__device__ void IPCContext::get_wave(T *dest, const T *source, size_t nelems, - int pe) { +__device__ void IPCContext::get_wave(T *dest, const T *source, size_t nelems, int pe) { getmem_wave(dest, source, nelems * sizeof(T), pe); } template -__device__ void IPCContext::get_nbi_wave(T *dest, const T *source, - size_t nelems, int pe) { +__device__ void IPCContext::get_nbi_wave(T *dest, const T *source, size_t nelems, int pe) { getmem_nbi_wave(dest, source, nelems * sizeof(T), pe); } diff --git a/src/ipc/context_ipc_tmpl_host.hpp b/src/ipc/context_ipc_tmpl_host.hpp index aad55260c4..f8fc4aa8cb 100644 --- a/src/ipc/context_ipc_tmpl_host.hpp +++ b/src/ipc/context_ipc_tmpl_host.hpp @@ -26,7 +26,7 @@ #define LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../host/host_templates.hpp" +#include "host/host_templates.hpp" namespace rocshmem { @@ -41,26 +41,22 @@ __host__ T IPCHostContext::g(const T *source, int pe) { } template -__host__ void IPCHostContext::put(T *dest, const T *source, size_t nelems, - int pe) { +__host__ void IPCHostContext::put(T *dest, const T *source, size_t nelems, int pe) { host_interface->put(dest, source, nelems, pe, context_window_info); } template -__host__ void IPCHostContext::get(T *dest, const T *source, size_t nelems, - int pe) { +__host__ void IPCHostContext::get(T *dest, const T *source, size_t nelems, int pe) { host_interface->get(dest, source, nelems, pe, context_window_info); } template 
-__host__ void IPCHostContext::put_nbi(T *dest, const T *source, size_t nelems, - int pe) { +__host__ void IPCHostContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) { host_interface->put_nbi(dest, source, nelems, pe, context_window_info); } template -__host__ void IPCHostContext::get_nbi(T *dest, const T *source, size_t nelems, - int pe) { +__host__ void IPCHostContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) { host_interface->get_nbi(dest, source, nelems, pe, context_window_info); } @@ -81,8 +77,7 @@ __host__ T IPCHostContext::amo_fetch_add(void *dst, T value, int pe) { template __host__ T IPCHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) { - return host_interface->amo_fetch_cas(dst, value, cond, pe, - context_window_info); + return host_interface->amo_fetch_cas(dst, value, cond, pe, context_window_info); } template @@ -96,23 +91,23 @@ __host__ void IPCHostContext::broadcast( template __host__ void IPCHostContext::broadcast(rocshmem_team_t team, T *dest, - const T *source, int nelems, - int pe_root) { + const T *source, int nelems, + int pe_root) { host_interface->broadcast(team, dest, source, nelems, pe_root); } template __host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce, - int pe_start, int log_pe_stride, - int pe_size, T *p_wrk, - long *p_sync) { // NOLINT(runtime/int) + int pe_start, int log_pe_stride, + int pe_size, T *p_wrk, + long *p_sync) { // NOLINT(runtime/int) host_interface->to_all(dest, source, nreduce, pe_start, log_pe_stride, pe_size, p_wrk, p_sync); } template __host__ int IPCHostContext::reduce(rocshmem_team_t team, T *dest, - const T *source, int nreduce) { + const T *source, int nreduce) { return host_interface->reduce(team, dest, source, nreduce); } @@ -123,8 +118,8 @@ __host__ void IPCHostContext::wait_until(T *ivars, int cmp, T val) { template __host__ void IPCHostContext::wait_until_all(T *ivars, size_t nelems, - const int* status, - int cmp, T val) { + const int* 
status, + int cmp, T val) { host_interface->wait_until_all(ivars, nelems, status, cmp, val, context_window_info); } @@ -137,31 +132,31 @@ __host__ size_t IPCHostContext::wait_until_any(T *ivars, size_t nelems, template __host__ size_t IPCHostContext::wait_until_some(T *ivars, size_t nelems, - size_t* indices, - const int* status, - int cmp, T val) { + size_t* indices, + const int* status, + int cmp, T val) { return host_interface->wait_until_some(ivars, nelems, indices, status, cmp, val, context_window_info); } template __host__ void IPCHostContext::wait_until_all_vector(T *ivars, size_t nelems, - const int* status, - int cmp, T* vals) { + const int* status, + int cmp, T* vals) { host_interface->wait_until_all_vector(ivars, nelems, status, cmp, vals, context_window_info); } template __host__ size_t IPCHostContext::wait_until_any_vector(T *ivars, size_t nelems, - const int* status, - int cmp, T* vals) { + const int* status, + int cmp, T* vals) { return host_interface->wait_until_any_vector(ivars, nelems, status, cmp, vals, context_window_info); } template __host__ size_t IPCHostContext::wait_until_some_vector(T *ivars, size_t nelems, - size_t* indices, - const int* status, - int cmp, T* vals) { + size_t* indices, + const int* status, + int cmp, T* vals) { return host_interface->wait_until_some_vector(ivars, nelems, indices, status, cmp, vals, context_window_info); } diff --git a/src/ipc/ipc_context_proxy.hpp b/src/ipc/ipc_context_proxy.hpp index 6b9d22204f..68fa15637f 100644 --- a/src/ipc/ipc_context_proxy.hpp +++ b/src/ipc/ipc_context_proxy.hpp @@ -26,7 +26,7 @@ #define LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_ -#include "../device_proxy.hpp" +#include "device_proxy.hpp" #include "backend_ipc.hpp" namespace rocshmem { diff --git a/src/ipc/ipc_team.cpp b/src/ipc/ipc_team.cpp index bdb8d75209..757bbe059c 100644 --- a/src/ipc/ipc_team.cpp +++ b/src/ipc/ipc_team.cpp @@ -24,14 +24,15 @@ #include "ipc_team.hpp" -#include "../backend_type.hpp" +#include "constants.hpp" +#include 
"backend_type.hpp" #include "backend_ipc.hpp" namespace rocshmem { IPCTeam::IPCTeam(Backend *backend, TeamInfo *team_info_parent, - TeamInfo *team_info_world, int num_pes, int my_pe, - MPI_Comm mpi_comm, int pool_index) + TeamInfo *team_info_world, int num_pes, int my_pe, + MPI_Comm mpi_comm, int pool_index) : Team(backend, team_info_parent, team_info_world, num_pes, my_pe, mpi_comm) { type = BackendType::IPC_BACKEND; @@ -39,18 +40,13 @@ IPCTeam::IPCTeam(Backend *backend, TeamInfo *team_info_parent, pool_index_ = pool_index; - barrier_pSync = - &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]); - reduce_pSync = - &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]); + barrier_pSync = &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]); + reduce_pSync = &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]); bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]); - alltoall_pSync = - &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]); + alltoall_pSync = &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]); - pWrk = reinterpret_cast(b->pWrk_pool) + - ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; - pAta = reinterpret_cast(b->pAta_pool) + - ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; + pWrk = reinterpret_cast(b->pWrk_pool) + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index; + pAta = reinterpret_cast(b->pAta_pool) + ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index; } IPCTeam::~IPCTeam() {} diff --git a/src/ipc/ipc_team.hpp b/src/ipc/ipc_team.hpp index 74039692dd..59a4200748 100644 --- a/src/ipc/ipc_team.hpp +++ b/src/ipc/ipc_team.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_IPC_TEAM_HPP_ #define LIBRARY_SRC_IPC_TEAM_HPP_ -#include "../team.hpp" +#include "team.hpp" namespace rocshmem { diff --git a/src/ipc_policy.hpp b/src/ipc_policy.hpp index 7f1e17c925..bae17faf08 100644 --- a/src/ipc_policy.hpp +++ 
b/src/ipc_policy.hpp @@ -186,7 +186,7 @@ class IpcOffImpl { /* * Select which one of our IPC policies to use at compile time. */ -#ifdef USE_IPC +#if defined(USE_IPC) typedef IpcOnImpl IpcImpl; #else typedef IpcOffImpl IpcImpl; diff --git a/src/memory/binner.hpp b/src/memory/binner.hpp index d62be5bd6b..a66d5682fb 100644 --- a/src/memory/binner.hpp +++ b/src/memory/binner.hpp @@ -30,7 +30,7 @@ #include #include -#include "../constants.hpp" +#include "constants.hpp" #include "bin.hpp" /** diff --git a/src/memory/dlmalloc.hpp b/src/memory/dlmalloc.hpp index f1d2d3b462..e427fbeae3 100644 --- a/src/memory/dlmalloc.hpp +++ b/src/memory/dlmalloc.hpp @@ -28,7 +28,7 @@ #include #include -#include "../constants.hpp" +#include "constants.hpp" #include "shmem_allocator_strategy.hpp" /** diff --git a/src/memory/memory_allocator.cpp b/src/memory/memory_allocator.cpp index b9eaa96fe4..0c0177f9f8 100644 --- a/src/memory/memory_allocator.cpp +++ b/src/memory/memory_allocator.cpp @@ -26,7 +26,7 @@ #include -#include "../util.hpp" +#include "util.hpp" namespace rocshmem { diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index 6805b86cbb..97a4e43ff4 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -25,9 +25,9 @@ #ifndef LIBRARY_SRC_MEMORY_NOTIFIER_HPP_ #define LIBRARY_SRC_MEMORY_NOTIFIER_HPP_ -#include "../device_proxy.hpp" -#include "../util.hpp" -#include "../atomic.hpp" +#include "device_proxy.hpp" +#include "util.hpp" +#include "atomic.hpp" namespace rocshmem { diff --git a/src/memory/pow2_bins.hpp b/src/memory/pow2_bins.hpp index 3514c9d927..904f2dd380 100644 --- a/src/memory/pow2_bins.hpp +++ b/src/memory/pow2_bins.hpp @@ -28,7 +28,7 @@ #include #include -#include "../constants.hpp" +#include "constants.hpp" #include "bin.hpp" #include "binner.hpp" #include "shmem_allocator_strategy.hpp" diff --git a/src/memory/remote_heap_info.hpp b/src/memory/remote_heap_info.hpp index 9918540668..29286d6dac 100644 --- a/src/memory/remote_heap_info.hpp +++ 
b/src/memory/remote_heap_info.hpp @@ -32,7 +32,7 @@ #include "hip_allocator.hpp" #include "window_info.hpp" -#include "../bootstrap/bootstrap.hpp" +#include "bootstrap/bootstrap.hpp" /** * @file remote_heap_info.hpp diff --git a/src/memory/symmetric_heap.hpp b/src/memory/symmetric_heap.hpp index f7d3cf4871..c823918c3b 100644 --- a/src/memory/symmetric_heap.hpp +++ b/src/memory/symmetric_heap.hpp @@ -45,7 +45,7 @@ #include "remote_heap_info.hpp" #include "single_heap.hpp" -#include "../bootstrap/bootstrap.hpp" +#include "bootstrap/bootstrap.hpp" namespace rocshmem { diff --git a/src/reverse_offload/backend_proxy.hpp b/src/reverse_offload/backend_proxy.hpp index 7efad56b77..0d16178eed 100644 --- a/src/reverse_offload/backend_proxy.hpp +++ b/src/reverse_offload/backend_proxy.hpp @@ -27,8 +27,8 @@ #include -#include "../device_proxy.hpp" -#include "../stats.hpp" +#include "device_proxy.hpp" +#include "stats.hpp" #include "queue.hpp" namespace rocshmem { diff --git a/src/reverse_offload/backend_ro.cpp b/src/reverse_offload/backend_ro.cpp index c696fc5004..d7188709cc 100644 --- a/src/reverse_offload/backend_ro.cpp +++ b/src/reverse_offload/backend_ro.cpp @@ -35,12 +35,12 @@ #include // NOLINT #include "rocshmem/rocshmem.hpp" -#include "../atomic_return.hpp" -#include "../backend_type.hpp" -#include "../context_incl.hpp" +#include "atomic_return.hpp" +#include "backend_type.hpp" +#include "context_incl.hpp" #include "mpi_transport.hpp" #include "ro_net_team.hpp" -#include "../util.hpp" +#include "util.hpp" namespace rocshmem { diff --git a/src/reverse_offload/backend_ro.hpp b/src/reverse_offload/backend_ro.hpp index 80b5993740..b2247367a6 100644 --- a/src/reverse_offload/backend_ro.hpp +++ b/src/reverse_offload/backend_ro.hpp @@ -28,10 +28,10 @@ #include #include -#include "../backend_bc.hpp" -#include "../containers/free_list_impl.hpp" -#include "../hdp_proxy.hpp" -#include "../memory/hip_allocator.hpp" +#include "backend_bc.hpp" +#include "containers/free_list_impl.hpp" 
+#include "hdp_proxy.hpp" +#include "memory/hip_allocator.hpp" #include "backend_proxy.hpp" #include "block_handle.hpp" #include "context_proxy.hpp" diff --git a/src/reverse_offload/block_handle.hpp b/src/reverse_offload/block_handle.hpp index 06ba09b8ed..0c80567514 100644 --- a/src/reverse_offload/block_handle.hpp +++ b/src/reverse_offload/block_handle.hpp @@ -25,9 +25,9 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_BLOCK_HANDLE_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_BLOCK_HANDLE_HPP_ -#include "../containers/atomic_wf_queue_impl.hpp" -#include "../hdp_policy.hpp" -#include "../ipc_policy.hpp" +#include "containers/atomic_wf_queue_impl.hpp" +#include "hdp_policy.hpp" +#include "ipc_policy.hpp" #include "profiler.hpp" #include "queue.hpp" diff --git a/src/reverse_offload/context_proxy.hpp b/src/reverse_offload/context_proxy.hpp index 91bbbdd3bf..3394c39847 100644 --- a/src/reverse_offload/context_proxy.hpp +++ b/src/reverse_offload/context_proxy.hpp @@ -26,8 +26,8 @@ #define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_ #include "rocshmem/rocshmem.hpp" -#include "../device_proxy.hpp" -#include "../memory/hip_allocator.hpp" +#include "device_proxy.hpp" +#include "memory/hip_allocator.hpp" #include "context_ro_device.hpp" namespace rocshmem { diff --git a/src/reverse_offload/context_ro_device.cpp b/src/reverse_offload/context_ro_device.cpp index 4e1b789a13..cd589b2492 100644 --- a/src/reverse_offload/context_ro_device.cpp +++ b/src/reverse_offload/context_ro_device.cpp @@ -34,12 +34,12 @@ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "rocshmem/rocshmem.hpp" -#include "../backend_type.hpp" -#include "../hdp_policy.hpp" +#include "backend_type.hpp" +#include "hdp_policy.hpp" #include "backend_proxy.hpp" #include "backend_ro.hpp" #include "ro_net_team.hpp" -#include "../sync/abql_block_mutex.hpp" +#include "sync/abql_block_mutex.hpp" namespace rocshmem { diff --git a/src/reverse_offload/context_ro_device.hpp 
b/src/reverse_offload/context_ro_device.hpp index 1b06f8c0c7..3fc7b31ae9 100644 --- a/src/reverse_offload/context_ro_device.hpp +++ b/src/reverse_offload/context_ro_device.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_DEVICE_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_DEVICE_HPP_ -#include "../context.hpp" +#include "context.hpp" #include "block_handle.hpp" #include "commands_types.hpp" #include "queue.hpp" diff --git a/src/reverse_offload/context_ro_host.cpp b/src/reverse_offload/context_ro_host.cpp index 2d24c73085..8692b8dfe5 100644 --- a/src/reverse_offload/context_ro_host.cpp +++ b/src/reverse_offload/context_ro_host.cpp @@ -27,9 +27,9 @@ #include #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../backend_type.hpp" -#include "../context_incl.hpp" -#include "../host/host.hpp" +#include "backend_type.hpp" +#include "context_incl.hpp" +#include "host/host.hpp" #include "backend_ro.hpp" namespace rocshmem { diff --git a/src/reverse_offload/context_ro_host.hpp b/src/reverse_offload/context_ro_host.hpp index 13d06f94a5..4e0719a84b 100644 --- a/src/reverse_offload/context_ro_host.hpp +++ b/src/reverse_offload/context_ro_host.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_HOST_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_HOST_HPP_ -#include "../context.hpp" +#include "context.hpp" namespace rocshmem { diff --git a/src/reverse_offload/context_ro_tmpl_host.hpp b/src/reverse_offload/context_ro_tmpl_host.hpp index 02dc788263..eb2b0efd51 100644 --- a/src/reverse_offload/context_ro_tmpl_host.hpp +++ b/src/reverse_offload/context_ro_tmpl_host.hpp @@ -26,7 +26,7 @@ #define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_ #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../host/host_templates.hpp" +#include "host/host_templates.hpp" namespace rocshmem { diff --git a/src/reverse_offload/mpi_transport.cpp b/src/reverse_offload/mpi_transport.cpp index 
93f722fe3c..27ce188e3c 100644 --- a/src/reverse_offload/mpi_transport.cpp +++ b/src/reverse_offload/mpi_transport.cpp @@ -30,10 +30,10 @@ #include #include -#include "../host/host.hpp" +#include "host/host.hpp" #include "backend_ro.hpp" #include "ro_net_team.hpp" -#include "../util.hpp" +#include "util.hpp" namespace rocshmem { diff --git a/src/reverse_offload/profiler.hpp b/src/reverse_offload/profiler.hpp index cd043e1eba..c6b0e3ad9d 100644 --- a/src/reverse_offload/profiler.hpp +++ b/src/reverse_offload/profiler.hpp @@ -29,9 +29,9 @@ #include #include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) -#include "../device_proxy.hpp" -#include "../memory/../memory/hip_allocator.hpp" -#include "../stats.hpp" +#include "device_proxy.hpp" +#include "memory/../memory/hip_allocator.hpp" +#include "stats.hpp" namespace rocshmem { diff --git a/src/reverse_offload/queue.hpp b/src/reverse_offload/queue.hpp index 89807cc7d5..16d8eb1b76 100644 --- a/src/reverse_offload/queue.hpp +++ b/src/reverse_offload/queue.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_HPP_ -#include "../hdp_proxy.hpp" +#include "hdp_proxy.hpp" #include "queue_proxy.hpp" #include "queue_desc_proxy.hpp" diff --git a/src/reverse_offload/queue_desc_proxy.hpp b/src/reverse_offload/queue_desc_proxy.hpp index a5ca2a207a..c9d90aa721 100644 --- a/src/reverse_offload/queue_desc_proxy.hpp +++ b/src/reverse_offload/queue_desc_proxy.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_DESC_PROXY_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_DESC_PROXY_HPP_ -#include "../device_proxy.hpp" +#include "device_proxy.hpp" namespace rocshmem { diff --git a/src/reverse_offload/queue_proxy.hpp b/src/reverse_offload/queue_proxy.hpp index bfa60e690b..d6a2b9be9e 100644 --- a/src/reverse_offload/queue_proxy.hpp +++ b/src/reverse_offload/queue_proxy.hpp @@ -27,13 +27,13 @@ #include -#include "../atomic_return.hpp" -#include 
"../device_proxy.hpp" -#include "../hdp_policy.hpp" -#include "../ipc_policy.hpp" +#include "atomic_return.hpp" +#include "device_proxy.hpp" +#include "hdp_policy.hpp" +#include "ipc_policy.hpp" #include "commands_types.hpp" #include "profiler.hpp" -#include "../sync/abql_block_mutex.hpp" +#include "sync/abql_block_mutex.hpp" namespace rocshmem { diff --git a/src/reverse_offload/ro_net_team.cpp b/src/reverse_offload/ro_net_team.cpp index cae0f5d92d..f04872826a 100644 --- a/src/reverse_offload/ro_net_team.cpp +++ b/src/reverse_offload/ro_net_team.cpp @@ -24,7 +24,7 @@ #include "ro_net_team.hpp" -#include "../backend_type.hpp" +#include "backend_type.hpp" #include "backend_ro.hpp" namespace rocshmem { diff --git a/src/reverse_offload/ro_net_team.hpp b/src/reverse_offload/ro_net_team.hpp index 5ffcf8e5f3..8af113b701 100644 --- a/src/reverse_offload/ro_net_team.hpp +++ b/src/reverse_offload/ro_net_team.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_TEAM_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_TEAM_HPP_ -#include "../team.hpp" +#include "team.hpp" #define MAX_ATA_BUFF_SIZE (1024 * 1024 * 128) diff --git a/src/reverse_offload/ro_team_proxy.hpp b/src/reverse_offload/ro_team_proxy.hpp index 06bb43d663..28e620f3e5 100644 --- a/src/reverse_offload/ro_team_proxy.hpp +++ b/src/reverse_offload/ro_team_proxy.hpp @@ -27,7 +27,7 @@ #include -#include "../device_proxy.hpp" +#include "device_proxy.hpp" #include "ro_net_team.hpp" #include "team_info_proxy.hpp" diff --git a/src/reverse_offload/team_info_proxy.hpp b/src/reverse_offload/team_info_proxy.hpp index 1ad25c0679..e73ca98d1b 100644 --- a/src/reverse_offload/team_info_proxy.hpp +++ b/src/reverse_offload/team_info_proxy.hpp @@ -25,8 +25,8 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_TEAM_INFO_PROXY_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_TEAM_INFO_PROXY_HPP_ -#include "../device_proxy.hpp" -#include "../team.hpp" +#include "device_proxy.hpp" +#include "team.hpp" namespace rocshmem { diff --git 
a/src/reverse_offload/window_proxy.hpp b/src/reverse_offload/window_proxy.hpp index 3883628ebb..1492e6d3ab 100644 --- a/src/reverse_offload/window_proxy.hpp +++ b/src/reverse_offload/window_proxy.hpp @@ -25,8 +25,8 @@ #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_WINDOW_PROXY_HPP_ #define LIBRARY_SRC_REVERSE_OFFLOAD_WINDOW_PROXY_HPP_ -#include "../device_proxy.hpp" -#include "../memory/window_info.hpp" +#include "device_proxy.hpp" +#include "memory/window_info.hpp" #include "mpi_transport.hpp" namespace rocshmem { diff --git a/src/rocshmem.cpp b/src/rocshmem.cpp index a5d69cf179..d1a8e38b2d 100644 --- a/src/rocshmem.cpp +++ b/src/rocshmem.cpp @@ -35,12 +35,17 @@ #include "backend_bc.hpp" #include "context_incl.hpp" -#ifdef USE_RO +#if defined(USE_RO) #include "reverse_offload/backend_ro.hpp" #include "reverse_offload/context_ro_tmpl_host.hpp" -#else +#elif defined(USE_IPC) #include "ipc/backend_ipc.hpp" #include "ipc/context_ipc_tmpl_host.hpp" +#elif defined(USE_GDA) +#include "gda/backend_gda.hpp" +#include "gda/context_gda_tmpl_host.hpp" +#else +#error "Select one backend among USE_RO, USE_IPC, USE_GDA" #endif #include "mpi_instance.hpp" #include "team.hpp" @@ -89,12 +94,15 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; mpi_instance = new MPIInstance(comm); -#ifdef USE_RO +#if defined(USE_RO) CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend))); backend = new (backend) ROBackend(comm); -#else +#elif defined(USE_IPC) CHECK_HIP(hipHostMalloc(&backend, sizeof(IPCBackend))); backend = new (backend) IPCBackend(comm); +#elif defined(USE_GDA) + CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend))); + backend = new (backend) GDABackend(comm); #endif if (!backend) { @@ -166,12 +174,15 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; rocm_init(); -#ifdef USE_RO +#if defined(USE_RO) printf("RO Backend requires MPI library to be initialized, even when using uniqueId initializations!\n"); abort(); -#else +#elif defined(USE_IPC) CHECK_HIP(hipHostMalloc(&backend, sizeof(IPCBackend))); 
backend = new (backend) IPCBackend(bootstrap); +#elif defined(USE_GDA) + CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend))); + backend = new (backend) GDABackend(bootstrap); #endif if (!backend) { diff --git a/src/rocshmem_gpu.cpp b/src/rocshmem_gpu.cpp index 2e3fc0821d..8abe25e7ec 100644 --- a/src/rocshmem_gpu.cpp +++ b/src/rocshmem_gpu.cpp @@ -51,13 +51,17 @@ #include "templates.hpp" #include "util.hpp" -#ifdef USE_RO +#if defined(USE_RO) #include "reverse_offload/context_ro_tmpl_device.hpp" -#else -#ifdef ENABLE_IPC_BITCODE - #include "ipc/backend_ipc.hpp" -#endif +#elif defined(USE_IPC) +# if defined(ENABLE_IPC_BITCODE) +# include "ipc/backend_ipc.hpp" +# endif #include "ipc/context_ipc_tmpl_device.hpp" +#elif defined(USE_GDA) +#include "gda/context_gda_tmpl_device.hpp" +#else +#error "Select one backend among USE_RO, USE_IPC, USE_GDA" #endif /****************************************************************************** @@ -70,7 +74,7 @@ __device__ rocshmem_ctx_t __attribute__((visibility("default"))) ROCSHMEM_CTX_D __constant__ Backend *device_backend_proxy; -#ifdef ENABLE_IPC_BITCODE +#if defined(ENABLE_IPC_BITCODE) typedef IPCContext ContextTy; #else typedef Context ContextTy; diff --git a/src/sync/abql_block_mutex.cpp b/src/sync/abql_block_mutex.cpp index cb23925d44..9a34cbb1bf 100644 --- a/src/sync/abql_block_mutex.cpp +++ b/src/sync/abql_block_mutex.cpp @@ -22,9 +22,9 @@ * IN THE SOFTWARE. 
*****************************************************************************/ -#include "../sync/abql_block_mutex.hpp" +#include "sync/abql_block_mutex.hpp" -#include "../util.hpp" +#include "util.hpp" namespace rocshmem { diff --git a/src/sync/abql_block_mutex.hpp b/src/sync/abql_block_mutex.hpp index 7bf95edc4a..bcd51aba01 100644 --- a/src/sync/abql_block_mutex.hpp +++ b/src/sync/abql_block_mutex.hpp @@ -25,7 +25,7 @@ #ifndef LIBRARY_SRC_SYNC_ABQL_BLOCK_MUTEX_HPP_ #define LIBRARY_SRC_SYNC_ABQL_BLOCK_MUTEX_HPP_ -#include "../device_proxy.hpp" +#include "device_proxy.hpp" #include diff --git a/src/team.cpp b/src/team.cpp index ab06c22d74..e26420e408 100644 --- a/src/team.cpp +++ b/src/team.cpp @@ -46,6 +46,10 @@ IPCTeam* get_internal_ipc_team(rocshmem_team_t team) { return reinterpret_cast(team); } +GDATeam* get_internal_gda_team(rocshmem_team_t team) { + return reinterpret_cast(team); +} + __host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe, rocshmem_team_t dst_team) { if (src_team == ROCSHMEM_TEAM_INVALID || diff --git a/src/team.hpp b/src/team.hpp index bf9bc764ac..343424da39 100644 --- a/src/team.hpp +++ b/src/team.hpp @@ -36,6 +36,7 @@ class Backend; class Team; class ROTeam; class IPCTeam; +class GDATeam; class TeamInfo { public: @@ -164,6 +165,8 @@ ROTeam* get_internal_ro_team(rocshmem_team_t team); IPCTeam* get_internal_ipc_team(rocshmem_team_t team); +GDATeam* get_internal_gda_team(rocshmem_team_t team); + __host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe, rocshmem_team_t dst_team); diff --git a/src/tools/rocshmem_info.cpp b/src/tools/rocshmem_info.cpp index 738b5b4f98..73040b6451 100644 --- a/src/tools/rocshmem_info.cpp +++ b/src/tools/rocshmem_info.cpp @@ -1,5 +1,5 @@ -#include "../util.hpp" +#include "util.hpp" #include #include diff --git a/src/util.hpp b/src/util.hpp index c3d059edf9..cdd70932fa 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -31,24 +31,67 @@ #include -#include "assembly.hpp" 
#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir) #include "constants.hpp" +#include "assembly.hpp" namespace rocshmem { -#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) -#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) +#define LIKELY(X) __builtin_expect(X, 1) +#define UNLIKELY(X) __builtin_expect(X, 0) -#define CHECK_HIP(cmd) \ - { \ - hipError_t error = cmd; \ - if (error != hipSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \ - error, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } +/** + * @name CHECK_NNULL + * @brief Checks if value is NOT null. If it is null print errno and exit the program. + * + * @param[in] value Value to check + * @param[in] fn_str String describing checked function + * + */ +#define CHECK_NNULL(value, fn_str) do { \ + if (UNLIKELY(nullptr == (value))) { \ + fprintf(stderr, \ + "Error: %s: %s (%d) at RocSHMEM::%s:%d\n", \ + fn_str, strerror(errno), errno, \ + __FILE__, __LINE__); \ + abort(); \ + } \ +} while(0) + +/** + * @name CHECK_ZERO + * @brief Checks if value is zero. If it is not zero print errno and exit the program. + * + * @param[in] value Value to check + * @param[in] fn_str String describing checked function + * + */ +#define CHECK_ZERO(value, fn_str) do { \ + if (UNLIKELY(0 != (value))) { \ + fprintf(stderr, \ + "Error: %s: %s (%d) at RocSHMEM::%s:%d\n", \ + fn_str, strerror(errno), errno, \ + __FILE__, __LINE__); \ + abort(); \ + } \ +} while(0) + +/** + * @name CHECK_HIP + * @brief Checks if HIP command succeeded. If it is not successful then it exits the program. + * + * @param[in] instr HIP function to run and check + * + */ +#define CHECK_HIP(instr) do { \ + hipError_t error = (instr); \ + if (error != hipSuccess) { \ + fprintf(stderr, \ + "Error: " #instr ": %s (%d) at RocSHMEM::%s:%d\n", \ + hipGetErrorString(error), error, __FILE__, __LINE__); \ + abort(); \ + } \ +} while(0) #ifdef DEBUG #define DPRINTF(...) 
\ @@ -132,7 +175,7 @@ __device__ __forceinline__ int get_flat_grid_id() { * Returns the flattened thread index of the calling thread within the grid. */ __device__ __forceinline__ int get_flat_id() { - return get_flat_grid_id() * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + get_flat_block_id(); + return get_flat_grid_id() * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + get_flat_block_id(); } /* @@ -142,6 +185,51 @@ __device__ __forceinline__ bool is_thread_zero_in_wave() { return (get_flat_block_id() % WF_SIZE) == 0; } +__device__ __forceinline__ uint64_t get_active_lane_mask() { + return __ballot(true); +} + +__device__ __forceinline__ unsigned int get_active_lane_count(uint64_t active_lane_mask) { + return __popcll(active_lane_mask); +} + +__device__ __forceinline__ unsigned int get_active_lane_count() { + return get_active_lane_count(get_active_lane_mask()); +} + +__device__ __forceinline__ unsigned int get_active_lane_num(uint64_t active_lane_mask) { + return __popcll(active_lane_mask & __lanemask_lt()); +} + +__device__ __forceinline__ unsigned int get_active_lane_num() { + return get_active_lane_num(get_active_lane_mask()); +} + +__device__ __forceinline__ int get_first_active_lane_id(uint64_t active_lane_mask) { + return __ffsll((unsigned long long int)active_lane_mask) - 1; +} + +__device__ __forceinline__ int get_first_active_lane_id() { + return get_first_active_lane_id(get_active_lane_mask()); +} + +__device__ __forceinline__ bool is_first_active_lane(uint64_t active_lane_mask) { + return get_active_lane_num(active_lane_mask) == 0; +} + +__device__ __forceinline__ bool is_first_active_lane() { + return is_first_active_lane(get_active_lane_mask()); +} + +__device__ __forceinline__ bool is_last_active_lane(uint64_t active_lane_mask) { + return get_active_lane_num(active_lane_mask) == get_active_lane_count(active_lane_mask) - 1; +} + +__device__ __forceinline__ bool is_last_active_lane() { + return is_last_active_lane(get_active_lane_mask()); +} + 
+ extern __constant__ int* print_lock; template @@ -168,6 +256,9 @@ __device__ void gpu_dprintf(const char* fmt, const Args&... args) { } } +#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) +#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) + __device__ __forceinline__ void memcpy(void* dst, void* src, size_t size) { uint8_t* dst_bytes{static_cast(dst)}; uint8_t* src_bytes{static_cast(src)}; @@ -264,8 +355,7 @@ __device__ __forceinline__ void memcpy_wave(void* dst, void* src, size_t size) { int rocm_init(); -void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr, - int gpu_id); +void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr, int gpu_id); class rocshmem_env_config { public: diff --git a/tests/functional_tests/amo_standard_tester.cpp b/tests/functional_tests/amo_standard_tester.cpp index 028cb4447e..c05f1d780e 100644 --- a/tests/functional_tests/amo_standard_tester.cpp +++ b/tests/functional_tests/amo_standard_tester.cpp @@ -23,6 +23,7 @@ *****************************************************************************/ #include "amo_standard_tester.hpp" +#include "tester.hpp" #include #include @@ -101,10 +102,7 @@ void AMOStandardTester::verifyResults(size_t size) { break; } - int fetch_op = (_type == AMO_FAddTestType || _type == AMO_FIncTestType || - _type == AMO_FCswapTestType) - ? 1 - : 0; + int fetch_op = (_type == AMO_FAddTestType || _type == AMO_FIncTestType || _type == AMO_FCswapTestType) ? 
1: 0; if (fetch_op == 1) { ret = *std::max_element(_ret_val, _ret_val + args.num_wgs); diff --git a/tests/functional_tests/tester.hpp b/tests/functional_tests/tester.hpp index ddb65c6508..ea2bb1a728 100644 --- a/tests/functional_tests/tester.hpp +++ b/tests/functional_tests/tester.hpp @@ -187,14 +187,15 @@ class Tester { hipEvent_t stop_event; }; -#define CHECK_HIP(cmd) \ - { \ - hipError_t error = cmd; \ - if (error != hipSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \ - error, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } +//TODO remove altogether? There is a small difference in print format +#undef CHECK_HIP +#define CHECK_HIP(instr) do { \ + hipError_t error = (instr); \ + if (error != hipSuccess) { \ + fprintf(stderr, "error: " #instr ": %s (%d) at %s:%d\n", \ + hipGetErrorString(error), error, __FILE__, __LINE__); \ + abort(); \ + } \ +} while(0) #endif /* _TESTER_HPP */