diff --git a/CMakeLists.txt b/CMakeLists.txt
index 43a0c293cf..47c9d3f620 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,8 +41,9 @@ endif()
 ###############################################################################
 option(DEBUG "Enable debug trace" OFF)
 option(PROFILE "Enable statistics and timing support" OFF)
-option(USE_RO "Enable RO conduit." ON)
+option(USE_RO "Enable RO conduit" ON)
 option(USE_IPC "Enable IPC support (using HIP)" OFF)
+option(USE_GDA "Enable GDA conduit" OFF)
 option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
 option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
 option(USE_HEAP_DEVICE_FINEGRAIN "Heap uses GPU memory in finegrain mode" ON)
@@ -68,6 +69,8 @@ option(BUILD_TOOLS "Build binary tools (e.g., rocshmem_info)" ON)
 option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
 option(BUILD_CODE_COVERAGE "Build with code coverage flags (gcc only)" OFF)
 
+option(GDA_IONIC "Build for AMD Pensando IONIC RDMA provider" OFF)
+option(GDA_BNXT "Build for Broadcom" OFF)
 
 ###############################################################################
 # PROJECT
@@ -162,7 +165,6 @@ if (NOT BUILD_TESTS_ONLY)
   target_compile_options(
     ${PROJECT_NAME}
     PUBLIC
-      ${offload_flags}
       -fgpu-rdc
   )
 
@@ -172,6 +174,7 @@ if (NOT BUILD_TESTS_ONLY)
       $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
       $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>            # rocshmem_config.h
       $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include/rocshmem>   # rocshmem_config.h from rocshmem.hpp
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
       $<INSTALL_INTERFACE:include>
   )
 
diff --git a/README.md b/README.md
index 0754d4a6df..62c9e46f47 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,8 @@ code complexity and enables more fine-grained communication/computation
 overlap than traditional host-driven networking.
 rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories.
 
-There are currently two backends for rocSHMEM;
-IPC and Reverse Offload (RO).
+There are currently three backends for rocSHMEM:
+IPC, Reverse Offload (RO), and GPU-IB.
 The backends primarily differ in their implementations of intra-kernel networking.
 
 The IPC backend implements communication primitives using load/store operations issued from the GPU.
diff --git a/cmake/FindIBVerbs.cmake b/cmake/FindIBVerbs.cmake
new file mode 100644
index 0000000000..6c4d631262
--- /dev/null
+++ b/cmake/FindIBVerbs.cmake
@@ -0,0 +1,83 @@
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+find_package(PkgConfig QUIET)
+if (PkgConfig_FOUND)
+if (IBVerbs_ROOT )
+  # We don't use IBVerbs_DIR as that variable is reserved for locating an IBVerbs CMake package config file
+  set(ENV{PKG_CONFIG_PATH} "${IBVerbs_ROOT}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
+endif()
+pkg_check_modules(PC_IBVerbs QUIET libibverbs)
+endif()
+
+find_path(IBVerbs_INCLUDE_DIR infiniband/verbs.h
+  HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS}
+  PATH_SUFFIXES include
+)
+
+find_library(IBVerbs_LIBRARY
+  NAMES ibverbs libibverbs
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+
+if (GDA_IONIC)
+find_library(IBVerbs_PROVIDER_LIBRARY
+  NAMES ionic libionic
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+elseif (GDA_BNXT)
+find_library(IBVerbs_PROVIDER_LIBRARY
+  NAMES bnxt_re libbnxt_re
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+else()
+find_library(IBVerbs_PROVIDER_LIBRARY
+  NAMES mlx5 libmlx5
+  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+endif()
+
+find_package_handle_standard_args(IBVerbs DEFAULT_MSG
+  IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY
+)
+mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY)
+
+if (IBVerbs_FOUND AND NOT TARGET IBVerbs::verbs)  # skip when an earlier find_package(IBVerbs) already created the targets
+add_library(IBVerbs::verbs UNKNOWN IMPORTED)
+set_target_properties(IBVerbs::verbs PROPERTIES
+  IMPORTED_LOCATION "${IBVerbs_LIBRARY}"
+  INTERFACE_COMPILE_OPTIONS "${PC_IBVerbs_CFLAGS_OTHER}"
+  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}"
+)
+add_library(IBVerbs::verbs_provider UNKNOWN IMPORTED)
+set_target_properties(IBVerbs::verbs_provider PROPERTIES
+  IMPORTED_LOCATION "${IBVerbs_PROVIDER_LIBRARY}"
+  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}"  # this module searches for no separate provider include dir
+)
+target_link_libraries(IBVerbs::verbs INTERFACE IBVerbs::verbs_provider)  # consumers of IBVerbs::verbs also link the provider
+endif()
diff --git a/cmake/rocshmem_config.h.in b/cmake/rocshmem_config.h.in
index 36c5aeae24..644a87a69e 100644
--- a/cmake/rocshmem_config.h.in
+++ b/cmake/rocshmem_config.h.in
@@ -26,6 +26,7 @@
 #cmakedefine PROFILE
 #cmakedefine USE_RO
 #cmakedefine USE_IPC
+#cmakedefine USE_GDA
 #cmakedefine USE_THREADS
 #cmakedefine USE_SHARED_CTX
 #cmakedefine USE_WF_COAL
@@ -41,3 +42,5 @@
 #cmakedefine USE_SINGLE_NODE
 #cmakedefine USE_HDP_FLUSH
 #cmakedefine USE_HDP_FLUSH_HOST_SIDE
+#cmakedefine GDA_IONIC
+#cmakedefine GDA_BNXT
diff --git a/cmake/setup_project.cmake b/cmake/setup_project.cmake
index 5658aab4b9..df3a98376c 100644
--- a/cmake/setup_project.cmake
+++ b/cmake/setup_project.cmake
@@ -75,3 +75,4 @@ set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb")
 
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
diff --git a/scripts/build_configs/gda b/scripts/build_configs/gda
new file mode 100755
index 0000000000..c339b7155e
--- /dev/null
+++ b/scripts/build_configs/gda
@@ -0,0 +1,49 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+src_path=$(dirname "$(realpath "$0")")/../../  # repository root, two levels above the invoked script
+
+cmake \
+    -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \
+    -DCMAKE_VERBOSE_MAKEFILE=OFF \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DBUILD_FUNCTIONAL_TESTS=ON \
+    -DBUILD_UNIT_TESTS=ON \
+    -DDEBUG=OFF \
+    -DPROFILE=OFF \
+    -DUSE_GDA=ON \
+    -DUSE_RO=OFF \
+    -DUSE_IPC=OFF \
+    -DUSE_THREADS=OFF \
+    -DUSE_WF_COAL=OFF \
+    -DUSE_HDP_FLUSH=OFF \
+    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
+    "$@" "$src_path"  # "$@" forwards each caller argument intact, even with embedded spaces
+cmake --build . --parallel 8
+cmake --install .
diff --git a/scripts/build_configs/gda_bnxt b/scripts/build_configs/gda_bnxt
new file mode 100755
index 0000000000..77f3f29391
--- /dev/null
+++ b/scripts/build_configs/gda_bnxt
@@ -0,0 +1,30 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+script_path=$(dirname "$(realpath "$0")")  # directory holding this wrapper and the shared gda script
+
+source "$script_path/gda" -DGDA_BNXT=ON "$@"  # "$@" forwards each caller argument intact
diff --git a/scripts/build_configs/gda_ionic b/scripts/build_configs/gda_ionic
new file mode 100755
index 0000000000..cd91bcc6a1
--- /dev/null
+++ b/scripts/build_configs/gda_ionic
@@ -0,0 +1,30 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+script_path=$(dirname "$(realpath "$0")")  # directory holding this wrapper and the shared gda script
+
+source "$script_path/gda" -DGDA_IONIC=ON "$@"  # "$@" forwards each caller argument intact
diff --git a/scripts/build_configs/gda_mlx5 b/scripts/build_configs/gda_mlx5
new file mode 100755
index 0000000000..9337aed9d1
--- /dev/null
+++ b/scripts/build_configs/gda_mlx5
@@ -0,0 +1,30 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+set -e
+
+script_path=$(dirname "$(realpath "$0")")  # directory holding this wrapper and the shared gda script
+
+exec "$script_path/gda" "$@"  # "$@" forwards each caller argument intact
diff --git a/scripts/build_configs/ipc_single b/scripts/build_configs/ipc_single
index 83a82d4756..5432710399 100755
--- a/scripts/build_configs/ipc_single
+++ b/scripts/build_configs/ipc_single
@@ -1,3 +1,4 @@
+#!/bin/bash
 ###############################################################################
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 #
@@ -22,37 +23,28 @@
 # IN THE SOFTWARE.
 ###############################################################################
 
-#!/bin/bash
-
 set -e
 
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
 src_path=$(dirname "$(realpath $0)")/../../
 
 cmake \
     -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DBUILD_FUNCTIONAL_TESTS=ON \
+    -DBUILD_UNIT_TESTS=ON \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
+    -DUSE_GDA=OFF \
     -DUSE_RO=OFF \
     -DUSE_IPC=ON \
     -DUSE_THREADS=OFF \
     -DUSE_WF_COAL=OFF \
-    -DUSE_SINGLE_NODE=ON \
     -DUSE_HDP_FLUSH=OFF \
     -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-    -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \
-    -DBUILD_FUNCTIONAL_TESTS=ON \
-    -DBUILD_UNIT_TESTS=ON \
-    $src_path
+    -DUSE_SINGLE_NODE=ON \
+    "$@" "$src_path"
 cmake --build . --parallel 8
 cmake --install .
diff --git a/scripts/build_configs/ipc_tests_only b/scripts/build_configs/ipc_tests_only
index 41fde20882..f219ba6165 100755
--- a/scripts/build_configs/ipc_tests_only
+++ b/scripts/build_configs/ipc_tests_only
@@ -1,3 +1,4 @@
+#!/bin/bash
 ###############################################################################
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 #
@@ -22,27 +23,22 @@
 # IN THE SOFTWARE.
 ###############################################################################
 
-#!/bin/bash
-
 set -e
 
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
 src_path=$(dirname "$(realpath $0)")/../../
 
-# If as specific rocSHMEM version is required, the recommended approach
-# is to set environment variable 'rocshmem_ROOT'
 cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
+    -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DBUILD_FUNCTIONAL_TESTS=ON \
+    -DBUILD_EXAMPLES=ON \
+    -DBUILD_UNIT_TESTS=OFF \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
+    -DUSE_GDA=OFF \
     -DUSE_RO=OFF \
     -DUSE_IPC=ON \
     -DUSE_THREADS=OFF \
@@ -50,10 +46,6 @@ cmake \
     -DUSE_SINGLE_NODE=ON \
     -DUSE_HDP_FLUSH=OFF \
     -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-    -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \
     -DBUILD_TESTS_ONLY=ON \
-    -DBUILD_FUNCTIONAL_TESTS=ON \
-    -DBUILD_EXAMPLES=ON \
-    -DBUILD_UNIT_TESTS=OFF \
-    $src_path
+    "$@" "$src_path"
 cmake --build . --parallel 8
diff --git a/scripts/build_configs/ro_ipc b/scripts/build_configs/ro_ipc
index b39438e665..09e158577e 100755
--- a/scripts/build_configs/ro_ipc
+++ b/scripts/build_configs/ro_ipc
@@ -1,3 +1,4 @@
+#!/bin/bash
 ###############################################################################
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 #
@@ -22,35 +23,27 @@
 # IN THE SOFTWARE.
 ###############################################################################
 
-#!/bin/bash
-
 set -e
 
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
 src_path=$(dirname "$(realpath $0)")/../../
 
 cmake \
     -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DBUILD_FUNCTIONAL_TESTS=ON \
+    -DBUILD_UNIT_TESTS=ON \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
+    -DUSE_GDA=OFF \
+    -DUSE_RO=ON \
     -DUSE_IPC=ON \
     -DUSE_THREADS=OFF \
     -DUSE_WF_COAL=OFF \
     -DUSE_HDP_FLUSH=OFF \
     -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-    -DUSE_RO=ON \
-    -DBUILD_FUNCTIONAL_TESTS=ON \
-    -DBUILD_UNIT_TESTS=ON \
-    $src_path
+    "$@" "$src_path"
 cmake --build . --parallel 8
 cmake --install .
diff --git a/scripts/build_configs/ro_net b/scripts/build_configs/ro_net
index abdcffcdd4..7757f4d58f 100755
--- a/scripts/build_configs/ro_net
+++ b/scripts/build_configs/ro_net
@@ -1,3 +1,4 @@
+#!/bin/bash
 ###############################################################################
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 #
@@ -22,35 +23,27 @@
 # IN THE SOFTWARE.
 ###############################################################################
 
-#!/bin/bash
-
 set -e
 
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
 src_path=$(dirname "$(realpath $0)")/../../
 
 cmake \
     -DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DBUILD_FUNCTIONAL_TESTS=ON \
+    -DBUILD_UNIT_TESTS=ON \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
+    -DUSE_GDA=OFF \
+    -DUSE_RO=ON \
     -DUSE_IPC=OFF \
     -DUSE_THREADS=OFF \
     -DUSE_WF_COAL=OFF \
     -DUSE_HDP_FLUSH=OFF \
     -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-    -DUSE_RO=ON \
-    -DBUILD_FUNCTIONAL_TESTS=ON \
-    -DBUILD_UNIT_TESTS=ON \
-    $src_path
+    "$@" "$src_path"
 cmake --build . --parallel 8
 cmake --install .
diff --git a/scripts/build_configs/ro_net_debug b/scripts/build_configs/ro_net_debug
index a7c42ba234..e77c8a6f89 100755
--- a/scripts/build_configs/ro_net_debug
+++ b/scripts/build_configs/ro_net_debug
@@ -1,3 +1,4 @@
+#!/bin/bash
 ###############################################################################
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 #
@@ -22,33 +23,8 @@
 # IN THE SOFTWARE.
 ###############################################################################
 
-#!/bin/bash
-
 set -e
 
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
+script_path=$(dirname "$(realpath $0)")
 
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    -DUSE_HDP_FLUSH=OFF \
-    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-    -DUSE_RO=ON \
-    -DBUILD_FUNCTIONAL_TESTS=ON \
-    -DBUILD_UNIT_TESTS=ON \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
+BUILD_TYPE=Debug source "$script_path/ro_net" "$@"
diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh
index 3c4902134f..269092ef72 100755
--- a/scripts/functional_tests/driver.sh
+++ b/scripts/functional_tests/driver.sh
@@ -195,6 +195,7 @@ TestRMAPut() {
   ExecTest  "waveput"          2       2            128       1048576
   ExecTest  "waveput"          2       16           128       8
 
+  ExecTest  "defaultctxput"    2       4            128       1024
   ExecTest  "teamctxput"       2       4            128       1024
   ExecTest  "teamctxput"       2       16           256       1024
 
@@ -226,6 +227,7 @@ TestRMAPut() {
   ExecTest  "waveputnbi"       2       2            128       1048576
   ExecTest  "waveputnbi"       2       16           128       8
 
+  ExecTest  "defaultctxputnbi" 2       4            128       1024
   ExecTest  "teamctxputnbi"    2       4            128       1024
   ExecTest  "teamctxputnbi"    2       16           256       1024
 }
@@ -250,6 +252,7 @@ TestRMAGet() {
   ExecTest  "waveget"          2       2            128       1048576
   ExecTest  "waveget"          2       16           128       8
 
+  ExecTest  "defaultctxget"    2       4            128       1024
   ExecTest  "teamctxget"       2       4            128       1024
   ExecTest  "teamctxget"       2       16           256       1024
 
@@ -276,6 +279,7 @@ TestRMAGet() {
   ExecTest  "wavegetnbi"       2       2            128       1048576
   ExecTest  "wavegetnbi"       2       16           128       8
 
+  ExecTest  "defaultctxgetnbi" 2       4            128       1024
   ExecTest  "teamctxgetnbi"    2       4            128       1024
   ExecTest  "teamctxgetnbi"    2       16           256       1024
 }
@@ -434,6 +438,186 @@ TestOther() {
   unset ROCSHMEM_MAX_NUM_CONTEXTS
 }
 
+# TODO: remove when GDA is feature complete
+TestGDA() {
+  ##############################################################################
+  #       | Name             | Ranks | Workgroups | Threads | Max Message Size #
+  ##############################################################################
+  ExecTest  "put"              2       1            1         1048576
+  ExecTest  "put"              2       1            1024      512
+  ExecTest  "put"              2       8            1         1048576
+  ExecTest  "put"              2       16           128       8
+  ExecTest  "put"              2       32           256       512
+  ExecTest  "put"              2       64           1024      8
+
+#  ExecTest  "wgput"            2       1            64        1048576
+#  ExecTest  "wgput"            2       2            64        1048576
+#  ExecTest  "wgput"            2       16           64        8
+
+  ExecTest  "waveput"          2       1            64        1048576
+  ExecTest  "waveput"          2       2            64        1048576
+  ExecTest  "waveput"          2       2            128       1048576
+  ExecTest  "waveput"          2       16           128       8
+
+  ExecTest  "defaultctxput"    2       4            128       1024
+  ExecTest  "teamctxput"       2       4            128       1024
+  ExecTest  "teamctxput"       2       16           256       1024
+
+#  ExecTest  "get"              2       1            1         1048576
+#  ExecTest  "get"              2       1            1024      512
+#  ExecTest  "get"              2       8            1         1048576
+#  ExecTest  "get"              2       16           128       8
+#  ExecTest  "get"              2       32           256       512
+#  ExecTest  "get"              2       64           1024      8
+
+#  ExecTest  "wgget"            2       1            64        1048576
+#  ExecTest  "wgget"            2       2            64        1048576
+#  ExecTest  "wgget"            2       16           64        8
+
+#  ExecTest  "waveget"          2       1            64        1048576
+#  ExecTest  "waveget"          2       2            64        1048576
+#  ExecTest  "waveget"          2       2            128       1048576
+#  ExecTest  "waveget"          2       16           128       8
+
+#  ExecTest  "defaultctxget"    2       4            128       1024
+#  ExecTest  "teamctxget"       2       4            128       1024
+#  ExecTest  "teamctxget"       2       16           256       1024
+
+#  ExecTest  "g"                2       1            1         128
+#  ExecTest  "g"                2       1            1024      2
+#  ExecTest  "g"                2       8            1         32
+#  ExecTest  "g"                2       16           128       4
+
+#Implemented but known incorrect
+#  ExecTest  "p"                2       1            1         128
+#  ExecTest  "p"                2       1            1024      2
+#  ExecTest  "p"                2       8            1         32
+#  ExecTest  "p"                2       16           128       4
+
+  ################################ Non-Blocking ################################
+
+  ExecTest  "putnbi"           2       1            1         1048576
+  ExecTest  "putnbi"           2       1            1024      512
+  ExecTest  "putnbi"           2       8            1         1048576
+  ExecTest  "putnbi"           2       16           128       8
+  ExecTest  "putnbi"           2       32           256       512
+  ExecTest  "putnbi"           2       64           1024      8
+
+#  ExecTest  "wgputnbi"         2       1            64        1048576
+#  ExecTest  "wgputnbi"         2       2            64        1048576
+#  ExecTest  "wgputnbi"         2       16           64        8
+
+  ExecTest  "waveputnbi"       2       1            64        1048576
+  ExecTest  "waveputnbi"       2       2            64        1048576
+  ExecTest  "waveputnbi"       2       2            128       1048576
+  ExecTest  "waveputnbi"       2       16           128       8
+
+  ExecTest  "defaultctxputnbi" 2       4            128       1024
+  ExecTest  "teamctxputnbi"    2       4            128       1024
+  ExecTest  "teamctxputnbi"    2       16           256       1024
+
+#  ExecTest  "getnbi"           2       1            1         1048576
+#  ExecTest  "getnbi"           2       1            1024      512
+#  ExecTest  "getnbi"           2       8            1         1048576
+#  ExecTest  "getnbi"           2       16           128       8
+#  ExecTest  "getnbi"           2       32           256       512
+#  ExecTest  "getnbi"           2       64           1024      8
+
+#  ExecTest  "wggetnbi"         2       1            64        1048576
+#  ExecTest  "wggetnbi"         2       2            64        1048576
+#  ExecTest  "wggetnbi"         2       16           64        8
+
+#  ExecTest  "wavegetnbi"       2       1            64        1048576
+#  ExecTest  "wavegetnbi"       2       2            64        1048576
+#  ExecTest  "wavegetnbi"       2       2            128       1048576
+#  ExecTest  "wavegetnbi"       2       16           128       8
+
+#  ExecTest  "defaultctxgetnbi" 2       4            128       1024
+#  ExecTest  "teamctxgetnbi"    2       4            128       1024
+#  ExecTest  "teamctxgetnbi"    2       16           256       1024
+
+#TestAMO() {
+  ##############################################################################
+  #       | Name             | Ranks | Workgroups | Threads | Max Message Size #
+  ##############################################################################
+#  ExecTest  "amo_fetch"        2       1            1
+#  ExecTest  "amo_fetch"        2       1            1024
+#  ExecTest  "amo_fetch"        2       8            1
+#  ExecTest  "amo_fetch"        2       32           128
+
+#  ExecTest  "amo_set"          2       1            1
+#  ExecTest  "amo_set"          2       8            1
+#  ExecTest  "amo_set"          2       32           1
+
+#  ExecTest  "amo_fcswap"       2       1            1
+#  ExecTest  "amo_fcswap"       2       32           1
+#  ExecTest  "amo_fcswap"       2       8            1
+
+#Works on CX7, not implemented on BNXT
+#  ExecTest  "amo_finc"         2       1            1
+#  ExecTest  "amo_finc"         2       1            1024
+#  ExecTest  "amo_finc"         2       8            1
+#  ExecTest  "amo_finc"         2       32           128
+
+#This works but tester requires get
+#  ExecTest  "amo_inc"          2       1            1
+#  ExecTest  "amo_inc"          2       1            1024
+#  ExecTest  "amo_inc"          2       8            1
+#  ExecTest  "amo_inc"          2       32           128
+
+#Works on CX7, not implemented on BNXT
+#  ExecTest  "amo_fadd"         2       1            1
+#  ExecTest  "amo_fadd"         2       1            1024
+#  ExecTest  "amo_fadd"         2       8            1
+#  ExecTest  "amo_fadd"         2       32           128
+
+#This works but tester requires get
+#  ExecTest  "amo_add"          2       1            1
+#  ExecTest  "amo_add"          2       1            1024
+#  ExecTest  "amo_add"          2       8            1
+#  ExecTest  "amo_add"          2       32           128
+
+#  ExecTest  "amo_fetchand"     2       1            1
+
+#  ExecTest  "amo_and"          2       1            1
+
+#  ExecTest  "amo_xor"          2       1            1
+
+#TestColl() {
+  ##############################################################################
+  #       | Name             | Ranks | Workgroups | Threads | Max Message Size #
+  ##############################################################################
+  ExecTest  "barrierall"       2       1            1
+  ExecTest  "teambarrier"      2       1            1
+
+  ExecTest  "sync"             2       1            1
+  ExecTest  "syncall"          2       1            1
+
+#  ExecTest  "alltoall"         2       1            1         512
+
+#  ExecTest  "teambroadcast"    2       1            1         32768
+
+#  ExecTest  "fcollect"         2       1            1         512
+#  ExecTest  "fcollect"         2       1            1         32768
+
+#  ExecTest  "teamreduction"    2       1            1         32768
+
+#TestOther() {
+  ##############################################################################
+  #       | Name             | Ranks | Workgroups | Threads | Max Message Size #
+  ##############################################################################
+  ExecTest  "init"             2       1            1
+
+#  ExecTest  "pingpong"         2       1            1
+#  ExecTest  "pingpong"         2       8            1
+#  ExecTest  "pingpong"         2       32           1
+
+  # This test requires more contexts than workgroups
+  export ROCSHMEM_MAX_NUM_CONTEXTS=1024
+  ExecTest  "teamctxinfra"     2       1            1
+  unset ROCSHMEM_MAX_NUM_CONTEXTS
+}
+
 ValidateInput() {
   INPUT_COUNT=$1
   if [ $INPUT_COUNT -lt 3 ] ; then
@@ -467,6 +651,9 @@ ValidateInput $#
 ValidateLogDir $LOG_DIR
 
 case $TEST in
+  *"gda")
+    TestGDA
+    ;;
   *"all")
     TestRMA
     TestAMO
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 732eece2c9..c8542da45f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -63,13 +63,18 @@ target_compile_options(${PROJECT_NAME} PUBLIC ${ROCSHMEM_COMPILE_FLAGS})
 ###############################################################################
 # ROCSHMEM TARGET FOR BACKENDS
 ###############################################################################
+# Exactly one backend directory is built; precedence is RO, then IPC, then GDA.
-IF (USE_RO)
+if (USE_RO)
 add_subdirectory(reverse_offload)
-ELSE()
+elseif (USE_IPC)
 add_subdirectory(ipc)
-ENDIF()
-add_subdirectory(containers)
+elseif (USE_GDA)
+add_subdirectory(gda)
+else()
+message(FATAL_ERROR "Select one backend among USE_RO, USE_IPC, USE_GDA")
+endif()
 add_subdirectory(host)
+add_subdirectory(containers)
 add_subdirectory(memory)
 add_subdirectory(sync)
 add_subdirectory(bootstrap)
diff --git a/src/backend_bc.cpp b/src/backend_bc.cpp
index 1469d9a774..c974c11428 100644
--- a/src/backend_bc.cpp
+++ b/src/backend_bc.cpp
@@ -27,10 +27,12 @@
 #include "backend_type.hpp"
 #include "context_incl.hpp"
 
-#ifdef USE_RO
+#if defined(USE_RO)
 #include "reverse_offload/backend_ro.hpp"
-#else
+#elif defined(USE_IPC)
 #include "ipc/backend_ipc.hpp"
+#elif defined(USE_GDA)
+#include "gda/backend_gda.hpp"
 #endif
 
 #include <cassert>
@@ -247,18 +249,22 @@ void Backend::reset_stats() {
 }
 
 __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
-#ifdef USE_RO
+#if defined(USE_RO)
   return static_cast<ROBackend*>(this)->create_ctx(option, ctx);
-#else
+#elif defined(USE_IPC)
   return static_cast<IPCBackend*>(this)->create_ctx(option, ctx);
+#elif defined(USE_GDA)
+  return static_cast<GDABackend*>(this)->create_ctx(option, ctx);
 #endif
 }
 
 __device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) {
-#ifdef USE_RO
+#if defined(USE_RO)
   static_cast<ROBackend*>(this)->destroy_ctx(ctx);
-#else
+#elif defined(USE_IPC)
   static_cast<IPCBackend*>(this)->destroy_ctx(ctx);
+#elif defined(USE_GDA)
+  static_cast<GDABackend*>(this)->destroy_ctx(ctx);
 #endif
 }
 
diff --git a/src/backend_type.hpp b/src/backend_type.hpp
index 98268c7422..ef9c7d3874 100644
--- a/src/backend_type.hpp
+++ b/src/backend_type.hpp
@@ -46,7 +46,7 @@ namespace rocshmem {
  * @note Derived classes which use Backend as a base class must add
  * themselves to this enum class to support static polymorphism.
  */
-enum class BackendType { RO_BACKEND, IPC_BACKEND };
+enum class BackendType { RO_BACKEND, IPC_BACKEND, GDA_BACKEND };
 
 /**
  * @brief Helper macro for some dispatch calls
@@ -56,40 +56,52 @@ enum class BackendType { RO_BACKEND, IPC_BACKEND };
 /**
  * @brief Device static dispatch method call.
  */
-#ifdef USE_RO
+#if defined(USE_RO)
 #define DISPATCH(Func)                     \
   static_cast<ROContext *>(this)->Func;
-#else
+#elif defined(USE_IPC)
 #define DISPATCH(Func)                     \
   static_cast<IPCContext *>(this)->Func;
+#elif defined(USE_GDA)
+#define DISPATCH(Func)                     \
+  static_cast<GDAContext *>(this)->Func;
 #endif
 
 /**
  * @brief Device static dispatch method call with a return value.
  */
-#ifdef USE_RO
+#if defined(USE_RO)
 #define DISPATCH_RET(Func)                             \
   auto ret_val = static_cast<ROContext *>(this)->Func; \
   return ret_val;
-#else
-#define DISPATCH_RET(Func)                         \
-  auto ret_val{0};                                 \
-  ret_val = static_cast<IPCContext *>(this)->Func; \
+#elif defined(USE_IPC)
+#define DISPATCH_RET(Func)                              \
+  auto ret_val = static_cast<IPCContext *>(this)->Func; \
+  return ret_val;
+#elif defined(USE_GDA)
+#define DISPATCH_RET(Func)                              \
+  auto ret_val = static_cast<GDAContext *>(this)->Func; \
   return ret_val;
 #endif
+
 /**
  * @brief Device static dispatch method call with a return type of pointer.
  */
-#ifdef USE_RO
+#if defined(USE_RO)
 #define DISPATCH_RET_PTR(Func)                    \
   void *ret_val{nullptr};                         \
   ret_val = static_cast<ROContext *>(this)->Func; \
   return ret_val;
-#else
+#elif defined(USE_IPC)
 #define DISPATCH_RET_PTR(Func)                     \
   void *ret_val{nullptr};                          \
   ret_val = static_cast<IPCContext *>(this)->Func; \
   return ret_val;
+#elif defined(USE_GDA)
+#define DISPATCH_RET_PTR(Func)                     \
+  void *ret_val{nullptr};                          \
+  ret_val = static_cast<GDAContext *>(this)->Func; \
+  return ret_val;
 #endif
 
 /**
@@ -99,11 +111,14 @@ enum class BackendType { RO_BACKEND, IPC_BACKEND };
  * MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and
  * threading semantics of collectives in OpenSHMEM match those of MPI.
  */
-#ifdef USE_RO
+#if defined(USE_RO)
 #define HOST_DISPATCH(Func) static_cast<ROHostContext *>(this)->Func;
-#else
+#elif defined(USE_IPC)
 #define HOST_DISPATCH(Func) static_cast<IPCHostContext *>(this)->Func;
+#elif defined(USE_GDA)
+#define HOST_DISPATCH(Func) static_cast<GDAHostContext *>(this)->Func;
 #endif
+
 /**
  * @brief Host static dispatch method call with return value.
  *
@@ -111,31 +126,38 @@ enum class BackendType { RO_BACKEND, IPC_BACKEND };
  * MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and
  * threading semantics of collectives in OpenSHMEM match those of MPI.
  */
-
-#ifdef USE_RO
+#if defined(USE_RO)
 #define HOST_DISPATCH_RET(Func)                            \
   auto ret_val = static_cast<ROHostContext *>(this)->Func; \
   return ret_val;
-#else
-#define HOST_DISPATCH_RET(Func)                        \
-  auto ret_val{0};                                     \
-  ret_val = static_cast<IPCHostContext *>(this)->Func; \
+#elif defined(USE_IPC)
+#define HOST_DISPATCH_RET(Func)                             \
+  auto ret_val = static_cast<IPCHostContext *>(this)->Func; \
+  return ret_val;
+#elif defined(USE_GDA)
+#define HOST_DISPATCH_RET(Func)                             \
+  auto ret_val = static_cast<GDAHostContext *>(this)->Func; \
   return ret_val;
 #endif
 
 /**
  * @brief Host static dispatch method call with a return type of pointer.
  */
-#ifdef USE_RO
+#if defined(USE_RO)
 #define HOST_DISPATCH_RET_PTR(Func)                    \
   void *ret_val{nullptr};                              \
   ret_val = static_cast<ROHostContext *>(this)->Func;  \
   return ret_val;
-#else
+#elif defined(USE_IPC)
 #define HOST_DISPATCH_RET_PTR(Func)                    \
   void *ret_val{nullptr};                              \
   ret_val = static_cast<IPCHostContext *>(this)->Func; \
   return ret_val;
+#elif defined(USE_GDA)
+#define HOST_DISPATCH_RET_PTR(Func)                    \
+  void *ret_val{nullptr};                              \
+  ret_val = static_cast<GDAHostContext *>(this)->Func; \
+  return ret_val;
 #endif
 
 }  // namespace rocshmem
diff --git a/src/bootstrap/bootstrap.cpp b/src/bootstrap/bootstrap.cpp
index 54311a462a..ff107695a4 100644
--- a/src/bootstrap/bootstrap.cpp
+++ b/src/bootstrap/bootstrap.cpp
@@ -32,7 +32,7 @@
 
 #include "bootstrap.hpp"
 #include "utils.hpp"
-#include "../util.hpp"
+#include "util.hpp"
 #include "socket.hpp"
 
 namespace rocshmem {
diff --git a/src/bootstrap/socket.cpp b/src/bootstrap/socket.cpp
index 5b1b57dfda..1760c20096 100644
--- a/src/bootstrap/socket.cpp
+++ b/src/bootstrap/socket.cpp
@@ -36,7 +36,7 @@
 
 #include "socket.hpp"
 #include "utils.hpp"
-#include "../util.hpp"
+#include "util.hpp"
 
 namespace rocshmem {
 
diff --git a/src/bootstrap/utils.cpp b/src/bootstrap/utils.cpp
index 829599134a..bf68ded083 100644
--- a/src/bootstrap/utils.cpp
+++ b/src/bootstrap/utils.cpp
@@ -34,7 +34,7 @@
 #include <iostream>
 
 #include "utils.hpp"
-#include "../util.hpp"
+#include "util.hpp"
 
 constexpr char HOSTID_FILE[32] = "/proc/sys/kernel/random/boot_id";
 
diff --git a/src/containers/array_impl.hpp b/src/containers/array_impl.hpp
index 6697ace54a..551cf5bf91 100644
--- a/src/containers/array_impl.hpp
+++ b/src/containers/array_impl.hpp
@@ -26,7 +26,7 @@
 #define LIBRARY_SRC_CONTAINERS_ARRAY_IMPL_HPP_
 
 #include "array.hpp"
-#include "../constants.hpp"
+#include "constants.hpp"
 
 #include <hip/hip_runtime.h>
 #include <cassert>
diff --git a/src/containers/atomic_wf_queue.hpp b/src/containers/atomic_wf_queue.hpp
index 02e0f1d82c..bac0dbb030 100644
--- a/src/containers/atomic_wf_queue.hpp
+++ b/src/containers/atomic_wf_queue.hpp
@@ -27,9 +27,9 @@
 
 #include <hip/hip_runtime.h>
 
-#include "../memory/hip_allocator.hpp"
-#include "../sync/abql_block_mutex.hpp"
-#include "../src/util.hpp"
+#include "memory/hip_allocator.hpp"
+#include "sync/abql_block_mutex.hpp"
+#include "util.hpp"
 
 namespace rocshmem {
 
diff --git a/src/containers/free_list.hpp b/src/containers/free_list.hpp
index 9c238b4687..2bc640b338 100644
--- a/src/containers/free_list.hpp
+++ b/src/containers/free_list.hpp
@@ -27,8 +27,8 @@
 
 #include <hip/hip_runtime.h>
 
-#include "../memory/hip_allocator.hpp"
-#include "../sync/abql_block_mutex.hpp"
+#include "memory/hip_allocator.hpp"
+#include "sync/abql_block_mutex.hpp"
 
 namespace rocshmem {
 
diff --git a/src/containers/share_strategy.cpp b/src/containers/share_strategy.cpp
index 1e6ee557b7..e40c4b6256 100644
--- a/src/containers/share_strategy.cpp
+++ b/src/containers/share_strategy.cpp
@@ -23,7 +23,7 @@
  *****************************************************************************/
 
 #include "share_strategy.hpp"
-#include "../constants.hpp"
+#include "constants.hpp"
 
 #include <hip/hip_runtime.h>
 
diff --git a/src/context_incl.hpp b/src/context_incl.hpp
index b95bbf94f3..5f8106ebed 100644
--- a/src/context_incl.hpp
+++ b/src/context_incl.hpp
@@ -28,12 +28,17 @@
 #include "context.hpp"
 #include "context_tmpl_device.hpp"
 #include "context_tmpl_host.hpp"
-#ifdef USE_RO
+#if defined(USE_RO)
 #include "reverse_offload/context_ro_device.hpp"
 #include "reverse_offload/context_ro_host.hpp"
-#else
+#elif defined(USE_IPC)
 #include "ipc/context_ipc_device.hpp"
 #include "ipc/context_ipc_host.hpp"
+#elif defined(USE_GDA)
+#include "gda/context_gda_device.hpp"
+#include "gda/context_gda_host.hpp"
+#else
+#error "Select one backend among USE_RO, USE_IPC, USE_GDA"
 #endif
 
 #endif  // LIBRARY_SRC_CONTEXT_INCL_HPP_
diff --git a/src/context_tmpl_device.hpp b/src/context_tmpl_device.hpp
index 0fedf3ffd0..3c0b5802ac 100644
--- a/src/context_tmpl_device.hpp
+++ b/src/context_tmpl_device.hpp
@@ -27,10 +27,12 @@
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "backend_type.hpp"
-#ifdef USE_RO
+#if defined(USE_RO)
 #include "reverse_offload/context_ro_device.hpp"
-#else
+#elif defined(USE_IPC)
 #include "ipc/context_ipc_device.hpp"
+#elif defined(USE_GDA)
+#include "gda/context_gda_device.hpp"
 #endif
 
 namespace rocshmem {
diff --git a/src/context_tmpl_host.hpp b/src/context_tmpl_host.hpp
index 68a572bdaf..53236c540c 100644
--- a/src/context_tmpl_host.hpp
+++ b/src/context_tmpl_host.hpp
@@ -27,11 +27,14 @@
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "backend_type.hpp"
-#ifdef USE_RO
+#if defined(USE_RO)
 #include "reverse_offload/context_ro_host.hpp"
-#else
+#elif defined(USE_IPC)
 #include "ipc/context_ipc_host.hpp"
+#elif defined(USE_GDA)
+#include "gda/context_gda_host.hpp"
 #endif
+
 namespace rocshmem {
 
 template <typename T>
diff --git a/src/gda/CMakeLists.txt b/src/gda/CMakeLists.txt
new file mode 100644
index 0000000000..056f5f2423
--- /dev/null
+++ b/src/gda/CMakeLists.txt
@@ -0,0 +1,55 @@
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+
+###############################################################################
+# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY
+###############################################################################
+target_sources(
+  ${PROJECT_NAME}
+  PRIVATE
+    context_gda_device.cpp
+    context_gda_device_coll.cpp
+    context_gda_host.cpp
+    backend_gda.cpp
+    gda_team.cpp
+    queue_pair.cpp
+    endian.cpp
+    topology.cpp
+)
+
+find_package(IBVerbs REQUIRED)  # presumably supplies the IBVerbs::verbs target linked below - confirm a FindIBVerbs module is on the module path
+
+target_link_libraries(
+  ${PROJECT_NAME}
+  PUBLIC
+    IBVerbs::verbs
+    numa  # NOTE(review): plain library name, so it is passed to the linker as-is instead of going through an imported target
+)
+
+if (GDA_IONIC)  # NOTE(review): adds no sources here and takes precedence over GDA_BNXT - confirm both are intended
+elseif (GDA_BNXT)  # Broadcom provider: sources come from the bnxt/ subdirectory
+  add_subdirectory(bnxt)
+else()  # neither GDA_IONIC nor GDA_BNXT: compile segment_builder.cpp into the library
+  target_sources(${PROJECT_NAME} PRIVATE segment_builder.cpp)
+endif()
diff --git a/src/gda/backend_gda.cpp b/src/gda/backend_gda.cpp
new file mode 100644
index 0000000000..a7c555a527
--- /dev/null
+++ b/src/gda/backend_gda.cpp
@@ -0,0 +1,1238 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include <cstring>
+
+#include "backend_gda.hpp"
+#include "gda_team.hpp"
+#include "util.hpp"
+#include "topology.hpp"
+
+#include <hip/hip_runtime.h>
+#include <cstdlib>
+#include <cassert>
+
+namespace rocshmem {
+
+#define NET_CHECK(cmd) {  /* abort unless cmd evaluates to MPI_SUCCESS */ \
+    if ((cmd) != MPI_SUCCESS) {                              \
+      fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \
+      abort();                                               \
+    }                                                        \
+  }
+
+extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
+
+rocshmem_team_t get_external_team(GDATeam *team) {
+  return reinterpret_cast<rocshmem_team_t>(team);  // the public handle is the GDATeam pointer itself, reinterpreted
+}
+
+int get_ls_non_zero_bit(char *bitmask, int mask_length) {
+  // Scan bits in logical order: bit k lives in byte k / CHAR_BIT at in-byte
+  // position k % CHAR_BIT. mask_length counts bits, not bytes.
+  for (int idx = 0; idx < mask_length; ++idx) {
+    const char chunk = bitmask[idx / CHAR_BIT];
+    const int probe = 1 << (idx % CHAR_BIT);
+    if (chunk & probe) {
+      return idx;  // lowest-index set bit wins
+    }
+  }
+  return -1;  // no bit set within the first mask_length bits
+}
+
+GDABackend::GDABackend(MPI_Comm comm):  Backend(comm) {  // the base class receives the MPI communicator
+  init();
+}
+
+GDABackend::GDABackend(TcpBootstrap *bootstrap):  Backend(bootstrap) {  // the base class receives the TCP bootstrap
+  init();
+}
+
+void GDABackend::init() {
+  type = BackendType::GDA_BACKEND;
+  read_env();
+
+  //TODO setup_host_interface();
+  /* Initialize the host interface (MPI communicator if present, else TCP bootstrap) */
+  if (MPI_COMM_NULL != backend_comm)
+    host_interface = std::make_shared<HostInterface>(hdp_proxy_.get(), //TODO: need an hdp proxy?
+                                                     backend_comm,
+                                                     &heap);
+  else
+    host_interface = std::make_shared<HostInterface>(hdp_proxy_.get(), //TODO: need an hdp proxy?
+                                                     backend_bootstr,
+                                                     &heap);
+
+  /* The three calls after setup_wrk_sync_buffer() carve the pool in order.
+   * The int fence array is carved last: sizeof(int) * num_pes is not a multiple
+   * of 8 for odd num_pes and would misalign the 64-bit sync arrays after it. */
+  setup_wrk_sync_buffer();
+  setup_collectives();
+  setup_teams();
+  setup_fence_buffer();
+  setup_team_world();  // must precede setup_ibv(), which uses ROCSHMEM_TEAM_WORLD on the bootstrap path
+  rte_barrier();
+  setup_ibv();
+  setup_heap_memory_rkey();
+  setup_gpu_qps();
+  setup_ctxs();
+  rte_barrier();
+}
+
+GDABackend::~GDABackend() {
+  cleanup_ctxs();
+  // Release the host-side team bitmasks first, then TEAM_WORLD itself.
+  cleanup_teams();
+  auto *team_world{team_tracker.get_team_world()};
+  team_world->~Team();  // placement-constructed in setup_team_world(), so it is destroyed by hand
+  CHECK_HIP(hipFree(team_world));
+  // NOTE(review): the pool is freed before cleanup_heap_memory_rkey() runs - confirm this order is safe.
+  cleanup_wrk_sync_buffer();
+
+  cleanup_gpu_qps();
+  cleanup_heap_memory_rkey();
+  cleanup_ibv();
+}
+
+void GDABackend::read_env() {
+  if (const char *ctx_cap = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
+    std::stringstream parser(ctx_cap);
+    parser >> maximum_num_contexts_;  // size of the context pool built in setup_ctxs()
+  }
+  // NIC selection: an explicit HCA name wins, otherwise ask the topology helper.
+  if (const char *hca_name = getenv("ROCSHMEM_USE_IB_HCA")) {
+    requested_dev = strdup(hca_name);
+  } else {
+    int gpu_dev = 0;
+    CHECK_HIP(hipGetDevice(&gpu_dev));
+    int nic_dev = rocshmem::GetClosestNicToGpu(gpu_dev, &requested_dev);
+    assert (nic_dev != -1);
+  }
+  if (const char *sq_text = getenv("ROCSHMEM_SQ_SIZE")) {
+    sq_size = atoi(sq_text);
+  }
+}
+
+
+void GDABackend::setup_host_ctx() {
+  default_host_ctx = std::make_unique<GDAHostContext>(this, 0);  // owned by the backend through the unique_ptr
+  ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();  // the global default host handle only borrows the pointer
+}
+
+void GDABackend::setup_default_ctx() {
+  TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;  // needs setup_team_world() to have registered the world team
+  default_context_proxy_ = GDADefaultContextProxyT(this, tinfo);
+}
+
+void GDABackend::setup_ctxs() {
+  setup_host_ctx();
+  setup_default_ctx();
+  // Context id 0 belongs to the default context, so pool entries are numbered from 1.
+  CHECK_HIP(hipMalloc(&ctx_array, sizeof(GDAContext) * maximum_num_contexts_));
+  for (size_t slot = 0; slot < maximum_num_contexts_; ++slot) {
+    GDAContext *entry = ctx_array + slot;
+    new (entry) GDAContext(this, slot + 1);  // construct in place inside ctx_array
+    ctx_free_list.get()->push_back(entry);
+  }
+}
+
+void GDABackend::cleanup_ctxs() {
+  ctx_free_list.~FreeListProxy();  // NOTE(review): explicit destructor call on a member - confirm it is not destroyed a second time with the backend
+  for (size_t i = 0; i < maximum_num_contexts_; i++) {
+    ctx_array[i].~GDAContext();  // contexts were placement-constructed in setup_ctxs()
+  }
+  // ctx_array itself came from hipMalloc in setup_ctxs().
+  CHECK_HIP(hipFree(ctx_array));
+}
+
+__device__ bool GDABackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) {
+  // Take an unused context from the free list; an empty pool is reported to
+  // the caller as failure. `options` is not consulted here.
+  auto popped = ctx_free_list.get()->pop_front();
+  if (popped.success) {
+    GDAContext *fresh = popped.value;
+    // Publish the context through the caller's handle ...
+    ctx->ctx_opaque = fresh;
+    // ... and bind it to the team already recorded in that handle.
+    fresh->tinfo = reinterpret_cast<TeamInfo *>(ctx->team_opaque);
+    return true;
+  }
+  return false;
+}
+
+__device__ void GDABackend::destroy_ctx(rocshmem_ctx_t *ctx) {
+  ctx_free_list.get()->push_back(static_cast<GDAContext *>(ctx->ctx_opaque));  // return the context to the free list
+}
+
+void GDABackend::setup_team_world() {
+  /**
+   * TEAM_WORLD is described by two identical TeamInfo records (one relative
+   * to the parent, one relative to the world). Both are placement-constructed
+   * in hipMalloc'ed memory; args presumably (parent, start, stride, size).
+   */
+  TeamInfo *wrt_parent{nullptr};
+  CHECK_HIP(hipMalloc(&wrt_parent, sizeof(TeamInfo)));
+  new (wrt_parent) TeamInfo(nullptr, 0, 1, num_pes);
+
+  TeamInfo *wrt_world{nullptr};
+  CHECK_HIP(hipMalloc(&wrt_world, sizeof(TeamInfo)));
+  new (wrt_world) TeamInfo(nullptr, 0, 1, num_pes);
+
+  /* Build the GDATeam object itself in hipMalloc'ed memory, pool index 0. */
+  GDATeam *world{nullptr};
+  CHECK_HIP(hipMalloc(&world, sizeof(GDATeam)));
+  new (world) GDATeam(this, wrt_parent, wrt_world, num_pes, my_pe,
+                      backend_comm, 0);
+  team_tracker.set_team_world(world);
+
+  /* Publish the team through the global ROCSHMEM_TEAM_WORLD handle. */
+  ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(world);
+}
+
+void GDABackend::team_destroy(rocshmem_team_t team) {
+  GDATeam *doomed = get_internal_gda_team(team);
+
+  /* Return the team's slot to the pool: a set bit marks a slot as available. */
+  const int slot = doomed->pool_index_;
+  team_pool_bitmask_[slot / CHAR_BIT] |= 1 << (slot % CHAR_BIT);
+
+  /* Run the destructor by hand, then release the hipMalloc'ed storage. */
+  doomed->~GDATeam();
+  CHECK_HIP(hipFree(doomed));
+}
+
+//TODO: factorize somewhere else maybe backend_bc
+void GDABackend::Alltoall_char_inplace (char *inoutbuf, size_t num_bytes, rocshmem_team_t team) {
+  // In-place all-to-all without MPI: block i of inoutbuf is sent to team PE i and
+  // then overwritten with the num_bytes received from team PE i; own block is kept.
+  GDATeam *team_obj = reinterpret_cast<GDATeam *>(team);
+  int num_pes = team_obj->num_pes;
+  int my_pe = team_obj->my_pe;
+  int *pes_in_world = new int[num_pes];  // peers are passed to the bootstrap as world ranks
+
+  for (int i = 0; i < num_pes; i++) {
+      pes_in_world[i] = team_obj->get_pe_in_world(i);
+  }
+
+  // Since this is an in-place algorithm, allocate the temporary receive buffer first
+  char *recv_buf = new char[num_bytes * num_pes];
+  std::memset(recv_buf, 0, num_pes * num_bytes);
+
+  // Perform pairwise exchange - local copy is omitted
+  for (int step = 1; step < num_pes; step++) {
+    int sendto_team  = (my_pe + step) % num_pes;
+    int recvfrom_team = (my_pe + num_pes - step) % num_pes;
+
+    char *tmpsend = (char*)inoutbuf + (ptrdiff_t)sendto_team * num_bytes;
+    char *tmprecv = (char*)recv_buf + (ptrdiff_t)recvfrom_team * num_bytes;
+
+    // similarly to the allGather in the bootstrap code, we do send first
+    // followed by the receive.
+    // There is a chance for deadlock in my opinion for large messages.
+    backend_bootstr->send(tmpsend, num_bytes, pes_in_world[sendto_team], step /* used as tag */);
+    backend_bootstr->recv(tmprecv, num_bytes, pes_in_world[recvfrom_team], step);
+  }
+  //Since this is an in_place all-to-all, copy data back into the user buffer
+  for (int step = 0; step < num_pes; step++) {
+    if (step == my_pe) continue;
+    std::memcpy(&inoutbuf[step*num_bytes], &recv_buf[step*num_bytes], num_bytes);
+  }
+
+  delete[] recv_buf;
+  delete[] pes_in_world;
+}
+
+//TODO: factorize somewhere else, maybe backend_bc?
+void GDABackend::Allreduce_char_BAND (char* inbuf, char *outbuf, size_t num_bytes,
+                                      Team *team) {
+  // Byte-wise bitwise-AND allreduce without MPI, specialized for team creation:
+  // every PE contributes num_bytes from inbuf and receives the AND of all
+  // contributions in outbuf. Implemented as a bootstrap allGather followed by
+  // a local reduction, so the team must contain every bootstrap rank.
+  GDATeam *team_obj = reinterpret_cast<GDATeam *>(team);
+  int num_pes = team_obj->num_pes;
+  int my_pe = team_obj->my_pe;
+
+  // Gather buffer: slot p holds PE p's contribution; ours is filled in locally.
+  char *tmp_buffer = new char[num_pes * num_bytes];
+  std::memset(tmp_buffer, 0, num_pes * num_bytes);
+  std::memcpy (&tmp_buffer[my_pe * num_bytes], inbuf, num_bytes);
+
+  if (num_pes == backend_bootstr->getNranks() ) {
+    backend_bootstr->allGather(tmp_buffer, num_bytes);
+  } else {
+    printf("GDABackend::create_new_team: non-mpi version only supports parent_teams that contain all processes. Aborting.\n");
+    abort();
+  }
+
+  // size_t index: num_bytes is size_t, so an int counter would mix signedness.
+  for (size_t i = 0; i < num_bytes; i++) {
+    outbuf[i] = tmp_buffer[i];
+    for (int j = 1; j < num_pes; j++) {
+      outbuf[i] &= tmp_buffer[j * num_bytes + i];
+    }
+  }
+  delete[] tmp_buffer;
+}
+
+void GDABackend::create_new_team([[maybe_unused]] Team *parent_team,
+                                TeamInfo *team_info_wrt_parent,
+                                TeamInfo *team_info_wrt_world, int num_pes,
+                                int my_pe_in_new_team, MPI_Comm team_comm,
+                                rocshmem_team_t *new_team) {
+  /**
+   * Pick a team-pool slot that is free on every participating PE and build
+   * a GDATeam for it. Free slots are tracked as set bits in
+   * team_pool_bitmask_, so a bitwise-AND allreduce of all masks leaves only
+   * the commonly free slots set.
+   */
+  if (team_comm == MPI_COMM_NULL) {
+    Allreduce_char_BAND (team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_, parent_team);
+  } else {
+    NET_CHECK(MPI_Allreduce(team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_,
+                            MPI_CHAR, MPI_BAND, team_comm));
+  }
+
+  /* Take the lowest-numbered slot that is still set in the reduced mask. */
+  const int slot = get_ls_non_zero_bit(team_reduced_bitmask_,
+                                       team_tracker.get_max_num_teams());
+  if (slot < 0) {
+    /* No team available */
+    printf("Could not create team, all bits in use. Aborting.\n");
+    abort();
+  }
+
+  /* Claim the slot locally by clearing its bit in the pool bitmask. */
+  team_pool_bitmask_[slot / CHAR_BIT] &= ~(1 << (slot % CHAR_BIT));
+
+  /**
+   * Construct the new GDATeam in hipMalloc'ed memory; team_destroy() is the
+   * matching teardown and gives the slot back.
+   */
+  GDATeam *created{nullptr};
+  CHECK_HIP(hipMalloc(&created, sizeof(GDATeam)));
+  new (created) GDATeam(this, team_info_wrt_parent, team_info_wrt_world,
+                        num_pes, my_pe_in_new_team, team_comm, slot);
+
+  /* Hand the new team back to the caller as an opaque handle. */
+  *new_team = get_external_team(created);
+}
+
+void GDABackend::ctx_create(int64_t options, void **ctx) {
+  // Heap-allocate a host context and return it through the out-parameter;
+  // ctx_destroy() is the matching release.
+  *ctx = new GDAHostContext(this, options);
+}
+
+GDAHostContext *get_internal_gda_net_ctx(Context *ctx) {
+  return reinterpret_cast<GDAHostContext *>(ctx);  // unchecked cast: the caller must pass a context that really is a GDAHostContext
+}
+
+void GDABackend::ctx_destroy(Context *ctx) {
+  // Counterpart of ctx_create(): view the handle as a GDAHostContext and delete it.
+  delete get_internal_gda_net_ctx(ctx);
+}
+
+void GDABackend::reset_backend_stats() {
+  assert(false);  // not implemented; note the assert compiles away when NDEBUG is defined
+}
+
+void GDABackend::dump_backend_stats() {
+  assert(false);  // not implemented; note the assert compiles away when NDEBUG is defined
+}
+
+__host__ void GDABackend::global_exit(int status) {
+  if (backend_comm == MPI_COMM_NULL)
+    abort();  // no communicator to abort through; status is not forwarded on this path
+  else
+    MPI_Abort(backend_comm, status);
+}
+
+void GDABackend::cleanup_teams() {
+  free(team_pool_bitmask_);  // both bitmasks are malloc'ed in setup_teams()
+  free(team_reduced_bitmask_);
+}
+
+void GDABackend::setup_wrk_sync_buffer() {
+  /**
+   * Work out how many bytes the fence, barrier, team sync and team work
+   * arrays need in total, then allocate them as one block from the heap.
+   * The other setup_*() functions later hand out slices of this block by
+   * advancing wrk_sync_pool_top_.
+   */
+  const auto team_slots{team_tracker.get_max_num_teams()};
+
+  /* Global barrier sync array. */
+  const size_t barrier_bytes =
+      sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE;
+
+  /* Per-team sync arrays: barrier, reduce, broadcast and alltoall. */
+  const size_t team_sync_bytes = sizeof(long) * team_slots *
+                                 (ROCSHMEM_BARRIER_SYNC_SIZE +
+                                  ROCSHMEM_REDUCE_SYNC_SIZE +
+                                  ROCSHMEM_BCAST_SYNC_SIZE +
+                                  ROCSHMEM_ALLTOALL_SYNC_SIZE);
+
+  /* Per-team work arrays, sized for the largest element type (double). */
+  const size_t team_work_bytes = sizeof(double) * team_slots *
+                                 (ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE +
+                                  ROCSHMEM_ATA_MAX_WRKDATA_SIZE);
+
+  /* Fence array: one int per PE. */
+  const size_t fence_bytes = sizeof(int) * num_pes; //TODO: do we need a fence array?
+
+  wrk_sync_pool_size_ += barrier_bytes + team_sync_bytes + team_work_bytes +
+                         fence_bytes;
+
+  /**
+   * Allocate a buffer of size wrk_sync_pool_size_, using heap memory
+   * (should be uncached fine-grained ideally). The allocation is only
+   * checked by the assert below.
+   */
+  heap.malloc(reinterpret_cast<void **>(&wrk_sync_pool_),
+              wrk_sync_pool_size_);
+  assert(wrk_sync_pool_);
+  wrk_sync_pool_top_ = wrk_sync_pool_;  // nothing has been carved yet
+}
+
+void GDABackend::cleanup_wrk_sync_buffer() {
+  heap.free(wrk_sync_pool_);  // releases the whole pool allocated in setup_wrk_sync_buffer()
+}
+
+void GDABackend::setup_fence_buffer() { //TODO is this used?
+  /*
+   * Carve num_pes ints off the top of the work/sync pool for the fence array.
+   */
+  fence_pool = reinterpret_cast<int *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(int) * num_pes;  // advance the top past the reservation
+}
+
+void GDABackend::setup_collectives() {
+  /*
+   * Carve the global barrier sync array off the top of the work/sync pool
+   * (ROCSHMEM_BARRIER_SYNC_SIZE elements of sizeof(*barrier_sync) bytes).
+   */
+  barrier_sync = reinterpret_cast<int64_t*>(wrk_sync_pool_top_);
+  const size_t barrier_bytes{sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE};
+  wrk_sync_pool_top_ += barrier_bytes;
+
+  /*
+   * Every element starts out holding ROCSHMEM_SYNC_VALUE.
+   */
+  for (int slot = 0; slot < ROCSHMEM_BARRIER_SYNC_SIZE; ++slot) {
+    barrier_sync[slot] = ROCSHMEM_SYNC_VALUE;
+  }
+
+  /*
+   * Hold every processing element here until all of them have finished
+   * the initialization above.
+   * (rte_barrier() uses MPI or the bootstrap, whichever is active.)
+   */
+  rte_barrier();
+}
+
+void GDABackend::setup_teams() {
+  /**
+   * Carve the per-team sync and work arrays out of the work/sync pool.
+   */
+  auto max_num_teams{team_tracker.get_max_num_teams()};
+
+  barrier_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE
+                            * max_num_teams;
+
+  reduce_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE
+                            * max_num_teams;
+
+  bcast_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
+                            * max_num_teams;
+  /* Reserve ALLTOALL-sized slots: the pool is sized and indexed with them. */
+  alltoall_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE
+                            * max_num_teams;
+
+  /* Accommodating for largest possible data type for pWrk */
+  pWrk_pool = reinterpret_cast<void *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
+                            * max_num_teams;
+
+  /* Alltoall work area, sized with the same element type as pWrk */
+  pAta_pool = reinterpret_cast<void *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE
+                            * max_num_teams;
+
+  /**
+   * Initialize the sync arrays in the pool with default values.
+   */
+  long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
+  for (int team_i = 0; team_i < max_num_teams; team_i++) {
+    barrier_pSync = reinterpret_cast<long *>(
+        &barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
+    reduce_pSync = reinterpret_cast<long *>(
+        &reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
+    bcast_pSync = reinterpret_cast<long *>(
+        &bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
+    alltoall_pSync = reinterpret_cast<long *>(
+        &alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
+
+    for (size_t i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
+      barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
+    }
+    for (size_t i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
+      reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
+    }
+    for (size_t i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
+      bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
+    }
+    for (size_t i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
+      alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
+    }
+  }
+
+  /**
+   * Initialize the availability bitmask (one bit per team slot, set = free).
+   *
+   * Logical bit k is stored in byte k / CHAR_BIT at in-byte position
+   * k % CHAR_BIT, i.e. the least significant 8 logical bits occupy byte 0,
+   * the next 8 occupy byte 1, and so on.
+   *
+   * The byte count is max_num_teams / CHAR_BIT rounded up, so the layout
+   * extends to any number of teams. team_reduced_bitmask_ has the same
+   * size and starts out cleared.
+   */
+  team_bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1)
+                                             : (max_num_teams / CHAR_BIT);
+  team_pool_bitmask_ = reinterpret_cast<char *>(malloc(team_bitmask_size_));
+  team_reduced_bitmask_ = reinterpret_cast<char *>(malloc(team_bitmask_size_));
+
+  memset(team_pool_bitmask_, 0, team_bitmask_size_);
+  memset(team_reduced_bitmask_, 0, team_bitmask_size_);
+  /* Set all to available except the 0th one (reserved for TEAM_WORLD) */
+  for (int bit_i = 1; bit_i < max_num_teams; bit_i++) {
+    int byte_i = bit_i / CHAR_BIT;
+    team_pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT);
+  }
+
+  /**
+   * Make sure that all processing elements have done this before
+   * continuing.
+   */
+  rte_barrier();
+}
+
+void GDABackend::rte_barrier() {
+  if (backend_comm == MPI_COMM_NULL) {
+    backend_bootstr->barrier();  // no communicator: synchronize over the bootstrap instead
+  } else {
+    NET_CHECK(MPI_Barrier(backend_comm));
+  }
+}
+
+// Forward declarations of the debug dump helpers defined near the end of this
+// file; each prints the fields of one verbs/mlx5dv object through DPRINTF.
+// The mlx5dv variants are only defined when neither GDA_IONIC nor GDA_BNXT
+// is set.
+static void dump_ibv_context(struct ibv_context *x);
+static void dump_ibv_device(struct ibv_device *x);
+static void dump_ibv_pd(struct ibv_pd *x);
+static void dump_ibv_port_attr(struct ibv_port_attr *x);
+static void dump_ibv_qp(struct ibv_qp *qp, int conn_num);
+static void dump_mlx5dv_qp(struct mlx5dv_qp *qp_dv, int conn_num);
+static void dump_mlx5dv_cq(struct mlx5dv_cq *cq_dv, int conn_num);
+
+/**
+ * @brief Select an RDMA device, create the queue pairs and connect them.
+ *
+ * Steps:
+ *  1. Pick the first device returned by ibv_get_device_list(), or the first
+ *     one whose name contains `requested_dev` when that is set.
+ *  2. Initialize the verbs state (ib_init) and create one QP/CQ per
+ *     (context slot, PE) pair (create_qps), which also fills the local
+ *     dest_info entries.
+ *  3. Exchange dest_info with all PEs, one all-to-all per context slot.
+ *  4. Move every QP to RTR and then to RTS, with a barrier after each phase.
+ */
+void GDABackend::setup_ibv() {
+  dest_info.resize(num_pes * (maximum_num_contexts_ + 1));
+  int ib_devices{0};
+  dev_list = ibv_get_device_list(&ib_devices);
+  CHECK_NNULL(dev_list, "ibv_get_device");
+  // The returned array is NULL-terminated, so entry 0 is null when the host
+  // exposes no RDMA device; fail here instead of inside ibv_open_device().
+  struct ibv_device* ib_dev = dev_list[0]; //TODO default to HIP selected device?
+  CHECK_NNULL(ib_dev, "ibv_get_device_list: no RDMA device found");
+  if (requested_dev) {
+    // Substring match on the device name; keeps dev_list[0] if nothing matches.
+    for (int i = 0; i < ib_devices; i++) {
+      const char* select_dev{ibv_get_device_name(dev_list[i])};
+      CHECK_NNULL(select_dev, "ibv_get_device_name");
+      if (strstr(select_dev, requested_dev)) {
+        ib_dev = dev_list[i];
+        break;
+      }
+    }
+  }
+  uint8_t port{1};
+  ib_init(ib_dev, port);
+  create_qps(port, &ib_state->portinfo);
+
+  auto npes = num_pes;
+  auto dinfo = dest_info.data();
+  // dest_info is laid out as [context slot][PE]; exchange one slot at a time.
+  for (int i = 0; i < maximum_num_contexts_ + 1; i++) {
+    if (backend_comm != MPI_COMM_NULL) {
+      // Check the MPI result the same way rte_barrier() does.
+      NET_CHECK(MPI_Alltoall(MPI_IN_PLACE, sizeof(dest_info_t), MPI_CHAR, dinfo + i * npes, sizeof(dest_info_t), MPI_CHAR, backend_comm));
+    } else {
+      Alltoall_char_inplace(reinterpret_cast<char*>(dinfo + i * npes), sizeof(dest_info_t), ROCSHMEM_TEAM_WORLD);
+    }
+  }
+
+  const int num_qps = static_cast<int>(qps.size());
+  for (int i = 0; i < num_qps; i++) {
+    change_status_rtr(qps[i], &dest_info[i], port);
+  }
+  // All peers must be in RTR before anyone transitions to RTS.
+  rte_barrier();
+  for (int i = 0; i < num_qps; i++) {
+    change_status_rts(qps[i], &dest_info[i]);
+    dump_ibv_qp(qps[i], i);
+  }
+  rte_barrier();
+}
+
+/**
+ * @brief Release the host-side bookkeeping created by setup_ibv().
+ *
+ * Frees the device list, the ib_state object and the requested device name.
+ * NOTE(review): QPs, CQs, protection domains and the device context are not
+ * released here -- presumably handled elsewhere; confirm.
+ */
+void GDABackend::cleanup_ibv() {
+  ibv_free_device_list(dev_list);
+  delete ib_state;
+
+  // free() is a no-op on a null pointer, so no explicit guard is required.
+  free(requested_dev);
+}
+
+
+/**
+ * @brief Register the local symmetric heap and gather every PE's rkey.
+ *
+ * Registers the heap with the original protection domain, stores this PE's
+ * rkey in slot `my_pe` of `heap_rkey`, and all-gathers the table through a
+ * temporary host buffer so that `heap_rkey[pe]` holds the rkey of each PE.
+ */
+void GDABackend::setup_heap_memory_rkey() {
+  auto *base_heap = heap.get_local_heap_base();
+  int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
+
+  heap_mr = ibv_reg_mr(ib_state->pd_orig, base_heap, heap.get_size(), access);
+  CHECK_NNULL(heap_mr, "ibv_reg_mr");
+
+  const size_t rkeys_size = sizeof(uint32_t) * num_pes;
+  uint32_t *host_rkey_cpy = reinterpret_cast<uint32_t*>(malloc(rkeys_size));
+  if (!host_rkey_cpy) { abort(); }
+
+  CHECK_HIP(hipHostMalloc(&heap_rkey, rkeys_size));
+  heap_rkey[my_pe] = heap_mr->rkey;
+
+  // Stage the table in plain host memory for the collective. Only slot my_pe
+  // is meaningful at this point; the gather overwrites the remaining slots.
+  hipStream_t stream;
+  CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+  CHECK_HIP(hipMemcpyAsync(host_rkey_cpy, heap_rkey, rkeys_size, hipMemcpyDeviceToHost, stream));
+  CHECK_HIP(hipStreamSynchronize(stream));
+
+  if (backend_comm != MPI_COMM_NULL) {
+    // Check the MPI result the same way rte_barrier() does.
+    NET_CHECK(MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_rkey_cpy, sizeof(uint32_t), MPI_CHAR, backend_comm));
+  } else {
+    backend_bootstr->allGather(host_rkey_cpy, sizeof(uint32_t));
+  }
+
+  CHECK_HIP(hipMemcpyAsync(heap_rkey, host_rkey_cpy, rkeys_size, hipMemcpyHostToDevice, stream));
+  CHECK_HIP(hipStreamSynchronize(stream));
+  CHECK_HIP(hipStreamDestroy(stream));
+
+  free(host_rkey_cpy);
+}
+
+/**
+ * @brief Undo setup_heap_memory_rkey(): deregister the heap memory region
+ * and free the rkey table.
+ */
+void GDABackend::cleanup_heap_memory_rkey() {
+  const int dereg_status = ibv_dereg_mr(heap_mr);
+  CHECK_ZERO(dereg_status, "ibv_dereg_mr");
+
+  // heap_rkey was obtained from hipHostMalloc, hence hipHostFree.
+  CHECK_HIP(hipHostFree(heap_rkey));
+}
+
+/**
+ * @brief Allocate the QueuePair array and initialize one entry per
+ * (context slot, PE) connection.
+ *
+ * Each entry is first seeded by copying a freshly constructed QueuePair and
+ * then completed by initialize_gpu_qp().
+ * NOTE(review): initialize_gpu_qp() writes through the hipMalloc'ed pointer
+ * from host code -- presumably relies on the allocation being host
+ * accessible; confirm.
+ */
+void GDABackend::setup_gpu_qps() {
+  CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * (maximum_num_contexts_ + 1) * num_pes));
+
+  const auto connection_count = (maximum_num_contexts_ + 1) * num_pes;
+  for (int conn = 0; conn < connection_count; conn++) {
+    QueuePair seed(ib_state->pd_orig);
+    CHECK_HIP(hipMemcpy(&gpu_qps[conn], &seed, sizeof(QueuePair), hipMemcpyDefault));
+    initialize_gpu_qp(&gpu_qps[conn], conn);
+  }
+}
+
+/**
+ * @brief Free the QueuePair array allocated by setup_gpu_qps() and reset the
+ * pointer.
+ */
+void GDABackend::cleanup_gpu_qps() {
+  //TODO need to destruct qp[i]?
+  CHECK_HIP(hipFree(gpu_qps));
+  gpu_qps = nullptr;
+}
+
+//TODO this ifdef sequence should go in a nic-specific file, like it is for bnxt, maybe whats above too?
+#ifndef GDA_BNXT
+/**
+ * @brief Open the RDMA device and build the verbs state used by the backend.
+ *
+ * Creates the ib_state object, opens @p ib_dev, allocates the base protection
+ * domain and a parent domain that uses the backend's custom allocators,
+ * queries @p port and selects a GID index. With GDA_IONIC it additionally
+ * creates the two per-UDMA parent domains and maps the doorbell page for GPU
+ * access.
+ *
+ * @param[in] ib_dev Device to open.
+ * @param[in] port   Port number to query.
+ */
+void GDABackend::ib_init(struct ibv_device* ib_dev, uint8_t port) {
+  // Value-initialize so members that are not assigned below (e.g. `mr`)
+  // start out as null/zero instead of holding indeterminate values.
+  ib_state = new ib_state_t{};
+  CHECK_NNULL(ib_state, "ib_state object create");
+
+  ib_state->context = ibv_open_device(ib_dev);
+  CHECK_NNULL(ib_state->context, "ib open device");
+  dump_ibv_context(ib_state->context);
+  dump_ibv_device(ib_state->context->device);
+
+  ib_state->pd_orig = ibv_alloc_pd(ib_state->context);
+  CHECK_NNULL(ib_state->pd_orig, "ib allocate pd");
+  dump_ibv_pd(ib_state->pd_orig);
+
+  // init_parent_domain_attr() reads ib_state->pd_orig, so it must run after
+  // the base PD has been allocated.
+  ibv_parent_domain_init_attr pattr{};
+  init_parent_domain_attr(&pattr);
+  ib_state->pd_parent = ibv_alloc_parent_domain(ib_state->context, &pattr);
+  CHECK_NNULL(ib_state->pd_parent, "ibv_alloc_parent_domain");
+  dump_ibv_pd(ib_state->pd_parent);
+
+#ifdef GDA_IONIC
+  ionic_dv_pd_set_sqcmb(ib_state->pd_parent, false, false, false);
+  ionic_dv_pd_set_rqcmb(ib_state->pd_parent, false, false, false);
+
+  // One parent domain per UDMA engine; create_qps() alternates between them.
+  for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) {
+    ib_state->pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(ib_state->context, &pattr);
+    CHECK_NNULL(ib_state->pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)");
+
+    ionic_dv_pd_set_sqcmb(ib_state->pd_uxdma[uxdma_i], false, false, false);
+    ionic_dv_pd_set_rqcmb(ib_state->pd_uxdma[uxdma_i], false, false, false);
+    ionic_dv_pd_set_udma_mask(ib_state->pd_uxdma[uxdma_i], 1u << uxdma_i);
+  }
+#endif
+
+  int err = ibv_query_port(ib_state->context, port, &ib_state->portinfo);
+  CHECK_ZERO(err, "ibv_query_port");
+  dump_ibv_port_attr(&ib_state->portinfo);
+
+  /* Must init after querying port */
+  init_gid_index(port);
+
+#ifdef GDA_IONIC
+  ionic_dv_ctx dvctx;
+  ionic_dv_get_ctx(&dvctx, ib_state->context);
+
+  int hip_dev_id = 0;
+  CHECK_HIP(hipGetDevice(&hip_dev_id));
+
+  void* gpu_db_page = nullptr;
+  rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id);
+
+  uint64_t *db_page_u64 = reinterpret_cast<uint64_t*>(dvctx.db_page);
+  uint64_t *gpu_db_page_u64 = reinterpret_cast<uint64_t*>(gpu_db_page);
+
+  // Carry the doorbell's offset within the host page over to the GPU mapping.
+  uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64];
+
+  ib_state->gpu_db_page = gpu_db_page;
+  ib_state->gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype];
+  ib_state->gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype];
+#endif
+}
+
+/**
+ * @brief Apply a prepared attribute/mask pair to a queue pair.
+ *
+ * @tparam StateType Type exposing `exp_qp_attr` and `exp_attr_mask`.
+ * @param[in] qp    Queue pair to modify.
+ * @param[in] state Attributes and mask passed to ibv_modify_qp().
+ */
+template <typename StateType>
+void GDABackend::try_to_modify_qp(ibv_qp* qp, StateType state) {
+  const int modify_status = ibv_modify_qp(qp, &state.exp_qp_attr, state.exp_attr_mask);
+  CHECK_ZERO(modify_status, "ibv_modify_qp");
+}
+
+/** @brief Apply the attributes built by initqp() for @p port to @p qp. */
+void GDABackend::init_qp_status(ibv_qp* qp, uint8_t port) {
+  InitQPState init_state = initqp(port);
+  try_to_modify_qp<InitQPState>(qp, init_state);
+}
+
+/** @brief Apply the attributes built by rtr() for peer @p dest to @p qp. */
+void GDABackend::change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port) {
+  RtrState rtr_state = rtr(dest, port);
+  try_to_modify_qp<RtrState>(qp, rtr_state);
+}
+
+/** @brief Apply the attributes built by rts() for peer @p dest to @p qp. */
+void GDABackend::change_status_rts(ibv_qp* qp, dest_info_t* dest) {
+  RtsState rts_state = rts(dest);
+  try_to_modify_qp<RtsState>(qp, rts_state);
+}
+
+/**
+ * @brief Create one CQ/QP pair per (context slot, PE) connection and record
+ * the local connection info to be exchanged with peers.
+ *
+ * @param[in] port        Port passed to init_qp_status() for each QP.
+ * @param[in] ib_port_att Port attributes; only `lid` is read here.
+ */
+void GDABackend::create_qps(uint8_t port, ibv_port_attr* ib_port_att) {
+  ibv_qp_cap cap{};
+  cap.max_send_wr = sq_size;
+  cap.max_send_sge = 1;
+  cap.max_inline_data = 0;
+#ifdef GDA_IONIC
+  // TODO allow zero sges in the driver
+  cap.max_recv_sge = 1;
+#endif
+  QPInitAttr init_attr{qpattr(cap)};
+
+  cqs.resize((maximum_num_contexts_ + 1) * num_pes);
+  qps.resize((maximum_num_contexts_ + 1) * num_pes);
+
+  const int send_depth = init_attr.attr.cap.max_send_wr;
+  for (int idx = 0; idx < qps.size(); idx++) {
+    // Pick the protection domain and CQ depth for this connection.
+#ifdef GDA_IONIC
+    // Connections alternate between the two UDMA parent domains, and the CQ
+    // is given twice the send-queue depth.
+    ibv_pd* conn_pd = ib_state->pd_uxdma[((idx + 1) / 2) & 1];
+    const int conn_cqe = send_depth << 1;
+#else
+    ibv_pd* conn_pd = ib_state->pd_parent;
+    const int conn_cqe = send_depth;
+#endif
+    cqs[idx] = create_cq(ib_state->context, conn_pd, conn_cqe);
+    CHECK_NNULL(cqs[idx], "create_cq");
+    qps[idx] = create_qp(conn_pd, ib_state->context, &init_attr.attr, cqs[idx]);
+    CHECK_NNULL(qps[idx], "create_qp");
+    init_qp_status(qps[idx], port);
+
+    // Local endpoint description that setup_ibv() exchanges with the peers.
+    auto& local = dest_info[idx];
+    local.lid = ib_port_att->lid;
+    local.qpn = qps[idx]->qp_num;
+    local.psn = 0;
+    local.gid = gid;
+  }
+}
+
+/**
+ * @brief Allocator callback installed on the parent domain.
+ *
+ * Returns @p size zeroed bytes: uncached device memory with GDA_IONIC,
+ * HIP host memory otherwise. @p pd, @p pd_context, @p alignment and
+ * @p resource_type are not used.
+ */
+void* GDABackend::pd_alloc(struct ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type) {
+  void* buffer = nullptr;
+  //TODO make this configurable, presumably we want it on device for all types?
+#ifdef GDA_IONIC
+  CHECK_HIP(hipExtMallocWithFlags(&buffer, size, hipDeviceMallocUncached));
+#else
+  CHECK_HIP(hipHostMalloc(&buffer, size, hipHostMallocDefault));
+#endif
+  memset(buffer, 0, size);
+  return buffer;
+}
+
+void GDABackend::pd_release(struct ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type) {
+  CHECK_HIP(hipFree(ptr));
+}
+
+/**
+ * @brief Fill @p attr1 so a parent domain derived from ib_state->pd_orig
+ * uses pd_alloc()/pd_release() as its allocators.
+ *
+ * Requires ib_state->pd_orig to be set already.
+ */
+void GDABackend::init_parent_domain_attr(ibv_parent_domain_init_attr* attr1) {
+  // Base protection domain; no thread domain and no allocator context.
+  attr1->pd = ib_state->pd_orig;
+  attr1->td = nullptr;
+  attr1->pd_context = nullptr;
+
+  // Custom allocators; comp_mask tells verbs that alloc/free are populated.
+  attr1->alloc = GDABackend::pd_alloc;
+  attr1->free = GDABackend::pd_release;
+  attr1->comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS;
+}
+
+/**
+ * @brief Create a completion queue with @p cqe entries whose resources are
+ * associated with parent domain @p pd.
+ *
+ * @return The plain ibv_cq view of the extended CQ.
+ */
+ibv_cq* GDABackend::create_cq(ibv_context* context, ibv_pd* pd, int cqe) {
+  // Start from an all-zero attribute block, then set the fields in use.
+  ibv_cq_init_attr_ex init_attr{};
+  init_attr.cqe = cqe;
+  init_attr.cq_context = nullptr;
+  init_attr.channel = nullptr;
+  init_attr.comp_vector = 0;
+  init_attr.flags = 0;  // see ibv_exp_cq_create_flags
+  init_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_PD;
+  init_attr.parent_domain = pd;
+
+  ibv_cq_ex* extended_cq = ibv_create_cq_ex(context, &init_attr);
+  CHECK_NNULL(extended_cq, "ibv_create_cq_ex");
+
+  ibv_cq* plain_cq = ibv_cq_ex_to_cq(extended_cq);
+  CHECK_NNULL(plain_cq, "ibv_cq_ex_to_cq");
+  return plain_cq;
+}
+
+/**
+ * @brief Populate a QueuePair with the provider-specific queue layout of
+ * connection @p conn_num so the queues can be driven without the host stack.
+ *
+ * Reads the direct-verbs view of cqs[conn_num] and qps[conn_num] (ionic with
+ * GDA_IONIC, mlx5 otherwise) and stores buffer pointers, sizes, doorbell
+ * locations, the QP number and the local/remote heap keys in @p gpu_qp.
+ * The remote key is the one of PE `conn_num % num_pes`.
+ *
+ * @param[out] gpu_qp   QueuePair to fill in.
+ * @param[in]  conn_num Index into qps/cqs.
+ */
+void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
+  int hip_dev_id{-1};
+  CHECK_HIP(hipGetDevice(&hip_dev_id));
+
+#ifdef GDA_IONIC
+  uint8_t udma_idx = ionic_dv_qp_get_udma_idx(qps[conn_num]);
+
+  ionic_dv_cq dvcq;
+  ionic_dv_get_cq(&dvcq, cqs[conn_num], udma_idx);
+
+  gpu_qp->cq_dbreg = ib_state->gpu_db_cq;
+  gpu_qp->cq_dbval = dvcq.q.db_val;
+  gpu_qp->cq_mask = dvcq.q.mask;
+
+  gpu_qp->cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
+
+  ionic_dv_qp dvqp;
+  ionic_dv_get_qp(&dvqp, qps[conn_num]);
+
+  gpu_qp->sq_dbreg = ib_state->gpu_db_sq;
+  gpu_qp->sq_dbval = dvqp.sq.db_val;
+  gpu_qp->sq_mask = dvqp.sq.mask;
+  gpu_qp->sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
+
+  gpu_qp->qp_num = qps[conn_num]->qp_num;
+  gpu_qp->lkey = heap_mr->lkey;
+  gpu_qp->rkey = heap_rkey[conn_num % num_pes];
+  gpu_qp->inline_threshold = 32;
+#else // !GDA_IONIC
+  mlx5dv_cq cq_out{};
+  mlx5dv_obj mlx_obj{};
+  mlx_obj.cq.in = cqs[conn_num];
+  mlx_obj.cq.out = &cq_out;
+  // A failed query would leave cq_out unusable, so check the result.
+  int err = mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
+  CHECK_ZERO(err, "mlx5dv_init_obj (cq)");
+  dump_mlx5dv_cq(&cq_out, conn_num);
+
+  /*
+   * struct mlx5dv_cq {
+   *   void                    *buf;
+   *   __be32                  *dbrec;
+   *   uint32_t                cqe_cnt;
+   *   uint32_t                cqe_size;
+   *   void                    *cq_uar;
+   *   uint32_t                cqn;
+   *   uint64_t                comp_mask;
+   * };
+  */
+
+  gpu_qp->cq_buf = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
+  gpu_qp->cq_cnt = cq_out.cqe_cnt;
+  gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt);
+  gpu_qp->cq_dbrec = cq_out.dbrec;
+
+  mlx5dv_qp qp_out{};
+  mlx_obj.qp.in = qps[conn_num];
+  mlx_obj.qp.out = &qp_out;
+  err = mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP);
+  CHECK_ZERO(err, "mlx5dv_init_obj (qp)");
+  dump_mlx5dv_qp(&qp_out, conn_num);
+
+  /*
+   * struct mlx5dv_qp {
+   *   __be32 *dbrec;
+   *   struct {
+   *     void *buf;
+   *     uint32_t wqe_cnt;
+   *     uint32_t stride;
+   *   } sq;
+   *   struct {
+   *     void *buf;
+   *     uint32_t wqe_cnt;
+   *     uint32_t stride;
+   *   } rq;
+   *   struct {
+   *     void *reg;
+   *     uint32_t size;
+   *   } bf;
+   *   uint64_t comp_mask;
+   *   off_t uar_mmap_offset;
+   *   uint32_t tirn;
+   *   uint32_t tisn;
+   *   uint32_t rqn;
+   *   uint32_t sqn;
+   *   uint64_t tir_icm_addr;
+   * };
+   */
+
+  // dbrec holds two doorbell records: [0] -> MLX5_RCV_DBR, [1] -> MLX5_SND_DBR.
+  gpu_qp->dbrec = &qp_out.dbrec[1];
+  gpu_qp->sq_buf = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
+  gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt;
+  // Keys are stored big-endian on this path.
+  gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
+  gpu_qp->lkey = htobe32(heap_mr->lkey);
+  gpu_qp->qp_num = qps[conn_num]->qp_num;
+  // The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
+  void* gpu_ptr{nullptr};
+  rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
+  gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
+#endif // !GDA_IONIC
+}
+
+/**
+ * @brief Create a queue pair in protection domain @p pd that uses @p cq for
+ * both send and receive completions.
+ *
+ * @note @p qp_attr is modified in place (send_cq, recv_cq, pd, comp_mask).
+ */
+ibv_qp* GDABackend::create_qp(ibv_pd* pd, ibv_context* context, ibv_qp_init_attr_ex* qp_attr, ibv_cq* cq) {
+  assert(pd);
+  assert(context);
+  assert(qp_attr);
+
+  // comp_mask marks the pd field as valid for the extended create call.
+  qp_attr->pd = pd;
+  qp_attr->comp_mask = IBV_QP_INIT_ATTR_PD;
+  qp_attr->send_cq = cq;
+  qp_attr->recv_cq = cq;
+
+  ibv_qp* created = ibv_create_qp_ex(context, qp_attr);
+  CHECK_NNULL(created, "ibv_create_qp_ex");
+  return created;
+}
+
+/**
+ * @brief Build the attribute set for the INIT transition on @p port.
+ *
+ * Extends the InitQPState defaults with the port number and the access
+ * flags (local write plus remote write/read/atomic).
+ */
+GDABackend::InitQPState GDABackend::initqp(uint8_t port) {
+  InitQPState state{};
+  auto& attr = state.exp_qp_attr;
+  attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
+  attr.port_num = port;
+  state.exp_attr_mask |= IBV_QP_ACCESS_FLAGS;
+  return state;
+}
+
+/**
+ * @brief Build the attribute set for the RTR transition towards @p dest.
+ *
+ * On an InfiniBand link layer the peer is addressed by LID; otherwise a
+ * global route using the peer GID and the local gid_index is configured.
+ */
+GDABackend::RtrState GDABackend::rtr(dest_info_t* dest, uint8_t port) {
+  RtrState state{};
+  auto& attr = state.exp_qp_attr;
+  auto& ah = attr.ah_attr;
+
+  attr.dest_qp_num = dest->qpn;
+  attr.rq_psn = dest->psn;
+  ah.port_num = port;
+  attr.path_mtu = ib_state->portinfo.active_mtu;
+
+  const bool is_infiniband = (ib_state->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND);
+  if (!is_infiniband) {
+    ah.is_global = 1;
+    ah.grh.dgid = dest->gid;
+    ah.grh.sgid_index = gid_index;
+    ah.grh.hop_limit = 1;
+  } else {
+    ah.dlid = dest->lid;
+  }
+
+  state.exp_attr_mask |= IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
+  return state;
+}
+
+/**
+ * @brief Build the attribute set for the RTS transition; the send PSN is
+ * taken from @p dest.
+ */
+GDABackend::RtsState GDABackend::rts(dest_info_t* dest) {
+  RtsState state{};
+  state.exp_attr_mask |= IBV_QP_SQ_PSN;
+  state.exp_qp_attr.sq_psn = dest->psn;
+  return state;
+}
+
+/**
+ * @brief Build QP creation attributes for a reliable-connected QP with the
+ * given capabilities.
+ */
+GDABackend::QPInitAttr GDABackend::qpattr(ibv_qp_cap cap) {
+  QPInitAttr init_attr(cap);
+  init_attr.attr.qp_type = IBV_QPT_RC;
+  return init_attr;
+}
+#endif
+
+/**
+ * @brief Choose the GID (and its table index) used for RoCE addressing on
+ * @p port_num and store them in `gid_index` / `gid`.
+ *
+ * Walks the device GID table, ignores entries of other ports and link-local
+ * GIDs (prefix FE80), and prefers the entry with the highest GID type
+ * (RoCEv2 over RoCEv1). Scanning stops at the first IB-type entry because
+ * GIDs are not used in IB mode. When nothing is selected, `gid_index` is -1
+ * and `gid` is all zeros. If the table cannot be read, a warning is printed
+ * and both members are left untouched.
+ *
+ * @param[in] port_num Port whose GIDs are considered. ib_state->portinfo
+ *                     must already be populated (gid_tbl_len is read).
+ */
+void GDABackend::init_gid_index(uint8_t port_num) {
+  const uint8_t local_gid_prefix[2] = {0xFE, 0x80};
+  // Zero-initialized so `gid` is well defined even when no entry qualifies.
+  union ibv_gid selected_gid{};
+  uint32_t selected_gid_type = IBV_GID_TYPE_ROCE_V1;
+  int selected_gid_index     = -1;
+
+  const int gid_tbl_len   = ib_state->portinfo.gid_tbl_len;
+  struct ibv_context *ctx = ib_state->context;
+
+  struct ibv_gid_entry *gid_entries =
+      static_cast<struct ibv_gid_entry*>(calloc(gid_tbl_len, sizeof(struct ibv_gid_entry)));
+  if (gid_entries == nullptr) {
+    fprintf(stderr, "[Warning] GID table allocation failed. No available GIDs\n");
+    return;
+  }
+
+  const ssize_t gid_tbl_entries = ibv_query_gid_table(ctx, gid_entries, gid_tbl_len, 0);
+  if (gid_tbl_entries < 0) {
+    fprintf(stderr, "[Warning] ibv_query_gid_table failed. No available GIDs\n");
+    free(gid_entries);
+    return;
+  }
+
+  for (ssize_t i = 0; i < gid_tbl_entries; i++) {
+    const struct ibv_gid_entry &entry = gid_entries[i];
+
+    // The table only lists valid entries and covers every port of the
+    // device, so the array position is not the GID index: filter by port and
+    // use the index recorded in the entry.
+    if (entry.port_num != port_num) {
+      continue;
+    }
+
+    /* rocSHMEM does not use GIDs for IB mode */
+    if (entry.gid_type == IBV_GID_TYPE_IB) {
+      break;
+    }
+
+    union ibv_gid current_gid = entry.gid;
+    const int err = ibv_query_gid(ctx, port_num, entry.gid_index, &current_gid);
+    CHECK_ZERO(err, "ibv_query_gid");
+
+    /* We don't want local GIDs */
+    if (memcmp(current_gid.raw, local_gid_prefix, sizeof(local_gid_prefix)) == 0) {
+      continue;
+    }
+
+    /* Take the first usable GID, then prefer RoCEv2 over RoCEv1 */
+    if (selected_gid_index == -1 || entry.gid_type > selected_gid_type) {
+      selected_gid_index = static_cast<int>(entry.gid_index);
+      selected_gid_type  = entry.gid_type;
+      selected_gid       = current_gid;
+    }
+  }
+
+  gid_index = selected_gid_index;
+  gid       = selected_gid;
+
+  free(gid_entries);
+}
+
+/** Print selected fields of an ibv_context through DPRINTF. */
+static void dump_ibv_context(struct ibv_context* x) {
+  /*
+   * struct ibv_context {
+   *   struct ibv_device      *device;
+   *   struct ibv_context_ops  ops;
+   *   int                     cmd_fd;
+   *   int                     async_fd;
+   *   int                     num_comp_vectors;
+   *   pthread_mutex_t         mutex;
+   *   void                   *abi_compat;
+   * };
+   */
+  DPRINTF("\n"
+         "===============================================\n"
+         "                IBV_CONTEXT\n"
+         "===============================================\n"
+         "  (ibv_device*)        device              = %p\n"
+         "  (int)                cmd_fd              = %d\n"
+         "  (int)                async_fd            = %d\n"
+         "  (int)                num_comp_vectors    = %d\n"
+         "  (void*)              abi_compat          = %p\n",
+         x->device, x->cmd_fd, x->async_fd, x->num_comp_vectors, x->abi_compat);
+}
+
+/** Print the type and name/path fields of an ibv_device through DPRINTF. */
+static void dump_ibv_device(struct ibv_device* x) {
+  /*
+   * struct ibv_device {
+   *   struct _ibv_device_ops  _ops;
+   *   enum ibv_node_type node_type;
+   *   enum ibv_transport_type transport_type;
+   *   char name[IBV_SYSFS_NAME_MAX];
+   *   char dev_name[IBV_SYSFS_NAME_MAX];
+   *   char dev_path[IBV_SYSFS_PATH_MAX];
+   *   char ibdev_path[IBV_SYSFS_PATH_MAX];
+   * };
+   */
+  DPRINTF("\n"
+         "===============================================\n"
+         "               IBV_DEVICE\n"
+         "===============================================\n"
+         "  (enum ibv_node_type)      node_type      = %d\n"
+         "  (enum ibv_transport_type) transport_type = %d\n"
+         "  (char[])                  name           = %s\n"
+         "  (char[])                  dev_name       = %s\n"
+         "  (char[])                  dev_path       = %s\n"
+         "  (char[])                  ibdev_path     = %s\n",
+         x->node_type, x->transport_type, x->name, x->dev_name, x->dev_path, x->ibdev_path);
+}
+
+/** Print the context pointer and handle of a protection domain through DPRINTF. */
+static void dump_ibv_pd(struct ibv_pd* x) {
+  /*
+   * struct ibv_pd {
+   *   struct ibv_context     *context;
+   *   uint32_t                handle;
+   * };
+   */
+  DPRINTF("\n"
+         "===============================================\n"
+         "               IBV_PD\n"
+         "===============================================\n"
+         "  (ibv_context*) context = %p\n"
+         "  (uint32_t)     handle  = 0x%x\n",
+         x->context, x->handle);
+}
+
+/**
+ * Print every field of an ibv_port_attr through DPRINTF.
+ * NOTE(review): gid_tbl_len is an int but is printed with %u.
+ */
+static void dump_ibv_port_attr(struct ibv_port_attr* x) {
+  /*
+   * struct ibv_port_attr {
+   *   enum ibv_port_state     state;
+   *   enum ibv_mtu            max_mtu;
+   *   enum ibv_mtu            active_mtu;
+   *   int                     gid_tbl_len;
+   *   uint32_t                port_cap_flags;
+   *   uint32_t                max_msg_sz;
+   *   uint32_t                bad_pkey_cntr;
+   *   uint32_t                qkey_viol_cntr;
+   *   uint16_t                pkey_tbl_len;
+   *   uint16_t                lid;
+   *   uint16_t                sm_lid;
+   *   uint8_t                 lmc;
+   *   uint8_t                 max_vl_num;
+   *   uint8_t                 sm_sl;
+   *   uint8_t                 subnet_timeout;
+   *   uint8_t                 init_type_reply;
+   *   uint8_t                 active_width;
+   *   uint8_t                 active_speed;
+   *   uint8_t                 phys_state;
+   *   uint8_t                 link_layer;
+   *   uint8_t                 flags;
+   *   uint16_t                port_cap_flags2;
+   * };
+   */
+  DPRINTF("\n"
+         "===============================================\n"
+         "               IBV_PORT_ATTR\n"
+         "===============================================\n"
+         "  (enum ibv_port_state) state           = %u\n"
+         "  (enum ibv_mtu)        max_mtu         = %u\n"
+         "  (enum ibv_mtu)        active_mtu      = %u\n"
+         "  (int)                 gid_tbl_len     = %u\n"
+         "  (uint32_t)            port_cap_flags  = 0x%x\n"
+         "  (uint32_t)            max_msg_sz      = %u\n"
+         "  (uint32_t)            bad_pkey_cntr   = %u\n"
+         "  (uint32_t)            qkey_viol_cntr  = %u\n"
+         "  (uint16_t)            pkey_tbl_len    = %u\n"
+         "  (uint16_t)            lid             = 0x%x\n"
+         "  (uint16_t)            sm_lid          = 0x%x\n"
+         "  (uint8_t)             lmc             = 0x%x\n"
+         "  (uint8_t)             max_vl_num      = 0x%x\n"
+         "  (uint8_t)             sm_sl           = 0x%x\n"
+         "  (uint8_t)             subnet_timeout  = 0x%x\n"
+         "  (uint8_t)             init_type_reply = 0x%x\n"
+         "  (uint8_t)             active_width    = 0x%x\n"
+         "  (uint8_t)             active_speed    = 0x%x\n"
+         "  (uint8_t)             phys_state      = 0x%x\n"
+         "  (uint8_t)             link_layer      = 0x%x\n"
+         "  (uint8_t)             flags           = 0x%x\n"
+         "  (uint16_t)            port_cap_flags2 = 0x%x\n",
+         x->state, x->max_mtu, x->active_mtu, x->gid_tbl_len, x->port_cap_flags, x->max_msg_sz,
+         x->bad_pkey_cntr, x->qkey_viol_cntr, x->pkey_tbl_len, x->lid, x->sm_lid, x->lmc, x->max_vl_num,
+         x->sm_sl, x->subnet_timeout, x->init_type_reply, x->active_width, x->active_speed, x->phys_state,
+         x->link_layer, x->flags, x->port_cap_flags2);
+}
+
+/**
+ * Print the fields of an ibv_qp through DPRINTF, tagged with the connection
+ * index. The forward declaration earlier in this file marks it `static`.
+ */
+void dump_ibv_qp(struct ibv_qp *qp, int conn_num) {
+  /*
+   * struct ibv_qp {
+   *   struct ibv_context     *context;
+   *   void                   *qp_context;
+   *   struct ibv_pd          *pd;
+   *   struct ibv_cq          *send_cq;
+   *   struct ibv_cq          *recv_cq;
+   *   struct ibv_srq         *srq;
+   *   uint32_t                handle;
+   *   uint32_t                qp_num;
+   *   enum ibv_qp_state       state;
+   *   enum ibv_qp_type        qp_type;
+   *   pthread_mutex_t         mutex;
+   *   pthread_cond_t          cond;
+   *   uint32_t                events_completed;
+   * };
+   */
+  DPRINTF("\n");
+  DPRINTF("============== QP_DUMP CONNECTION#%d ==========\n", conn_num);
+  DPRINTF("  (ibv_context*)      context          = %p\n",   qp->context);
+  DPRINTF("  (void*)             qp_context       = %p\n",   qp->qp_context);
+  DPRINTF("  (ibv_pd*)           pd               = %p\n",   qp->pd);
+  DPRINTF("  (ibv_cq*)           send_cq          = %p\n",   qp->send_cq);
+  DPRINTF("  (ibv_cq*)           recv_cq          = %p\n",   qp->recv_cq);
+  DPRINTF("  (ibv_srq*)          srq              = %p\n",   qp->srq);
+  DPRINTF("  (uint32_t)          handle           = 0x%x\n", qp->handle);
+  DPRINTF("  (uint32_t)          qp_num           = 0x%x\n", qp->qp_num);
+  DPRINTF("  (enum ibv_qp_state) state            = %u\n",   qp->state);
+  DPRINTF("  (enum_ibv_qp_type)  qp_type          = %u\n",   qp->qp_type);
+  DPRINTF("  (uint32_t)          events_completed = %u\n",   qp->events_completed);
+  DPRINTF("=========== QP_DUMP_END CONNECTION#%d  ========\n", conn_num);
+}
+
+#if !defined(GDA_IONIC) && !defined(GDA_BNXT)
+/**
+ * Print the fields of an mlx5dv_qp through DPRINTF, tagged with the
+ * connection index.
+ */
+void dump_mlx5dv_qp(struct mlx5dv_qp *qp_dv, int conn_num) {
+  DPRINTF("\n");
+  DPRINTF("===============================================\n");
+  DPRINTF("     INITIALIZED MLXDV_QP FOR CONNECTION#%d\n", conn_num);
+  DPRINTF("===============================================\n");
+  DPRINTF("=================== QP_DUMP ===================\n");
+  DPRINTF("  (__be32*)  dbrec           = %p\n",     qp_dv->dbrec);
+  DPRINTF("  (void*)    sq.buf          = %p\n",     qp_dv->sq.buf);
+  DPRINTF("  (uint32_t) sq.wqe_cnt      = %u\n",     qp_dv->sq.wqe_cnt);
+  DPRINTF("  (uint32_t) sq.stride       = %u\n",     qp_dv->sq.stride);
+  DPRINTF("  (void*)    rq.buf          = %p\n",     qp_dv->rq.buf);
+  DPRINTF("  (uint32_t) rq.wqe_cnt      = %u\n",     qp_dv->rq.wqe_cnt);
+  DPRINTF("  (uint32_t) rq.stride       = %u\n",     qp_dv->rq.stride);
+  DPRINTF("  (void*)    bf.reg          = %p\n",     qp_dv->bf.reg);
+  DPRINTF("  (uint32_t) bf.size         = 0x%x\n",   qp_dv->bf.size);
+  DPRINTF("  (uint64_t) comp_mask       = 0x%lx\n",  qp_dv->comp_mask);
+  DPRINTF("  (off_t)    uar_mmap_offset = 0x%lx\n",  qp_dv->uar_mmap_offset);
+  DPRINTF("  (uint32_t) tirn            = 0x%x\n",   qp_dv->tirn);
+  DPRINTF("  (uint32_t) tisn            = 0x%x\n",   qp_dv->tisn);
+  DPRINTF("  (uint32_t) rqn             = 0x%x\n",   qp_dv->rqn);
+  DPRINTF("  (uint32_t) sqn             = 0x%x\n",   qp_dv->sqn);
+  DPRINTF("  (uint64_t) tir_icm_addr    = 0x%lx\n",  qp_dv->tir_icm_addr);
+  DPRINTF("================== QP_DUMP_END ================\n");
+}
+
+/**
+ * Print the fields of an mlx5dv_cq through DPRINTF, tagged with the
+ * connection index.
+ */
+void dump_mlx5dv_cq(struct mlx5dv_cq *cq_dv, int conn_num) {
+  DPRINTF("\n");
+  DPRINTF("===============================================\n");
+  DPRINTF("     INITIALIZED MLX5DV_CQ FOR CONNECTION#%d\n", conn_num);
+  DPRINTF("===============================================\n");
+  DPRINTF("=================== CQ_DUMP ===================\n");
+  DPRINTF("  (void*)    buf             = %p\n",     cq_dv->buf);
+  DPRINTF("  (__be32*)  dbrec           = %p\n",     cq_dv->dbrec);
+  DPRINTF("  (uint32_t) cqe_cnt         = %u\n",     cq_dv->cqe_cnt);
+  DPRINTF("  (uint32_t) cqe_size        = %u\n",     cq_dv->cqe_size);
+  DPRINTF("  (void*)    cq_uar          = %p\n",     cq_dv->cq_uar);
+  DPRINTF("  (uint32_t) cqn             = 0x%x\n",   cq_dv->cqn);
+  DPRINTF("  (uint64_t) comp_mask       = 0x%lx\n",  cq_dv->comp_mask);
+  DPRINTF("================== CQ_DUMP_END ================\n");
+}
+#endif // !GDA_IONIC && !GDA_BNXT
+
+}  // namespace rocshmem
diff --git a/src/gda/backend_gda.hpp b/src/gda/backend_gda.hpp
new file mode 100644
index 0000000000..becdcfdb46
--- /dev/null
+++ b/src/gda/backend_gda.hpp
@@ -0,0 +1,485 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_BACKEND_HPP_
+#define LIBRARY_SRC_GDA_BACKEND_HPP_
+
+#include "backend_bc.hpp"
+#include "containers/free_list_impl.hpp"
+#include "hdp_proxy.hpp" //TODO useless?
+#include "memory/hip_allocator.hpp"
+#include "context_incl.hpp"
+#include "gda_context_proxy.hpp"
+#include "queue_pair.hpp"
+#include "bootstrap/bootstrap.hpp"
+
+namespace rocshmem {
+
+class GDAContext;
+class GDAHostContext;
+class QueuePair;
+class HostInterface;
+
+class GDABackend : public Backend {
+ private:
+  /**
+   * @brief Host-side verbs objects for the RDMA device selected by the
+   * backend. Which members exist depends on GDA_BNXT / GDA_IONIC.
+   */
+  typedef struct ib_state {
+    struct ibv_context* context;     // opened device context
+    struct ibv_pd* pd_orig;          // base protection domain
+#ifndef GDA_BNXT
+    struct ibv_pd* pd_parent;        // parent domain derived from pd_orig
+#endif
+#ifdef GDA_IONIC
+    struct ibv_pd* pd_uxdma[2];      // one parent domain per UDMA engine
+#endif
+    struct ibv_mr* mr;               // NOTE(review): purpose not visible here -- confirm whether it is used
+    struct ibv_port_attr portinfo;   // attributes of the queried port
+
+#ifdef GDA_IONIC
+    void *gpu_db_page;               // GPU mapping of the doorbell page
+    uint64_t *gpu_db_cq;             // CQ doorbell inside gpu_db_page
+    uint64_t *gpu_db_sq;             // SQ doorbell inside gpu_db_page
+#endif
+  } ib_state_t;
+
+  /**
+   * @brief Connection information for one queue pair, exchanged between PEs
+   * as raw bytes during setup.
+   */
+  typedef struct dest_info {
+    int lid;             // port LID
+    int qpn;             // queue pair number
+    int psn;             // packet sequence number
+    union ibv_gid gid;   // GID used when the link layer is not InfiniBand
+  } dest_info_t;
+
+#ifndef GDA_BNXT
+  /**
+   * @brief Attribute/mask pair handed to ibv_modify_qp(); both members start
+   * zero-initialized.
+   */
+  class State {
+   public:
+    ibv_qp_attr exp_qp_attr{};
+    uint64_t exp_attr_mask{};
+  };
+
+  /**
+   * @brief Defaults for the INIT transition: state INIT, full local/remote
+   * access flags, and a mask covering state, pkey index and port. The pkey
+   * index keeps its zero-initialized value.
+   */
+  class InitQPState : public State {
+   public:
+    InitQPState() {
+      exp_qp_attr.qp_state = IBV_QPS_INIT;
+      exp_qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
+      exp_attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
+    }
+  };
+
+  /**
+   * @brief Defaults for the RTR transition: state RTR, service level 1,
+   * max_dest_rd_atomic of GDA_MAX_ATOMIC, min RNR timer 12, and a mask
+   * covering state, address vector and path MTU.
+   */
+  class RtrState : public State {
+   public:
+    RtrState() {
+      exp_qp_attr.qp_state = IBV_QPS_RTR;
+      exp_qp_attr.ah_attr.sl = 1;
+      exp_qp_attr.max_dest_rd_atomic = GDA_MAX_ATOMIC;
+      exp_qp_attr.min_rnr_timer = 12;
+      exp_attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU;
+    }
+  };
+
+  /**
+   * @brief Defaults for the RTS transition: state RTS, timeout 14, retry
+   * count 7, RNR retry 7, max_rd_atomic of GDA_MAX_ATOMIC, and a mask
+   * covering exactly those fields.
+   */
+  class RtsState : public State {
+   public:
+    RtsState() {
+      exp_qp_attr.qp_state = IBV_QPS_RTS;
+      exp_qp_attr.timeout = 14;
+      exp_qp_attr.retry_cnt = 7;
+      exp_qp_attr.rnr_retry = 7;
+      exp_qp_attr.max_rd_atomic = GDA_MAX_ATOMIC;
+      exp_attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC;
+    }
+  };
+
+  /**
+   * @brief Wrapper around ibv_qp_init_attr_ex that starts zero-initialized,
+   * takes the given capabilities and sets sq_sig_all to 0.
+   */
+  class QPInitAttr {
+   public:
+    explicit QPInitAttr(ibv_qp_cap cap) {
+      attr.cap = cap;
+      attr.sq_sig_all = 0;
+    }
+    ibv_qp_init_attr_ex attr{};
+  };
+#endif
+
+  /**
+   * @brief Common code invoked from the different constructors
+   */
+  void read_env();
+  void setup_ibv();
+  void cleanup_ibv();
+
+ public:
+  friend GDAContext;
+
+  /**
+   * @copydoc Backend::Backend(unsigned)
+   */
+  explicit GDABackend(MPI_Comm comm);
+  explicit GDABackend(TcpBootstrap *bootstr);
+
+  /**
+   * @copydoc Backend::~Backend()
+   */
+  virtual ~GDABackend();
+
+  __device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
+
+  /**
+   * @brief Destroy a `rocshmem_ctx_t` context and returns it back to the
+   * context free list.
+   */
+  __device__ void destroy_ctx(rocshmem_ctx_t *ctx);
+
+  /**
+   * @copydoc Backend::ctx_create
+   */
+  void ctx_create(int64_t options, void **ctx) override;
+
+  /**
+   * @copydoc Backend::ctx_destroy
+   */
+  void ctx_destroy(Context *ctx) override;
+
+  /**
+   * @brief Abort the application.
+   *
+   * @param[in] status Exit code.
+   *
+   * @return void.
+   *
+   * @note This routine terminates the entire application.
+   */
+  void global_exit(int status) override;
+
+  /**
+   * @copydoc Backend::create_new_team
+   */
+  void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
+                       TeamInfo *team_info_wrt_world, int num_pes,
+                       int my_pe_in_new_team, MPI_Comm team_comm,
+                       rocshmem_team_t *new_team) override;
+
+  /**
+   * @copydoc Backend::team_destroy(rocshmem_team_t)
+   */
+  void team_destroy(rocshmem_team_t team) override;
+
+  /**
+   * @brief Accessor for work/sync bases
+   *
+   * @return Pointer to the array of work/sync base addresses
+   * (wrk_sync_pool_bases_)
+   */
+  char** get_wrk_sync_bases() { return wrk_sync_pool_bases_; } //TODO UNUSED
+
+  /**
+   * @brief The host-facing interface that will be used
+   * by all contexts of the GDABackend
+   */
+  std::shared_ptr<HostInterface> host_interface{nullptr};
+
+  /**
+   * @brief Scratchpad for the internal barrier algorithms.
+   */
+  int64_t *barrier_sync{nullptr};
+
+  /**
+   * @brief Handle for raw memory for barrier sync
+   */
+  long *barrier_pSync_pool{nullptr};
+
+  /**
+   * @brief Handle for raw memory for reduce sync
+   */
+  long *reduce_pSync_pool{nullptr};
+
+  /**
+   * @brief Handle for raw memory for broadcast sync
+   */
+  long *bcast_pSync_pool{nullptr};
+
+  /**
+   * @brief Handle for raw memory for alltoall sync
+   */
+  long *alltoall_pSync_pool{nullptr};
+
+  /**
+   * @brief Handle for raw memory for work
+   */
+  void *pWrk_pool{nullptr};
+
+  /**
+   * @brief Handle for raw memory for alltoall
+   */
+  void *pAta_pool{nullptr};
+
+  /**
+   * @brief Handle for raw memory for fence/quiet
+  */
+  int *fence_pool{nullptr};
+
+ protected:
+   /**
+   * @copydoc Backend::dump_backend_stats()
+   */
+  void dump_backend_stats() override;
+
+  /**
+   * @copydoc Backend::reset_backend_stats()
+   */
+  void reset_backend_stats() override;
+
+  /**
+   * @brief Allocates uncacheable host memory for the hdp policy.
+   *
+   * @note Internal data ownership is managed by the proxy
+   */
+  HdpProxy<HIPHostAllocator> hdp_proxy_{};
+
+  /**
+   * @brief Holds a copy of the default context for host functions
+   */
+  std::unique_ptr<GDAHostContext> default_host_ctx{nullptr};
+
+  /**
+   * @brief Allocate and initialize team world.
+   */
+  void setup_team_world();
+
+  /**
+   * @brief Initialize the resources required to support teams
+   */
+  void setup_teams();
+
+  /**
+   * @brief Destruct the resources required to support teams
+   */
+  void cleanup_teams();
+
+  /**
+   * @brief Allocation and initialization of backend contexts.
+   */
+  void setup_ctxs();
+  void cleanup_ctxs();
+  void setup_host_ctx();
+  void setup_default_ctx();
+
+  /**
+   * @brief Allocate and initialize barrier operation addresses on
+   * symmetric heap.
+   *
+   * When this method completes, the barrier_sync member will be available
+   * for use.
+   */
+  void setup_collectives();
+
+  /**
+   * @brief Allocate buffer for fence/quiet operation
+   */
+  void setup_fence_buffer();
+
+  void setup_heap_memory_rkey();
+  void cleanup_heap_memory_rkey();
+
+  void initialize_gpu_qp(QueuePair* qp, int conn_num);
+
+#ifndef GDA_BNXT
+  InitQPState initqp(uint8_t port);
+
+  RtrState rtr(dest_info_t* dest, uint8_t port);
+
+  RtsState rts(dest_info_t* dest);
+
+  QPInitAttr qpattr(ibv_qp_cap cap);
+
+  void init_qp_status(ibv_qp* qp, uint8_t port);
+#endif
+
+  void change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port);
+
+  void change_status_rts(ibv_qp* qp, dest_info_t* dest);
+
+  void create_qps(uint8_t port, ibv_port_attr* ib_port_att);
+
+#ifdef GDA_BNXT
+  void init_qp_status(uint8_t port);
+
+  void create_cqs(int ncqs, int cqe);
+
+  void create_qps_impl(int nqps);
+
+  int ibv_mtu_to_int(enum ibv_mtu mtu);
+#else
+  template <typename T>
+  void try_to_modify_qp(ibv_qp* qp, T state);
+
+  static void* pd_alloc(ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type);
+
+  static void pd_release(ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type);
+
+  void init_parent_domain_attr(ibv_parent_domain_init_attr* attr);
+
+  ibv_cq* create_cq(ibv_context* context, ibv_pd* pd, int cqe);
+
+  ibv_qp* create_qp(ibv_pd* pd, ibv_context* context, ibv_qp_init_attr_ex* qp_attr, ibv_cq* rcq);
+#endif
+
+  void ib_init(ibv_device* ib_dev, uint8_t port);
+
+  void init_gid_index(uint8_t port);
+
+  void setup_gpu_qps();
+  void cleanup_gpu_qps();
+
+  char* requested_dev{nullptr};
+
+  ibv_device** dev_list{nullptr};
+
+  ib_state_t* ib_state{nullptr};
+
+  std::vector<dest_info_t> dest_info;
+
+ private:
+  /**
+   * @brief Common code invoked from the different constructors
+   */
+  void init();
+
+  /**
+   * @brief Proxy for the default context
+   *
+   * @note Internal data ownership is managed by the proxy
+   */
+  GDADefaultContextProxyT default_context_proxy_;  // init handled in constructor
+
+  /**
+   * @brief An array of @ref GDAContext objects that backs the context FreeList.
+   */
+  GDAContext *ctx_array{nullptr};
+
+  /**
+   * @brief A free-list containing contexts.
+   */
+  FreeListProxy<HIPAllocator, GDAContext *> ctx_free_list{};
+
+  /**
+   * @brief Holds maximum number of contexts used in library
+   */
+  size_t maximum_num_contexts_{32};
+
+  /**
+   * @brief The bitmask representing the availability of teams in the pool
+   */
+  char *team_pool_bitmask_{nullptr};
+
+  /**
+   * @brief Bitmask to store the reduced result of bitmasks on participating
+   * PEs
+   *
+   * With no thread-safety for this bitmask, multithreaded creation of teams is
+   * not supported.
+   */
+  char *team_reduced_bitmask_{nullptr};
+
+  /**
+   * @brief Size of the bitmask
+   */
+  int team_bitmask_size_{-1};
+
+  /**
+   * Fine grained memory allocator for buffers used in collectives Routines
+   */
+  HIPDefaultFinegrainedAllocator fine_grained_allocator_ {};
+
+  /**
+   * @brief Collective routines work/sync buffer size
+   */
+  size_t wrk_sync_pool_size_{};
+
+  /**
+   * @brief Collective routines work/sync buffer base ptr
+   */
+  char* const wrk_sync_pool_{nullptr};
+
+  /**
+   * @brief Temporary buffer pointer pointing to the same address as
+   * wrk_sync_pool_, used to calculate the starting addresses of
+   * different work and sync buffers.
+  */
+  char *wrk_sync_pool_top_{nullptr};
+
+  /**
+   * @brief Array containing the addresses of the work/sync buffer bases
+   * of other PEs
+  */
+  char** wrk_sync_pool_bases_{nullptr};//TODO UNUSED, maybe used again later when we decouple the sync from the main heap
+
+  /**
+   * @brief Initialize memory required for work/sync buffers and open GDA
+   * handle on PE's wrk_sync_pool.
+   */
+  void setup_wrk_sync_buffer();
+
+  /**
+   * @brief Close GDA memory handles for work/sync buffers and deallocate
+   * work/sync buffer.
+  */
+  void cleanup_wrk_sync_buffer();
+
+  /**
+   * @brief rte all-to-all
+   */
+  void Alltoall_char_inplace (char *inoutbuf, size_t num_bytes, rocshmem_team_t team);
+
+  /**
+   * @brief rte allreduce for teams
+   */
+  void Allreduce_char_BAND (char* inbuf, char *outbuf, size_t num_bytes, Team *team);
+
+  /**
+   * @brief rte barrier for initialization
+   */
+  void rte_barrier();
+
+  QueuePair *gpu_qps{nullptr};
+
+  std::vector<ibv_qp*> qps;
+
+  std::vector<ibv_cq*> cqs;
+
+  uint32_t sq_size{1024};
+
+  uint32_t *heap_rkey{nullptr};
+
+  ibv_mr *heap_mr{nullptr};
+
+  union ibv_gid gid;
+  int gid_index;
+
+#ifdef GDA_BNXT
+  std::vector<struct bnxt_host_qp> bnxt_qps;
+  std::vector<struct bnxt_host_cq> bnxt_cqs;
+
+  struct bnxt_re_dv_db_region_attr db_region_attr;
+#endif
+};
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_BACKEND_HPP_
diff --git a/src/gda/bnxt/CMakeLists.txt b/src/gda/bnxt/CMakeLists.txt
new file mode 100644
index 0000000000..6db1d7904f
--- /dev/null
+++ b/src/gda/bnxt/CMakeLists.txt
@@ -0,0 +1,29 @@
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+# Broadcom (bnxt) provider sources for the GDA conduit.
+target_sources(${PROJECT_NAME}
+  PRIVATE
+    backend_gda_bnxt.cpp
+    queue_pair_bnxt.cpp
+)
diff --git a/src/gda/bnxt/backend_gda_bnxt.cpp b/src/gda/bnxt/backend_gda_bnxt.cpp
new file mode 100644
index 0000000000..b9ea02dec5
--- /dev/null
+++ b/src/gda/bnxt/backend_gda_bnxt.cpp
@@ -0,0 +1,366 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "gda/backend_gda.hpp"
+#include "util.hpp"
+#include <unistd.h> // getpagesize()
+
+namespace rocshmem {
+
+// Map an ibv_mtu enumerator to bytes; logs and returns 0 for unknown values.
+int GDABackend::ibv_mtu_to_int(enum ibv_mtu mtu) {
+  int bytes{0};
+  switch (mtu) {
+    case IBV_MTU_256:  bytes = 256;  break;
+    case IBV_MTU_512:  bytes = 512;  break;
+    case IBV_MTU_1024: bytes = 1024; break;
+    case IBV_MTU_2048: bytes = 2048; break;
+    case IBV_MTU_4096: bytes = 4096; break;
+    default: fprintf(stderr, "[ERROR] Invalid ibv_mtu\n");
+  }
+  return bytes;
+}
+
+// Open the verbs device, allocate its protection domain, cache the port
+// attributes for `port`, then resolve the GID index via init_gid_index().
+void GDABackend::ib_init(struct ibv_device* ib_dev, uint8_t port) {
+  ib_state = new ib_state_t;
+  CHECK_NNULL(ib_state, "ib_state object create");
+
+  ib_state->context = ibv_open_device(ib_dev);
+  CHECK_NNULL(ib_state->context, "ibv_open_device");
+
+  ib_state->pd_orig = ibv_alloc_pd(ib_state->context);
+  CHECK_NNULL(ib_state->pd_orig, "ibv_alloc_pd");
+
+  int err = ibv_query_port(ib_state->context, port, &ib_state->portinfo);
+  CHECK_ZERO(err, "ibv_query_port");
+
+  init_gid_index(port);
+}
+
+// Move every QP in `qps` to the INIT state on `port`, enabling local write
+// plus remote read/write/atomic access.
+void GDABackend::init_qp_status(uint8_t port) {
+  struct ibv_qp_attr attr;
+  memset(&attr, 0, sizeof(attr));
+
+  attr.qp_state        = IBV_QPS_INIT;
+  attr.pkey_index      = 0;
+  attr.port_num        = port;
+  attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE
+                       | IBV_ACCESS_LOCAL_WRITE
+                       | IBV_ACCESS_REMOTE_READ
+                       | IBV_ACCESS_REMOTE_ATOMIC;
+
+  // attr_mask names exactly the attributes assigned above.
+  const int attr_mask = IBV_QP_STATE
+                      | IBV_QP_PKEY_INDEX
+                      | IBV_QP_PORT
+                      | IBV_QP_ACCESS_FLAGS;
+
+  for (ibv_qp* qp : qps) {
+    int err = bnxt_re_dv_modify_qp(qp, &attr, attr_mask, 0, 0);
+    CHECK_ZERO(err, "bnxt_re_dv_modify_qp");
+  }
+}
+
+// Transition `qp` to RTR towards the peer described by `dest`, using the
+// port's active MTU and a global (GRH) address vector routed through `port`.
+void GDABackend::change_status_rtr(ibv_qp *qp, dest_info_t *dest, uint8_t port) {
+  struct ibv_qp_attr attr;
+  memset(&attr, 0, sizeof(attr));
+
+  attr.qp_state           = IBV_QPS_RTR;
+  attr.path_mtu           = ib_state->portinfo.active_mtu;
+  attr.rq_psn             = dest->psn;
+  attr.dest_qp_num        = dest->qpn;
+  attr.max_dest_rd_atomic = GDA_MAX_ATOMIC;
+  attr.min_rnr_timer      = 12;
+
+  struct ibv_ah_attr &ah = attr.ah_attr;
+  memcpy(&ah.grh.dgid, &dest->gid, 16);
+  ah.grh.sgid_index = gid_index;
+  ah.grh.hop_limit  = 1;
+  ah.sl             = 1;
+  ah.is_global      = 1;
+  ah.port_num       = port;
+
+  const int attr_mask = IBV_QP_STATE
+                      | IBV_QP_PATH_MTU
+                      | IBV_QP_RQ_PSN
+                      | IBV_QP_DEST_QPN
+                      | IBV_QP_AV
+                      | IBV_QP_MAX_DEST_RD_ATOMIC
+                      | IBV_QP_MIN_RNR_TIMER;
+
+  int err = bnxt_re_dv_modify_qp(qp, &attr, attr_mask, 0, 0);
+  CHECK_ZERO(err, "bnxt_re_dv_modify_qp");
+}
+
+// Transition `qp` to RTS; the send PSN is taken from `dest` and the
+// timeout/retry attributes are fixed constants.
+void GDABackend::change_status_rts(ibv_qp* qp, dest_info_t* dest) {
+  struct ibv_qp_attr attr;
+  memset(&attr, 0, sizeof(attr));
+
+  attr.qp_state      = IBV_QPS_RTS;
+  attr.sq_psn        = dest->psn;
+  attr.max_rd_atomic = GDA_MAX_ATOMIC;
+  attr.timeout       = 14;
+  attr.retry_cnt     = 7;
+  attr.rnr_retry     = 7;
+
+  const int attr_mask = IBV_QP_STATE
+                      | IBV_QP_SQ_PSN
+                      | IBV_QP_MAX_QP_RD_ATOMIC
+                      | IBV_QP_TIMEOUT
+                      | IBV_QP_RETRY_CNT
+                      | IBV_QP_RNR_RETRY;
+
+  int err = bnxt_re_dv_modify_qp(qp, &attr, attr_mask, 0, 0);
+  CHECK_ZERO(err, "bnxt_re_dv_modify_qp");
+}
+
+// Size the CQ/QP tables for (maximum_num_contexts_ + 1) * num_pes connections,
+// create them, move the QPs to INIT and record the local endpoint info.
+void GDABackend::create_qps(uint8_t port, ibv_port_attr* ib_port_att) {
+  int num_conns = (maximum_num_contexts_ + 1) * num_pes;
+  cqs.resize(num_conns);
+  bnxt_cqs.resize(num_conns);
+  bnxt_qps.resize(num_conns);
+  qps.resize(num_conns);
+
+  create_cqs(qps.size(), sq_size);
+  create_qps_impl(qps.size());
+  init_qp_status(port);
+
+  for (int conn{0}; conn < qps.size(); conn++) {  // NOTE(review): assumes dest_info already has >= qps.size() entries
+    dest_info[conn].psn = 0;
+    dest_info[conn].gid = gid;
+    dest_info[conn].lid = ib_port_att->lid;
+    dest_info[conn].qpn = qps[conn]->qp_num;
+  }
+}
+
+// Export the host-created CQ/QP for connection `conn_num` into the
+// device-visible `gpu_qp` descriptor: CQ ring, SQ ring and MSN table, the
+// doorbell mapping, and the heap memory keys.
+void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
+  struct bnxt_re_dv_obj dv_obj;
+  struct bnxt_re_dv_cq dv_cq;
+  struct bnxt_re_dv_qp dv_qp;
+  struct ibv_context *context = ib_state->context;
+  struct ibv_qp *ib_qp = qps[conn_num];
+  int err;
+
+  /* Export CQ: query the provider for the CQ number, then describe the ring */
+  memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj));
+  dv_obj.cq.in  = cqs[conn_num];
+  dv_obj.cq.out = &dv_cq;
+
+  err = bnxt_re_dv_init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
+  CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)");
+
+  memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq));
+  gpu_qp->cq.buf   = bnxt_cqs[conn_num].buf;
+  gpu_qp->cq.depth = bnxt_cqs[conn_num].depth;
+  gpu_qp->cq.id    = dv_cq.cqn;
+  gpu_qp->cq.phase = BNXT_RE_QUEUE_START_PHASE;
+
+  /* Export QP */
+  memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj));
+  dv_obj.qp.in  = ib_qp;
+  dv_obj.qp.out = &dv_qp;
+
+  err = bnxt_re_dv_init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP);
+  CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");
+
+  memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq));
+  gpu_qp->sq.buf        = bnxt_qps[conn_num].sq_buf;
+  gpu_qp->sq.depth      = bnxt_qps[conn_num].mem_info.sq_slots;
+
+  if ((gpu_qp->sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) {
+    fprintf(stderr,
+            "[WARNING] SQ depth not divisible by BNXT_RE_STATIC_WQE_BB. "
+            "There may be runtime errors.\n");
+  }
+
+  gpu_qp->sq.id          = ib_qp->qp_num;
+  gpu_qp->sq.msntbl      = bnxt_qps[conn_num].msntbl;
+  gpu_qp->sq.msn_tbl_sz  = bnxt_qps[conn_num].msn_tbl_sz;
+  gpu_qp->sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz);
+  gpu_qp->sq.mtu         = ibv_mtu_to_int(ib_state->portinfo.active_mtu);
+
+  /* Export DB: map the default doorbell region so the GPU can write to it */
+  err = bnxt_re_dv_get_default_db_region(context, &db_region_attr);
+  CHECK_ZERO(err, "bnxt_re_dv_get_default_db_region");
+  // NOTE(review): this registers the default DB page on every call - confirm repeated hipHostRegister is accepted.
+  CHECK_HIP(hipHostRegister(db_region_attr.dbr, getpagesize(), hipHostRegisterDefault));
+  CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->dbr, db_region_attr.dbr, 0));
+
+  /* Export Memory Keys */
+  gpu_qp->lkey = heap_mr->lkey;
+  gpu_qp->rkey = heap_rkey[conn_num % num_pes];  // presumably one rkey per PE - TODO confirm
+}
+
+// Create `ncqs` completion queues whose CQE rings live in uncached GPU
+// memory; per-CQ state goes to bnxt_cqs[i] and the verbs CQ to cqs[i].
+void GDABackend::create_cqs(int ncqs, int cqe) {
+  struct ibv_context *context = ib_state->context;
+  struct bnxt_re_dv_cq_attr cq_attr;
+  struct bnxt_re_dv_cq_init_attr cq_init_attr;
+  struct bnxt_re_dv_umem_reg_attr umem_attr;
+
+  for (int i = 0; i < ncqs; i++) {
+    struct bnxt_host_cq &host_cq = bnxt_cqs[i];
+
+    /* Ask the provider for the CQ geometry (returned in cq_attr) */
+    memset(&cq_attr, 0, sizeof(cq_attr));
+    host_cq.handle = bnxt_re_dv_cq_mem_alloc(context, cqe, &cq_attr);
+    CHECK_NNULL(host_cq.handle, "bnxt_re_dv_cq_mem_alloc");
+
+    /* Back the CQE ring with uncached device memory */
+    host_cq.depth  = cq_attr.ncqe;
+    host_cq.length = cq_attr.ncqe * cq_attr.cqe_size;
+    CHECK_HIP(hipExtMallocWithFlags(&host_cq.buf, host_cq.length, hipDeviceMallocUncached));
+
+    /* Register the ring as UMEM */
+    memset(&umem_attr, 0, sizeof(umem_attr));
+    umem_attr.addr         = host_cq.buf;
+    umem_attr.size         = host_cq.length;
+    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
+    host_cq.umem_handle = bnxt_re_dv_umem_reg(context, &umem_attr);
+    CHECK_NNULL(host_cq.umem_handle, "bnxt_re_dv_umem_reg(cq_buf)");
+
+    /* Create the CQ on top of the registered ring */
+    memset(&cq_init_attr, 0, sizeof(cq_init_attr));
+    cq_init_attr.cq_handle   = (uint64_t) host_cq.handle;
+    cq_init_attr.umem_handle = host_cq.umem_handle;
+    cq_init_attr.ncqe        = cq_attr.ncqe;
+    cqs[i] = bnxt_re_dv_create_cq(context, &cq_init_attr);
+    CHECK_NNULL(cqs[i], "bnxt_re_dv_create_cq");
+  }
+}
+// Create nqps RC queue pairs with SQ/RQ rings in uncached GPU memory; fills bnxt_qps[i] and qps[i].
+void GDABackend::create_qps_impl(int nqps) {
+  struct ibv_pd *pd;
+  struct ibv_context *context;
+  struct ibv_qp_init_attr ib_qp_attr;
+  struct bnxt_re_dv_umem_reg_attr umem_attr;
+  void *sq_ptr;
+  void *rq_ptr;
+  void* sq_umem_handle;
+  void* rq_umem_handle;
+  uint64_t msntbl_len;
+  uint64_t msntbl_offset;
+  int err;
+
+  pd = ib_state->pd_orig;
+  context = ib_state->context;
+
+  for (int i = 0; i < nqps; i++) {
+    /* IB QP Init Attr: RC QP on cqs[i], sq_size send WRs, no receive WRs */
+    memset(&ib_qp_attr, 0, sizeof(struct ibv_qp_init_attr));
+    ib_qp_attr.send_cq             = cqs[i];
+    ib_qp_attr.recv_cq             = cqs[i];
+    ib_qp_attr.cap.max_send_wr     = sq_size;
+    ib_qp_attr.cap.max_recv_wr     = 0;
+    ib_qp_attr.cap.max_send_sge    = 1;
+    ib_qp_attr.cap.max_recv_sge    = 0;
+    ib_qp_attr.cap.max_inline_data = 0;
+    ib_qp_attr.qp_type             = IBV_QPT_RC;
+    ib_qp_attr.sq_sig_all          = 0;
+
+    /* Alloc qp_mem_info: the sizes read from mem_info below come from this call */
+    memset(&bnxt_qps[i].mem_info, 0, sizeof(struct bnxt_re_dv_qp_mem_info));
+    err = bnxt_re_dv_qp_mem_alloc(pd, &ib_qp_attr, &bnxt_qps[i].mem_info);
+    CHECK_ZERO(err, "bnxt_re_dv_qp_mem_alloc");
+
+    /* Alloc SQ in uncached device memory (sq_len bytes) */
+    CHECK_HIP(hipExtMallocWithFlags(&sq_ptr, bnxt_qps[i].mem_info.sq_len, hipDeviceMallocUncached));
+    bnxt_qps[i].mem_info.sq_va = (uint64_t) sq_ptr;
+    bnxt_qps[i].sq_buf = sq_ptr;
+
+    /* Obtain MSN Table Pointer: the table is the last sq_psn_sz * sq_npsn bytes of the SQ buffer */
+    msntbl_len             = (bnxt_qps[i].mem_info.sq_psn_sz * bnxt_qps[i].mem_info.sq_npsn);
+    msntbl_offset          = bnxt_qps[i].mem_info.sq_len - msntbl_len;
+    bnxt_qps[i].msntbl     = (void*) ((char*) bnxt_qps[i].sq_buf + msntbl_offset);
+    bnxt_qps[i].msn_tbl_sz = bnxt_qps[i].mem_info.sq_npsn;
+
+    /* Alloc RQ in uncached device memory (rq_len bytes) */
+    CHECK_HIP(hipExtMallocWithFlags(&rq_ptr, bnxt_qps[i].mem_info.rq_len, hipDeviceMallocUncached));
+    bnxt_qps[i].mem_info.rq_va = (uint64_t) rq_ptr;
+    bnxt_qps[i].rq_buf = rq_ptr;
+
+    /* Register UMEM: the SQ and RQ buffers are registered separately */
+    memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
+    umem_attr.addr         = (void*) bnxt_qps[i].mem_info.sq_va;
+    umem_attr.size         = bnxt_qps[i].mem_info.sq_len;
+    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
+
+    sq_umem_handle = bnxt_re_dv_umem_reg(context, &umem_attr);
+    CHECK_NNULL(sq_umem_handle, "bnxt_re_dv_umem_reg(sq)");
+
+    memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
+    umem_attr.addr         = (void*) bnxt_qps[i].mem_info.rq_va;
+    umem_attr.size         = bnxt_qps[i].mem_info.rq_len;
+    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
+
+    rq_umem_handle = bnxt_re_dv_umem_reg(context, &umem_attr);
+    CHECK_NNULL(rq_umem_handle, "bnxt_re_dv_umem_reg(rq)");
+
+    /* IB DV QP Init Attr: copy the verbs caps, then the ring layout from mem_info */
+    memset(&bnxt_qps[i].attr, 0, sizeof(struct bnxt_re_dv_qp_init_attr));
+    bnxt_qps[i].attr.send_cq         = ib_qp_attr.send_cq;
+    bnxt_qps[i].attr.recv_cq         = ib_qp_attr.recv_cq;
+    bnxt_qps[i].attr.max_send_wr     = ib_qp_attr.cap.max_send_wr;
+    bnxt_qps[i].attr.max_recv_wr     = ib_qp_attr.cap.max_recv_wr;
+    bnxt_qps[i].attr.max_send_sge    = ib_qp_attr.cap.max_send_sge;
+    bnxt_qps[i].attr.max_recv_sge    = ib_qp_attr.cap.max_recv_sge;
+    bnxt_qps[i].attr.max_inline_data = ib_qp_attr.cap.max_inline_data;
+    bnxt_qps[i].attr.qp_type         = ib_qp_attr.qp_type;
+
+    bnxt_qps[i].attr.qp_handle = bnxt_qps[i].mem_info.qp_handle;
+    bnxt_qps[i].attr.sq_umem_handle = sq_umem_handle;
+    bnxt_qps[i].attr.sq_len    = bnxt_qps[i].mem_info.sq_len;
+    bnxt_qps[i].attr.sq_slots  = bnxt_qps[i].mem_info.sq_slots;
+    bnxt_qps[i].attr.sq_wqe_sz = bnxt_qps[i].mem_info.sq_wqe_sz;
+    bnxt_qps[i].attr.sq_psn_sz = bnxt_qps[i].mem_info.sq_psn_sz;
+    bnxt_qps[i].attr.sq_npsn   = bnxt_qps[i].mem_info.sq_npsn;
+
+    bnxt_qps[i].attr.rq_umem_handle = rq_umem_handle;
+    bnxt_qps[i].attr.rq_len    = bnxt_qps[i].mem_info.rq_len;
+    bnxt_qps[i].attr.rq_slots  = bnxt_qps[i].mem_info.rq_slots;
+    bnxt_qps[i].attr.rq_wqe_sz = bnxt_qps[i].mem_info.rq_wqe_sz;
+    bnxt_qps[i].attr.comp_mask = bnxt_qps[i].mem_info.comp_mask;
+
+    /* Alloc QP: the resulting verbs QP is stored in qps[i] */
+    qps[i] = bnxt_re_dv_create_qp(pd, &bnxt_qps[i].attr);
+    CHECK_NNULL(qps[i], "bnxt_re_dv_create_qp");
+  }
+}
+
+}  // namespace rocshmem
+
diff --git a/src/gda/bnxt/provider_gda_bnxt.hpp b/src/gda/bnxt/provider_gda_bnxt.hpp
new file mode 100644
index 0000000000..f255f21a13
--- /dev/null
+++ b/src/gda/bnxt/provider_gda_bnxt.hpp
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
+#define LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
+
+extern "C" {
+#include <infiniband/bnxt_re_dv.h>
+#include <infiniband/bnxt_re_hsi.h>
+}
+
+#define GDA_DEFAULT_GID    3  // NOTE(review): presumably the default GID table index - confirm
+#define GDA_MAX_ATOMIC     1  // value used for max_rd_atomic / max_dest_rd_atomic on the QPs
+#define GDA_OP_RDMA_WRITE  BNXT_RE_WR_OPCD_RDMA_WRITE
+#define GDA_OP_ATOMIC_FA   BNXT_RE_WR_OPCD_ATOMIC_FA
+#define GDA_OP_ATOMIC_CS   BNXT_RE_WR_OPCD_ATOMIC_CS
+// Size in bytes of one CQ entry: a request CQE followed by the base CQE header.
+#define bnxt_re_get_cqe_sz() (sizeof(struct bnxt_re_req_cqe) + \
+                              sizeof(struct bnxt_re_bcqe))
+// True when the phase bit(s) in `valid` equal the caller's expected `phase`.
+#define bnxt_re_is_cqe_valid(valid, phase)              \
+        (((valid) & BNXT_RE_BCQE_PH_MASK) == (phase))
+
+struct bnxt_device_wq {  // fields common to the device-side CQ and SQ descriptors
+  void *buf;       // base address of the queue ring
+  uint32_t depth;  // number of entries/slots; head and tail wrap at this value
+  uint32_t head;   // index advanced by the CQ polling path
+  uint32_t tail;   // slot index advanced by the SQ posting path
+  uint32_t flags;  // epoch bits, toggled each time head or tail wraps
+  uint32_t id;     // queue id (QP number for the SQ, CQ number for the CQ)
+  // Lock word: 0 = free, 1 = held (taken with a compare-and-swap spin loop).
+  uint32_t lock;
+  // NOTE(review): db_cnt is not referenced in the code visible here - confirm its use.
+  uint32_t db_cnt {0};
+} __attribute__((packed));
+
+struct bnxt_device_cq : public bnxt_device_wq {  // device-side completion queue state
+  uint32_t phase;  // phase value a CQE must carry to count as valid; flipped when head wraps to 0
+} __attribute__((packed));
+
+struct bnxt_device_sq : public bnxt_device_wq {  // device-side send queue state
+  uint32_t psn;              // next packet sequence number to assign
+  volatile uint32_t posted;  // outstanding-work counter; decremented for each polled completion
+
+  void *msntbl;          // base address of the MSN table
+  uint32_t msn;          // index of the next MSN-table entry to fill
+  uint32_t msn_tbl_sz;   // number of MSN-table entries; msn wraps at this value
+  uint32_t psn_sz_log2;  // log2 of the MSN entry size, used as a shift
+  uint64_t mtu;          // MTU in bytes, used to count packets per message
+} __attribute__((packed));
+
+struct bnxt_host_cq {  // host-side bookkeeping for one completion queue
+  void *buf;          // CQE ring buffer (allocated with hipExtMallocWithFlags)
+  void *handle;       // handle returned by bnxt_re_dv_cq_mem_alloc()
+  void *umem_handle;  // handle returned by bnxt_re_dv_umem_reg() for buf
+  uint64_t length;    // ring size in bytes (ncqe * cqe_size)
+  uint32_t depth;     // number of CQEs in the ring (ncqe)
+} __attribute__((packed));
+
+struct bnxt_host_qp {  // host-side bookkeeping for one queue pair
+  struct bnxt_re_dv_qp_mem_info mem_info;  // ring geometry filled by bnxt_re_dv_qp_mem_alloc()
+  struct bnxt_re_dv_qp_init_attr attr;     // attributes passed to bnxt_re_dv_create_qp()
+  void *sq_buf;         // send-queue buffer (allocated with hipExtMallocWithFlags)
+  void *rq_buf;         // receive-queue buffer (allocated with hipExtMallocWithFlags)
+  void *msntbl;         // MSN table, located inside sq_buf
+  uint32_t msn_tbl_sz;  // number of MSN-table entries (mem_info.sq_npsn)
+} __attribute__((packed));
+
+/*****************************************************************************/
+
+#endif  //LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
diff --git a/src/gda/bnxt/queue_pair_bnxt.cpp b/src/gda/bnxt/queue_pair_bnxt.cpp
new file mode 100644
index 0000000000..a141e3340e
--- /dev/null
+++ b/src/gda/bnxt/queue_pair_bnxt.cpp
@@ -0,0 +1,381 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "gda/queue_pair.hpp"
+#include "util.hpp"
+
+namespace rocshmem {
+
+// Build the 64-bit doorbell word in hdr->typ_qid_indx.
+// Low 32 bits: (indx | toggle).
+// High 32 bits: queue id, doorbell type and the valid bit.
+__device__ static inline void bnxt_re_init_db_hdr(struct bnxt_re_db_hdr *hdr,
+                                                  uint32_t indx, uint32_t toggle,
+                                                  uint32_t qid, uint32_t typ) {
+  const uint64_t low_word = (indx | toggle);
+
+  uint64_t high_word = (0x1UL << BNXT_RE_DB_VALID_SHIFT);
+  high_word |= (qid & BNXT_RE_DB_QID_MASK);
+  high_word |= ((typ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT);
+
+  hdr->typ_qid_indx = (low_word | (high_word << 32));
+}
+// Address of the MSN-table entry at index sq->msn; entries are (1 << psn_sz_log2) bytes apart.
+__device__ static inline struct bnxt_re_msns* bnxt_re_pull_psn_buff(struct bnxt_device_sq *sq) {
+  return (struct bnxt_re_msns*)(((char *) sq->msntbl) + ((sq->msn) << sq->psn_sz_log2));
+}
+// Pack (start slot index, next PSN, start PSN) into one 64-bit MSN-table entry.
+__device__ static inline uint64_t bnxt_re_update_msn_tbl(uint32_t st_idx, uint32_t npsn,
+                                                         uint32_t start_psn) {
+  uint64_t entry = ((uint64_t)(st_idx) << BNXT_RE_SQ_MSN_SEARCH_START_IDX_SHIFT) &
+                   BNXT_RE_SQ_MSN_SEARCH_START_IDX_MASK;
+  entry |= ((uint64_t)(npsn) << BNXT_RE_SQ_MSN_SEARCH_NEXT_PSN_SHIFT) &
+           BNXT_RE_SQ_MSN_SEARCH_NEXT_PSN_MASK;
+  return entry | (((start_psn) << BNXT_RE_SQ_MSN_SEARCH_START_PSN_SHIFT) &
+                  BNXT_RE_SQ_MSN_SEARCH_START_PSN_MASK);
+}
+
+// Fill the current MSN-table entry with the PSN range used by the WQE that
+// starts at sq->tail, then advance sq->psn and sq->msn (modulo msn_tbl_sz).
+__device__ static inline void bnxt_re_fill_psns_for_msntbl(struct bnxt_device_sq *sq,
+                                                           uint32_t msg_len) {
+   // Start slot index of the WQE.
+   const uint32_t first_slot = sq->tail;
+   // Resolve the entry to fill before sq->msn is advanced further down.
+   uint64_t *entry_ptr = (uint64_t *)bnxt_re_pull_psn_buff(sq);
+   // The start PSN is the last recorded PSN.
+   const uint32_t first_psn = sq->psn;
+
+   // Packet count is msg_len / MTU, rounded up.
+   uint32_t num_pkts = (msg_len / sq->mtu);  // NOTE(review): sq->mtu == 0 divides by zero
+   if (msg_len % sq->mtu)
+       num_pkts++;
+
+   /* A zero-length message still consumes one PSN,
+    * e.g. opcode rdma-write-with-imm-data with length field = 0.
+    */
+   if (msg_len == 0)
+       num_pkts = 1;
+
+   const uint32_t following_psn = sq->psn + num_pkts;
+   sq->psn = following_psn;
+
+   struct bnxt_re_msns entry;
+   entry.start_idx_next_psn_start_psn = 0;
+   entry.start_idx_next_psn_start_psn |=
+       bnxt_re_update_msn_tbl(first_slot, following_psn, first_psn);
+
+   // Move on to the next table entry, wrapping at the table size.
+   sq->msn++;
+   sq->msn %= sq->msn_tbl_sz;
+
+   memcpy(entry_ptr, &entry, sizeof(uint64_t));
+}
+
+// Advance the SQ tail by cnt slots; on wrap-around, flip the tail epoch flag.
+__device__ static inline void bnxt_re_incr_tail(struct bnxt_device_sq *sq, uint8_t cnt) {
+  sq->tail += cnt;
+  if (sq->tail < sq->depth) {
+    return;  // no roll-over
+  }
+  sq->tail %= sq->depth;
+  sq->flags ^= 1UL << BNXT_RE_FLAG_EPOCH_TAIL_SHIFT;
+}
+
+// Address of the 16-byte SQ slot `idx` slots past the tail (wraps once at depth).
+__device__ static inline void* bnxt_re_get_hwqe(struct bnxt_device_sq *sq, uint32_t idx) {
+  uint32_t slot = idx + sq->tail;
+  if (slot >= sq->depth)
+    slot -= sq->depth;
+  return (void *)((char*)sq->buf + (slot << 4));
+}
+
+// Advance the CQ head by cnt entries; on wrap-around, flip the head epoch flag.
+__device__ static inline void bnxt_re_incr_head(struct bnxt_device_cq *cq, uint8_t cnt) {
+  cq->head += cnt;
+  if (cq->head < cq->depth) {
+    return;  // no roll-over
+  }
+  cq->head %= cq->depth;
+  cq->flags ^= 1UL << BNXT_RE_FLAG_EPOCH_HEAD_SHIFT;
+}
+
+// Flip the expected CQE phase whenever the head index is 0 (i.e. has just wrapped).
+__device__ static inline void bnxt_re_change_cq_phase(struct bnxt_device_cq *cq) {
+  if (cq->head != 0)
+    return;
+  cq->phase = !(cq->phase & BNXT_RE_BCQE_PH_MASK);
+}
+
+// Spin until *lock is atomically changed from 0 to 1 (seq_cst, system scope).
+__device__ static inline void aquire_lock(uint32_t *lock) {
+  for (;;) {
+    // `expected` is reset to 0 before every compare-and-swap attempt.
+    uint32_t expected = 0;
+    if (__hip_atomic_compare_exchange_strong(lock, &expected, 1, __ATOMIC_SEQ_CST,
+                                             __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM))
+      return;
+  }
+}
+// Unlock with an atomic release store so writes made under the lock are ordered before it.
+__device__ static inline void release_lock(uint32_t *lock) {
+  __hip_atomic_store(lock, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+// Ring the completion-queue doorbell: publish index `slot_idx`, tagged with
+// the head epoch bit from cq.flags, for the CQ identified by cq.id.
+__device__ void QueuePair::ring_cq_doorbell(uint32_t slot_idx) {
+  struct bnxt_re_db_hdr hdr;
+  uint32_t epoch = (cq.flags & BNXT_RE_FLAG_EPOCH_HEAD_MASK) << BNXT_RE_DB_EPOCH_HEAD_SHIFT;
+
+  bnxt_re_init_db_hdr(&hdr, (slot_idx | epoch), 0, cq.id, BNXT_RE_QUE_TYPE_CQ);
+
+  __threadfence_system();  // issued before the doorbell store below
+  __hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+// Ring the send-queue doorbell: publish slot `slot_idx`, tagged with the
+// tail epoch bit from sq.flags, for the QP identified by sq.id.
+__device__ void QueuePair::ring_sq_doorbell(uint32_t slot_idx) {
+  const uint32_t tail_epoch =
+      (sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT;
+  struct bnxt_re_db_hdr db_word;
+  bnxt_re_init_db_hdr(&db_word, (slot_idx | tail_epoch), 0, sq.id, BNXT_RE_QUE_TYPE_SQ);
+
+  __threadfence_system();
+  __hip_atomic_store(dbr, db_word.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+// Check the CQE at cq.head. Returns 0 when it does not yet carry the expected
+// phase; otherwise consumes it (advance head, update phase, ring the CQ
+// doorbell, decrement sq.posted) and returns 1. A completion whose status is
+// not BNXT_RE_REQ_ST_OK is printed and followed by abort().
+__device__ int QueuePair::poll_cq() {
+  // Each entry is a request CQE immediately followed by the base CQE header.
+  char *entry = (char*) cq.buf + (cq.head * bnxt_re_get_cqe_sz());
+  struct bnxt_re_bcqe *base_hdr =
+      (struct bnxt_re_bcqe*) (entry + sizeof(struct bnxt_re_req_cqe));
+
+  uint32_t flg_val = base_hdr->flg_st_typ_ph;
+
+  __threadfence_system();
+
+  if (!bnxt_re_is_cqe_valid(flg_val, cq.phase)) {
+    return 0;  // no new completion yet
+  }
+
+  uint8_t status = (flg_val >> BNXT_RE_BCQE_STATUS_SHIFT)
+                 & BNXT_RE_BCQE_STATUS_MASK;
+
+  if (status != BNXT_RE_REQ_ST_OK) {
+    printf("CQ Error (%x)\n", status);
+    abort();
+    return -1;
+  }
+
+  /* Update the CQ Ptr */
+  bnxt_re_incr_head(&cq, 1);
+  bnxt_re_change_cq_phase(&cq);
+
+  /* Ring Doorbell */
+  ring_cq_doorbell(cq.head);
+
+  // One completion retired: lower the outstanding count that quiet() waits on.
+  __hip_atomic_fetch_sub(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
+
+  return 1;
+}
+
+__device__ void QueuePair::quiet() {
+  // Drain completions until sq.posted reads zero. Only the lane that
+  // get_active_lane_num() numbers 0 does the work; other lanes return at once.
+  const uint64_t lanes = get_active_lane_mask();
+  const uint8_t my_lane = get_active_lane_num(lanes);
+  if (my_lane != 0) {
+    return;
+  }
+
+  aquire_lock(&cq.lock);
+  while (__hip_atomic_load(&sq.posted, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT) != 0) {
+    poll_cq();
+  }
+  release_lock(&cq.lock);
+}
+
+__device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
+  uint64_t active_lane_mask;
+  uint8_t active_lane_count;
+  uint8_t active_lane_id;
+  // Posts one signaled three-slot WQE per active lane; pe is not read in this body.
+  active_lane_mask  = get_active_lane_mask();
+  active_lane_count = get_active_lane_count(active_lane_mask);
+  active_lane_id    = get_active_lane_num(active_lane_mask);
+  // The lane numbered 0 takes sq.lock here and releases it at the end of the call.
+  if (0 == active_lane_id) {
+    aquire_lock(&sq.lock);
+  }
+  // Lanes take turns in lane-number order; quiet() is called after every turn.
+  for (int i = 0; i < active_lane_count; i++) {
+    if (i == active_lane_id) {
+      struct bnxt_re_bsqe hdr;
+      struct bnxt_re_rdma rdma;
+      struct bnxt_re_sge sge;
+      struct bnxt_re_bsqe *hdr_ptr;
+      struct bnxt_re_rdma *rdma_ptr;
+      struct bnxt_re_sge *sge_ptr;
+      uint32_t wqe_size;
+      uint32_t wqe_type;
+      uint32_t hdr_flags;
+      uint32_t rma_slots = 3; // (Three slots: hdr, rdma, sge)
+
+      hdr_ptr  = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
+      rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1);
+      sge_ptr  = (struct bnxt_re_sge*)  bnxt_re_get_hwqe(&sq, 2);
+
+      /* Populate Header Segment */
+      wqe_size  = BNXT_RE_HDR_WS_MASK    & rma_slots;
+      hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK)
+                & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED);
+      wqe_type  = BNXT_RE_HDR_WT_MASK    & opcode;
+
+      hdr.rsv_ws_fl_wt  = (wqe_size  << BNXT_RE_HDR_WS_SHIFT)
+                        | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT)
+                        | wqe_type;
+      hdr.key_immd      = 0;
+      hdr.lhdr.qkey_len = length;
+
+      /* Populate RDMA Segment */
+      rdma.rva  = (uint64_t) raddr;
+      rdma.rkey = rkey;
+
+      /* Populate SG Segment */
+      sge.pa     = (uint64_t) laddr;
+      sge.lkey   = lkey;
+      sge.length = length;
+
+      /* Write WQE to SQ */
+      memcpy(hdr_ptr,  &hdr,  sizeof(struct bnxt_re_bsqe));
+      memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma));
+      memcpy(sge_ptr,  &sge,  sizeof(struct bnxt_re_sge));
+
+      /* Populate MSN Table */
+      bnxt_re_fill_psns_for_msntbl(&sq, length);
+
+      /* Update SQ Pointer */
+      bnxt_re_incr_tail(&sq, rma_slots);
+
+      /* Ring Doorbell */
+      ring_sq_doorbell(sq.tail);
+      // Count the request as outstanding; poll_cq() decrements this per completion.
+      __hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
+
+    }
+    __threadfence_system();
+    quiet();
+  }
+
+  if (0 == active_lane_id) {
+    release_lock(&sq.lock);
+  }
+}
+
+__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t length, uintptr_t *raddr, uint8_t opcode,
+                                            int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
+  uint64_t active_lane_mask;
+  uint8_t active_lane_count;
+  uint8_t active_lane_id;
+  // Posts one signaled three-slot atomic WQE per active lane; pe and fetching are not read here.
+  active_lane_mask  = get_active_lane_mask();
+  active_lane_count = get_active_lane_count(active_lane_mask);
+  active_lane_id    = get_active_lane_num(active_lane_mask);
+
+  if (0 == active_lane_id) {
+    aquire_lock(&sq.lock);
+  }
+
+  for (int i = 0; i < active_lane_count; i++) {
+    if (i == active_lane_id) {
+      struct bnxt_re_bsqe hdr;
+      struct bnxt_re_atomic amo;
+      struct bnxt_re_sge sge;
+      struct bnxt_re_bsqe *hdr_ptr;
+      struct bnxt_re_atomic *amo_ptr;
+      struct bnxt_re_sge *sge_ptr;
+      uint32_t wqe_size;
+      uint32_t wqe_type;
+      uint32_t hdr_flags;
+      uint32_t amo_slots = 3; // (Three slots: hdr, amo, sge)
+
+      hdr_ptr = (struct bnxt_re_bsqe*)   bnxt_re_get_hwqe(&sq, 0);
+      amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&sq, 1);
+      sge_ptr = (struct bnxt_re_sge*)    bnxt_re_get_hwqe(&sq, 2);
+
+      /* Populate Header Segment */
+      wqe_size  = BNXT_RE_HDR_WS_MASK    & amo_slots;
+      hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK)
+                & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED);
+      wqe_type  = BNXT_RE_HDR_WT_MASK    & opcode;
+
+      hdr.rsv_ws_fl_wt  = (wqe_size  << BNXT_RE_HDR_WS_SHIFT)
+                        | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT)
+                        | wqe_type;
+      hdr.key_immd = rkey;
+      hdr.lhdr.rva = (uint64_t) raddr;
+
+      /* Populate AMO Segment */
+      amo.swp_dt = atomic_data;
+      amo.cmp_dt = atomic_cmp;  // previously never written, so the copied WQE carried an indeterminate compare operand
+      /* Populate SG Segment - (Return address of atomic) */
+      sge.pa     = (uint64_t) nonfetching_atomic;
+      sge.lkey   = nonfetching_atomic_lkey;
+      sge.length = length;
+
+      /* Write WQE to SQ */
+      memcpy(hdr_ptr, &hdr, sizeof(struct bnxt_re_bsqe));
+      memcpy(amo_ptr, &amo, sizeof(struct bnxt_re_atomic));
+      memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge));
+
+      /* Populate MSN Table */
+      bnxt_re_fill_psns_for_msntbl(&sq, length);
+
+      /* Update SQ Pointer */
+      bnxt_re_incr_tail(&sq, amo_slots);
+
+      /* Ring Doorbell */
+      ring_sq_doorbell(sq.tail);
+
+      __hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
+    }
+    __threadfence_system();
+    quiet();
+  }
+
+  if (0 == active_lane_id) {
+    release_lock(&sq.lock);
+  }
+  // NOTE(review): the fetched value is not read back; 0 is returned for every opcode.
+  return 0;
+}
+
+}  // namespace rocshmem
diff --git a/src/gda/context_gda_device.cpp b/src/gda/context_gda_device.cpp
new file mode 100644
index 0000000000..52eef85623
--- /dev/null
+++ b/src/gda/context_gda_device.cpp
@@ -0,0 +1,306 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include <hip/hip_runtime.h>
+#include <hip/amd_detail/amd_device_functions.h>
+
+#include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
+#include "rocshmem/rocshmem.hpp"
+#include "backend_gda.hpp"
+#include "context_gda_device.hpp"
+#include "context_gda_tmpl_device.hpp"
+#include "queue_pair.hpp"
+
+namespace rocshmem {
+
+__host__ GDAContext::GDAContext(Backend *b, unsigned int ctx_id)
+    : Context(b, false) {
+  GDABackend *backend{static_cast<GDABackend *>(b)};
+  base_heap = backend->heap.get_heap_bases().data();
+  // Borrow the backend's barrier scratch space and its work/sync base table.
+  barrier_sync = backend->barrier_sync;
+  wrk_sync_pool_bases_ = backend->get_wrk_sync_bases();
+  // Give this context its own num_pes-entry QueuePair array, copied from the backend's slice for ctx_id.
+  CHECK_HIP(hipMalloc(&qps, sizeof(QueuePair) * num_pes));
+  CHECK_HIP(hipMemset(qps, 0, sizeof(QueuePair) * num_pes));
+  for (int i = 0; i < num_pes; i++) {
+    int offset = num_pes * ctx_id + i;
+    CHECK_HIP(hipMemcpy(&qps[i], &backend->gpu_qps[offset], sizeof(QueuePair), hipMemcpyDefault));
+    qps[i].base_heap = base_heap;  // NOTE(review): host-side store into hipMalloc'd memory — confirm it is host-accessible here
+  }
+  ctx_id_ = ctx_id;
+}
+
+__host__ GDAContext::~GDAContext() {
+  CHECK_HIP(hipFree(qps));  // releases the QueuePair array allocated by the host constructor
+}
+
+__device__ void GDAContext::ctx_create() {  // empty body: no device-side work is done here
+}
+
+__device__ void GDAContext::ctx_destroy(){  // empty body: no device-side work is done here
+}
+
+__device__ void GDAContext::putmem(void *dest, const void *source, size_t nelems,
+                                  int pe) {
+  uint64_t L_offset = reinterpret_cast<char*>(dest) - base_heap[my_pe];  // byte offset of dest from this PE's heap base
+  bool need_turn {true};
+  uint64_t turns = __ballot(need_turn);  // lanes that have not posted yet
+  while (turns) {
+    uint8_t lane = __ffsll((unsigned long long)turns) - 1;  // lowest-numbered lane still waiting
+    int pe_turn = __shfl(pe, lane);  // that lane's target PE
+    if (pe_turn == pe) {  // every lane targeting the same PE goes in this round
+      qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe);
+      qps[pe].quiet();  // blocking variant: drain the queue pair before moving on
+      need_turn = false;
+    }
+    turns = __ballot(need_turn);
+  }
+}
+
+__device__ void GDAContext::getmem(void *dest, const void *source, size_t nelems,
+                                  int pe) {
+  printf("rocshmem::gda:getmem not implemented\n");
+  abort();  // stub: every call prints the message above and aborts; the arguments are never read
+}
+
+__device__ void GDAContext::putmem_nbi(void *dest, const void *source,
+                                      size_t nelems, int pe) {
+  uint64_t L_offset = reinterpret_cast<char*>(dest) - base_heap[my_pe];  // byte offset of dest from this PE's heap base
+  bool need_turn {true};
+  uint64_t turns = __ballot(need_turn);  // lanes that have not posted yet
+  while (turns) {
+    uint8_t lane = __ffsll((unsigned long long)turns) - 1;  // lowest-numbered lane still waiting
+    int pe_turn = __shfl(pe, lane);  // that lane's target PE
+    if (pe_turn == pe) {  // every lane targeting the same PE goes in this round
+      qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe);  // no quiet() here, unlike putmem
+      need_turn = false;
+    }
+    turns = __ballot(need_turn);
+  }
+}
+
+__device__ void GDAContext::getmem_nbi(void *dest, const void *source,
+                                      size_t nelems, int pe) {
+  printf("rocshmem::gda:getmem_nbi  not implemented\n");
+  abort();  // stub: every call prints the message above and aborts; the arguments are never read
+}
+
+__device__ void GDAContext::fence() { //TODO: optimize
+  // Drain every PE's queue pair in index order, then issue a system-scope fence.
+  int pe_idx = 0;
+  while (pe_idx < num_pes) qps[pe_idx++].quiet();
+  __threadfence_system();
+}
+
+__device__ void GDAContext::fence(int pe) {
+  fence(); //TODO: optimize — pe is ignored for now; the no-argument fence() covers all PEs
+}
+
+__device__ void GDAContext::quiet() {
+  // Drain each PE's queue pair in index order.
+  int remaining = num_pes;
+  for (QueuePair *qp = qps; remaining > 0; --remaining, ++qp) qp->quiet();
+}
+
+__device__ void *GDAContext::shmem_ptr(const void *dest, int pe) {
+  return nullptr;  // always null: dest and pe are not consulted
+}
+
+__device__ void GDAContext::putmem_wg(void *dest, const void *source,
+                                     size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {  // stub: only the thread this helper selects reports and aborts
+    printf("rocshmem::gda:putmem_wg not implemented\n");
+    abort();
+  }
+}
+
+__device__ void GDAContext::getmem_wg(void *dest, const void *source,
+                                     size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {  // stub: only the thread this helper selects reports and aborts
+    printf("rocshmem::gda:getmem_wg not implemented\n");
+    abort();
+  }
+}
+
+__device__ void GDAContext::putmem_nbi_wg(void *dest, const void *source,
+                                         size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {  // stub: only the thread this helper selects reports and aborts
+    printf("rocshmem::gda:putmem_nbi_wg not implemented\n");
+    abort();
+  }
+}
+
+__device__ void GDAContext::getmem_nbi_wg(void *dest, const void *source,
+                                         size_t nelems, int pe) {
+  if (is_thread_zero_in_block()) {  // stub: only the thread this helper selects reports and aborts
+    printf("rocshmem::gda:getmem_nbi_wg not implemented\n");
+    abort();
+  }
+}
+
+__device__ void GDAContext::putmem_wave(void *dest, const void *source,
+                                       size_t nelems, int pe) {
+  // dest is translated by its byte offset from this PE's heap base onto pe's heap.
+  const uint64_t heap_off = reinterpret_cast<char*>(dest) - base_heap[my_pe];
+  if (!is_thread_zero_in_wave()) return;
+  qps[pe].put_nbi(base_heap[pe] + heap_off, source, nelems, pe);
+  qps[pe].quiet();
+}
+
+__device__ void GDAContext::getmem_wave(void *dest, const void *source,
+                                       size_t nelems, int pe) {
+  if (is_thread_zero_in_wave()) {  // stub: only the thread this helper selects reports and aborts
+    printf("rocshmem::gda:getmem_wave not implemented\n");
+    abort();
+  }
+}
+
+__device__ void GDAContext::putmem_nbi_wave(void *dest, const void *source,
+                                           size_t nelems, int pe) {
+  // Post only: the put is issued by one thread per wave and no quiet() follows.
+  const uint64_t heap_off = reinterpret_cast<char*>(dest) - base_heap[my_pe];
+  if (!is_thread_zero_in_wave()) return;
+  qps[pe].put_nbi(base_heap[pe] + heap_off, source, nelems, pe);
+}
+
+__device__ void GDAContext::getmem_nbi_wave(void *dest, const void *source,
+                                           size_t nelems, int pe) {
+  if (is_thread_zero_in_wave()) {  // stub: only the thread this helper selects reports and aborts
+    printf("rocshmem::gda:getmem_nbi_wave not implemented\n");
+    abort();
+  }
+}
+
+
+//TODO: copied from IPC, needs review
+__device__ void GDAContext::putmem_signal(void *dest, const void *source, size_t nelems,
+                                          uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                          int pe) {
+  // Data first, then a fence, then the signal-word update on the same PE.
+  putmem(dest, source, nelems, pe);
+  fence();
+
+  void *sig_word = static_cast<void*>(sig_addr);
+  if (sig_op == ROCSHMEM_SIGNAL_SET) {
+    amo_set<uint64_t>(sig_word, signal, pe);
+  } else if (sig_op == ROCSHMEM_SIGNAL_ADD) {
+    amo_add<uint64_t>(sig_word, signal, pe);
+  } else {
+    // Any other sig_op only produces a debug message; no signal update is issued.
+    DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op);
+  }
+
+  //TODO: missing quiet_pe?
+}
+
+__device__ void GDAContext::putmem_signal_wg(void *dest, const void *source, size_t nelems,
+                                             uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                             int pe) {
+  putmem_wg(dest, source, nelems, pe);
+  fence();
+  // NOTE(review): putmem_wg above is currently a stub that aborts as "not implemented".
+  if (is_thread_zero_in_block()) {
+    switch (sig_op) {
+    case ROCSHMEM_SIGNAL_SET:
+      amo_set<uint64_t>(static_cast<void*>(sig_addr), signal, pe);
+      break;
+    case ROCSHMEM_SIGNAL_ADD:
+      amo_add<uint64_t>(static_cast<void*>(sig_addr), signal, pe);
+      break;
+    default:  // any other sig_op only produces a debug message
+      DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op);
+      break;
+    }
+    //TODO: missing quiet_pe?
+  }
+}
+
+__device__ void GDAContext::putmem_signal_wave(void *dest, const void *source, size_t nelems,
+                                               uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                               int pe) {
+  putmem_wave(dest, source, nelems, pe);
+  fence();
+  // Data and fence come first; the signal update is then issued by one thread per wave.
+  if (is_thread_zero_in_wave()) {
+    switch (sig_op) {
+    case ROCSHMEM_SIGNAL_SET:
+      amo_set<uint64_t>(static_cast<void*>(sig_addr), signal, pe);
+      break;
+    case ROCSHMEM_SIGNAL_ADD:
+      amo_add<uint64_t>(static_cast<void*>(sig_addr), signal, pe);
+      break;
+    default:  // any other sig_op only produces a debug message
+      DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op);
+      break;
+    }
+    //TODO: missing quiet_pe?
+  }
+}
+
+__device__ void GDAContext::putmem_signal_nbi(void *dest, const void *source, size_t nelems,
+                                              uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                              int pe) {
+  putmem_signal(dest, source, nelems, sig_addr, signal, sig_op, pe); //TODO: optimize — for now this forwards to the blocking putmem_signal
+}
+
+__device__ void GDAContext::putmem_signal_nbi_wg(void *dest, const void *source, size_t nelems,
+                                                 uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                                 int pe) {
+  putmem_signal_wg(dest, source, nelems, sig_addr, signal, sig_op, pe); //TODO: optimize — for now this forwards to the blocking putmem_signal_wg
+}
+
+__device__ void GDAContext::putmem_signal_nbi_wave(void *dest, const void *source, size_t nelems,
+                                                   uint64_t *sig_addr, uint64_t signal, int sig_op,
+                                                   int pe) {
+  putmem_signal_wave(dest, source, nelems, sig_addr, signal, sig_op, pe); //TODO: optimize — for now this forwards to the blocking putmem_signal_wave
+}
+
+__device__ uint64_t GDAContext::signal_fetch(const uint64_t *sig_addr) {
+  // Reads the signal word by issuing a fetch-add of 0 addressed to my_pe.
+  return amo_fetch_add<uint64_t>(const_cast<uint64_t*>(sig_addr), 0, my_pe);
+}
+
+__device__ uint64_t GDAContext::signal_fetch_wg(const uint64_t *sig_addr) {
+  __shared__ uint64_t value;  // written by one thread, then read by every thread in the block
+  if (is_thread_zero_in_block())
+    value = amo_fetch_add<uint64_t>(const_cast<uint64_t*>(sig_addr), 0, my_pe);
+  __syncthreads();  // a fence alone does not make the other threads wait for the write above
+  const uint64_t fetched = value;
+  __syncthreads();  // keeps a back-to-back call from overwriting value before every thread has read it
+  return fetched;
+}
+
+__device__ uint64_t GDAContext::signal_fetch_wave(const uint64_t *sig_addr) {
+  uint64_t value{0};  // defined in every lane so the shuffle below never reads an indeterminate value
+  if (is_thread_zero_in_wave()) {
+    uint64_t *dst = const_cast<uint64_t*>(sig_addr);
+    value = amo_fetch_add<uint64_t>(static_cast<void*>(dst), 0, my_pe);
+  }
+  __threadfence_block();
+  value = __shfl(value, 0);  // every lane returns lane 0's copy
+  return value;
+}
+
+}  // namespace rocshmem
diff --git a/src/gda/context_gda_device.hpp b/src/gda/context_gda_device.hpp
new file mode 100644
index 0000000000..b555e84a58
--- /dev/null
+++ b/src/gda/context_gda_device.hpp
@@ -0,0 +1,309 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_CONTEXT_DEVICE_HPP_
+#define LIBRARY_SRC_GDA_CONTEXT_DEVICE_HPP_
+
+#include "context.hpp"
+#include "team.hpp"
+
+namespace rocshmem {
+
+class QueuePair;
+
+class GDAContext : public Context {  // Context implementation for the GDA conduit
+ public:
+  __host__ GDAContext(Backend *b, unsigned int ctx_id);
+
+  __host__ ~GDAContext();
+
+  __device__ GDAContext(Backend *b, unsigned int ctx_id); //TODO is this used?
+
+  __device__ void ctx_create();
+
+  __device__ void ctx_destroy();
+
+  __device__ void putmem(void *dest, const void *source, size_t nelems, int pe);
+
+  __device__ void getmem(void *dest, const void *source, size_t nelems, int pe);
+
+  __device__ void putmem_nbi(void *dest, const void *source, size_t nelems,
+                             int pe);
+
+  __device__ void getmem_nbi(void *dest, const void *source, size_t size,
+                             int pe);
+
+  __device__ void fence();
+
+  __device__ void fence(int pe);
+
+  __device__ void quiet();
+
+  __device__ void *shmem_ptr(const void *dest, int pe);
+
+  __device__ void barrier_all();
+
+  __device__ void barrier_all_wave();
+
+  __device__ void barrier_all_wg();
+
+  __device__ void barrier(rocshmem_team_t team);
+
+  __device__ void barrier_wave(rocshmem_team_t team);
+
+  __device__ void barrier_wg(rocshmem_team_t team);
+
+  __device__ void sync_all();
+
+  __device__ void sync_all_wave();
+
+  __device__ void sync_all_wg();
+
+  __device__ void sync(rocshmem_team_t team);
+
+  __device__ void sync_wave(rocshmem_team_t team);
+
+  __device__ void sync_wg(rocshmem_team_t team);
+
+  template <typename T>
+  __device__ void p(T *dest, T value, int pe);
+
+  template <typename T>
+  __device__ void put(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ T g(const T *source, int pe);
+
+  template <typename T>
+  __device__ void get(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
+
+  // Atomic operations (dst is passed untyped; T selects the operand type)
+  template <typename T>
+  __device__ void amo_add(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ void amo_set(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ T amo_swap(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ T amo_fetch_and(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ void amo_and(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ T amo_fetch_or(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ void amo_or(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ T amo_fetch_xor(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ void amo_xor(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ void amo_cas(void *dst, T value, T cond, int pe);
+
+  template <typename T>
+  __device__ T amo_fetch_add(void *dst, T value, int pe);
+
+  template <typename T>
+  __device__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
+
+  // Team-based collectives
+  template <typename T, ROCSHMEM_OP Op>
+  __device__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce);
+
+  template <typename T>
+  __device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
+                            int nelems, int pe_root);
+
+  template <typename T>
+  __device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
+                           int nelems);
+  template <typename T>
+  __device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
+                           int nelems);
+
+
+  // Block (_wg) and wave (_wave) variants of the RMA calls
+  __device__ void putmem_wg(void *dest, const void *source, size_t nelems,
+                            int pe);
+
+  __device__ void getmem_wg(void *dest, const void *source, size_t nelems,
+                            int pe);
+
+  __device__ void putmem_nbi_wg(void *dest, const void *source, size_t nelems,
+                                int pe);
+
+  __device__ void getmem_nbi_wg(void *dest, const void *source, size_t size,
+                                int pe);
+
+  __device__ void putmem_wave(void *dest, const void *source, size_t nelems,
+                              int pe);
+
+  __device__ void getmem_wave(void *dest, const void *source, size_t nelems,
+                              int pe);
+
+  __device__ void putmem_nbi_wave(void *dest, const void *source, size_t nelems,
+                                  int pe);
+
+  __device__ void getmem_nbi_wave(void *dest, const void *source, size_t size,
+                                  int pe);
+
+  template <typename T>
+  __device__ void put_wg(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void put_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void put_wave(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void put_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void get_wg(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void get_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
+
+
+  template <typename T>
+  __device__ void get_wave(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
+  // Declares the typed put_signal<SUFFIX> and the untyped putmem_signal<SUFFIX> for one suffix.
+#define GDA_CONTEXT_PUT_SIGNAL_DEC(SUFFIX)                                               \
+  template <typename T>                                                                  \
+  __device__ void put_signal##SUFFIX(T *dest, const T *source, size_t nelems,            \
+                                     uint64_t *sig_addr, uint64_t signal, int sig_op,    \
+                                     int pe);                                            \
+                                                                                         \
+  __device__ void putmem_signal##SUFFIX(void *dest, const void *source, size_t nelems,   \
+                                        uint64_t *sig_addr, uint64_t signal, int sig_op, \
+                                        int pe);
+
+  GDA_CONTEXT_PUT_SIGNAL_DEC()
+  GDA_CONTEXT_PUT_SIGNAL_DEC(_wg)
+  GDA_CONTEXT_PUT_SIGNAL_DEC(_wave)
+  GDA_CONTEXT_PUT_SIGNAL_DEC(_nbi)
+  GDA_CONTEXT_PUT_SIGNAL_DEC(_nbi_wg)
+  GDA_CONTEXT_PUT_SIGNAL_DEC(_nbi_wave)
+
+  __device__ uint64_t signal_fetch(const uint64_t *sig_addr);
+  __device__ uint64_t signal_fetch_wg(const uint64_t *sig_addr);
+  __device__ uint64_t signal_fetch_wave(const uint64_t *sig_addr);
+
+ private:
+
+  // Internal helpers used by the collective operations
+  template <typename T>
+  __device__ void internal_broadcast(T *dest, const T *source, int nelems, int pe_root,
+                                     int pe_start, int stride, int pe_size,
+                                     long *p_sync);  // NOLINT(runtime/int)
+
+  template <typename T>
+  __device__ void internal_put_broadcast(T *dst, const T *src, int nelems,
+                                         int pe_root, int PE_start,
+                                         int logPE_stride, int PE_size);  // NOLINT(runtime/int)
+
+  template <typename T>
+  __device__ void internal_get_broadcast(T *dst, const T *src, int nelems,
+                                         int pe_root);  // NOLINT(runtime/int)
+
+  template <typename T>
+  __device__ void fcollect_linear(rocshmem_team_t team, T *dest,
+                                  const T *source, int nelems);
+
+  template <typename T>
+  __device__ void alltoall_linear(rocshmem_team_t team, T *dest,
+                                  const T *source, int nelems);
+
+  __device__ void internal_sync(int pe, int PE_start, int stride, int PE_size,
+                                int64_t *pSync);
+
+  __device__ void internal_sync_wave(int pe, int PE_start, int stride, int PE_size,
+                                int64_t *pSync);
+
+  __device__ void internal_sync_wg(int pe, int PE_start, int stride, int PE_size,
+                                int64_t *pSync);
+
+  __device__ void internal_direct_barrier(int pe, int PE_start, int stride,
+                                          int n_pes, int64_t *pSync);
+
+  __device__ void internal_atomic_barrier(int pe, int PE_start, int stride,
+                                          int n_pes, int64_t *pSync);
+
+  template <typename T, ROCSHMEM_OP Op>
+  __device__ void internal_direct_allreduce(T *dst, const T *src,
+                                            int nelems, GDATeam *team_obj);
+  template <typename T, ROCSHMEM_OP Op>
+  __device__ void internal_ring_allreduce(T *dst, const T *src,
+                                          int nelems, GDATeam *team_obj,
+                                          int n_seg, int seg_size, int chunk_size);
+
+
+  // Temporary scratchpad memory used by the internal barrier algorithms.
+  int64_t *barrier_sync{nullptr};
+
+  /**
+   * @brief Array containing the addresses of the work/sync buffer bases
+   * of other PEs
+  */
+  char **wrk_sync_pool_bases_{nullptr};
+
+  /**
+   * @brief Device context Id
+   */
+  unsigned int ctx_id_{};
+
+ public:
+  QueuePair *qps{nullptr};  // array of queue pairs indexed by PE
+  // Table of heap base addresses, indexed by PE.
+  char *const *base_heap{nullptr};
+
+  //TODO(Avinash):
+  //Make tinfo private variable, it requires changes to the context
+  //creation API in backend
+
+  //Team information for the team associated with the context
+  TeamInfo *tinfo{nullptr};
+};
+
+}  // namespace rocshmem
+
+#endif // LIBRARY_SRC_GDA_CONTEXT_DEVICE_HPP_
diff --git a/src/gda/context_gda_device_coll.cpp b/src/gda/context_gda_device_coll.cpp
new file mode 100644
index 0000000000..be224fb302
--- /dev/null
+++ b/src/gda/context_gda_device_coll.cpp
@@ -0,0 +1,242 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "rocshmem/rocshmem.hpp"
+#include "context_incl.hpp"
+#include "context_gda_tmpl_device.hpp"
+#include "util.hpp"
+#include "gda_team.hpp"
+
+namespace rocshmem {
+
+// Flag-based barrier over PEs PE_start, PE_start + stride, ... (n_pes of them): each non-root
+// PE puts a flag into its slot of the root's pSync, then the root puts a release flag to each.
+__device__ void GDAContext::internal_direct_barrier(int pe, int PE_start,
+                                                    int stride, int n_pes,
+                                                    int64_t *pSync) {
+  int64_t flag_val{1};
+  if (pe == PE_start) {
+    // Root: wait for the flag in every non-root slot (1 .. n_pes-1), resetting each once seen.
+#if defined(__gfx90a__)
+    __threadfence_system();
+#endif /* __gfx90a__ */
+    for (int i = 1; i < n_pes; i++) {
+      wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
+      pSync[i] = ROCSHMEM_SYNC_VALUE;
+    }
+    __threadfence_system();
+    // Announce to other PEs that all have reached
+    for (int i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
+      pSync[0] = flag_val;  // local slot 0 is the source buffer of the put below
+      put(&pSync[0], &pSync[0], 1, j);
+#if defined(__gfx90a__)
+      __threadfence_system();
+#endif /* __gfx90a__ */
+    }
+    pSync[0] = ROCSHMEM_SYNC_VALUE;  // reset local slot 0 once every PE has been notified
+  } else {
+    // Non-root: the slot index is this PE's position in the strided set; put the flag to the root.
+    size_t pe_offset = (pe - PE_start) / stride;
+    pSync[pe_offset] = flag_val;
+    put(&pSync[pe_offset], &pSync[pe_offset], 1, PE_start);
+#if defined(__gfx90a__)
+    __threadfence_system();
+#endif /* __gfx90a__ */
+    wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);  // slot 0 is the target of the root's put
+    pSync[0] = ROCSHMEM_SYNC_VALUE;
+    pSync[pe_offset] = ROCSHMEM_SYNC_VALUE;
+    __threadfence_system();
+  }
+}
+
+// Counter barrier: non-roots atomically add 1 to the root's pSync[0]; the root waits for n_pes-1, then puts a flag to each.
+__device__ void GDAContext::internal_atomic_barrier(int pe, int PE_start,
+                                                    int stride, int n_pes,
+                                                    int64_t *pSync) {
+  int64_t flag_val{1};
+  if (pe == PE_start) {
+    wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));  // one increment per non-root PE
+    pSync[0] = ROCSHMEM_SYNC_VALUE;
+    __threadfence_system();
+    pSync[0] = flag_val;  // local slot 0 now holds the value sent by the puts below
+    for (int i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
+      put_nbi(&pSync[0], &pSync[0], 1, j);
+    }
+    quiet();  // issued before slot 0, the source of the puts above, is reset
+    pSync[0] = ROCSHMEM_SYNC_VALUE;
+  } else {
+    amo_add<int64_t>(&pSync[0], flag_val, PE_start);  // add 1 to the counter on the root
+    wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);  // local slot 0 is the target of the root's put
+    pSync[0] = ROCSHMEM_SYNC_VALUE;
+    __threadfence_system();
+  }
+}
+
+// Picks the barrier algorithm by PE count: 64 or more PEs use the atomic barrier, fewer the direct one.
+__device__ void GDAContext::internal_sync(int pe, int PE_start, int stride,
+                                          int PE_size, int64_t *pSync) {
+  if (PE_size >= 64) {
+    return internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync);
+  }
+  internal_direct_barrier(pe, PE_start, stride, PE_size, pSync);
+}
+
+// Wave flavour of internal_sync: only the thread for which is_thread_zero_in_wave() holds runs
+// the barrier; every other thread returns at once.
+__device__ void GDAContext::internal_sync_wave(int pe, int PE_start, int stride,
+                                               int PE_size, int64_t *pSync) {
+  if (!is_thread_zero_in_wave()) {
+    return;
+  }
+  // Same PE-count based choice between the direct and the atomic barrier.
+  internal_sync(pe, PE_start, stride, PE_size, pSync);
+}
+
+// Work-group flavour of internal_sync: the block's thread zero runs the barrier while the
+// remaining threads wait at the block-level barriers before and after it.
+__device__ void GDAContext::internal_sync_wg(int pe, int PE_start, int stride,
+                                             int PE_size, int64_t *pSync) {
+  // Gather the whole block before the barrier starts.
+  __syncthreads();
+  if (is_thread_zero_in_block()) {
+    internal_sync(pe, PE_start, stride, PE_size, pSync);
+  }
+  // Fence, then let the rest of the block continue.
+  __threadfence_system();
+  __syncthreads();
+}
+
+/**
+ * Synchronize the PEs of `team`; the barrier arguments are read from the GDATeam object.
+ */
+__device__ void GDAContext::sync(rocshmem_team_t team) {
+  auto *gda_team = reinterpret_cast<GDATeam *>(team);
+  const auto &world = *gda_team->tinfo_wrt_world;
+  long *flags = gda_team->barrier_pSync;  // NOLINT(runtime/int)
+
+  internal_sync(gda_team->my_pe_in_world, world.pe_start, world.stride,
+                gda_team->num_pes, flags);
+}
+
+/**
+ * Wave-level team sync; the barrier arguments are read from the GDATeam object.
+ */
+__device__ void GDAContext::sync_wave(rocshmem_team_t team) {
+  auto *gda_team = reinterpret_cast<GDATeam *>(team);
+  const auto &world = *gda_team->tinfo_wrt_world;
+  long *flags = gda_team->barrier_pSync;  // NOLINT(runtime/int)
+
+  internal_sync_wave(gda_team->my_pe_in_world, world.pe_start, world.stride,
+                     gda_team->num_pes, flags);
+}
+
+/**
+ * Work-group-level team sync; the barrier arguments are read from the GDATeam object.
+ */
+__device__ void GDAContext::sync_wg(rocshmem_team_t team) {
+  auto *gda_team = reinterpret_cast<GDATeam *>(team);
+  const auto &world = *gda_team->tinfo_wrt_world;
+  long *flags = gda_team->barrier_pSync;  // NOLINT(runtime/int)
+
+  internal_sync_wg(gda_team->my_pe_in_world, world.pe_start, world.stride,
+                   gda_team->num_pes, flags);
+}
+
+__device__ void GDAContext::sync_all() {  // sync over PEs 0 .. num_pes-1 (start 0, stride 1)
+  internal_sync(my_pe, 0, 1, num_pes, barrier_sync);
+}
+
+__device__ void GDAContext::sync_all_wave() {  // wave-level sync over PEs 0 .. num_pes-1
+  internal_sync_wave(my_pe, 0, 1, num_pes, barrier_sync);
+}
+
+__device__ void GDAContext::sync_all_wg() {  // work-group-level sync over PEs 0 .. num_pes-1
+  internal_sync_wg(my_pe, 0, 1, num_pes, barrier_sync);
+}
+
+__device__ void GDAContext::barrier_all() {
+  quiet();     // quiet() on this context first ...
+  sync_all();  // ... then the sync over all PEs
+}
+
+__device__ void GDAContext::barrier_all_wave() {
+  if (is_thread_zero_in_wave()) {  // only the wave's thread zero calls quiet()
+    quiet();
+  }
+  sync_all_wave();  // called by every thread; the callee selects thread zero itself
+}
+
+__device__ void GDAContext::barrier_all_wg() {
+  if (is_thread_zero_in_block()) {  // only the block's thread zero calls quiet()
+    quiet();
+  }
+  sync_all_wg();  // internal_sync_wg() begins and ends with __syncthreads()
+  __syncthreads();
+}
+
+// Team barrier: quiet() on this context, then the internal sync over the team's PEs
+// (the arguments are read from the GDATeam before quiet() is called).
+__device__ void GDAContext::barrier(rocshmem_team_t team) {
+  auto *gda_team = reinterpret_cast<GDATeam *>(team);
+  const int self = gda_team->my_pe_in_world;
+  const int first_pe = gda_team->tinfo_wrt_world->pe_start;
+  const int step = gda_team->tinfo_wrt_world->stride;
+  const int members = gda_team->num_pes;
+  long *flags = gda_team->barrier_pSync;  // NOLINT(runtime/int)
+  quiet();
+  internal_sync(self, first_pe, step, members, flags);
+}
+
+// Wave-level team barrier: the wave's thread zero calls quiet(), then internal_sync_wave()
+// runs with the team's arguments (read from the GDATeam before quiet()).
+__device__ void GDAContext::barrier_wave(rocshmem_team_t team) {
+  auto *gda_team = reinterpret_cast<GDATeam *>(team);
+  const int self = gda_team->my_pe_in_world;
+  const int first_pe = gda_team->tinfo_wrt_world->pe_start;
+  const int step = gda_team->tinfo_wrt_world->stride;
+  const int members = gda_team->num_pes;
+  long *flags = gda_team->barrier_pSync;  // NOLINT(runtime/int)
+  if (is_thread_zero_in_wave()) {
+    quiet();
+  }
+  internal_sync_wave(self, first_pe, step, members, flags);
+}
+
+// Work-group-level team barrier: the block's thread zero calls quiet(), internal_sync_wg()
+// runs with the team's arguments, and a final __syncthreads() follows.
+__device__ void GDAContext::barrier_wg(rocshmem_team_t team) {
+  auto *gda_team = reinterpret_cast<GDATeam *>(team);
+  const int self = gda_team->my_pe_in_world;
+  const int first_pe = gda_team->tinfo_wrt_world->pe_start;
+  const int step = gda_team->tinfo_wrt_world->stride;
+  const int members = gda_team->num_pes;
+  long *flags = gda_team->barrier_pSync;  // NOLINT(runtime/int)
+  if (is_thread_zero_in_block()) {
+    quiet();
+  }
+  internal_sync_wg(self, first_pe, step, members, flags);
+  __syncthreads();
+}
+
+}  // namespace rocshmem
diff --git a/src/gda/context_gda_host.cpp b/src/gda/context_gda_host.cpp
new file mode 100644
index 0000000000..5190e8637f
--- /dev/null
+++ b/src/gda/context_gda_host.cpp
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "context_gda_host.hpp"
+
+#include <mpi.h>
+
+#include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
+#include "backend_type.hpp"
+#include "context_incl.hpp"
+#include "backend_gda.hpp"
+#include "host/host.hpp"
+
+namespace rocshmem {
+
+// Shares the backend's host interface and acquires a window context from it; `options` is
+// accepted but not used.
+__host__ GDAHostContext::GDAHostContext(Backend *backend,
+                                        [[maybe_unused]] int64_t options)
+    : Context(backend, true) {
+  auto *gda_backend = static_cast<GDABackend *>(backend);
+  host_interface = gda_backend->host_interface;
+  context_window_info = host_interface->acquire_window_context();
+}
+
+__host__ GDAHostContext::~GDAHostContext() {  // hands back the window acquired in the constructor
+  host_interface->release_window_context(context_window_info);
+}
+
+__host__ void GDAHostContext::putmem_nbi(void *dest, const void *source,
+                                         size_t nelems, int pe) {
+  host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info);  // same args + this context's window
+}
+
+__host__ void GDAHostContext::getmem_nbi(void *dest, const void *source,
+                                         size_t nelems, int pe) {
+  host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info);  // same args + this context's window
+}
+
+__host__ void GDAHostContext::putmem(void *dest, const void *source,
+                                     size_t nelems, int pe) {
+  host_interface->putmem(dest, source, nelems, pe, context_window_info);  // same args + this context's window
+}
+
+__host__ void GDAHostContext::getmem(void *dest, const void *source,
+                                     size_t nelems, int pe) {
+  host_interface->getmem(dest, source, nelems, pe, context_window_info);  // same args + this context's window
+}
+
+__host__ void GDAHostContext::fence() {  // delegated to the host interface for this context's window
+  host_interface->fence(context_window_info);
+}
+
+__host__ void GDAHostContext::quiet() {  // delegated to the host interface for this context's window
+  host_interface->quiet(context_window_info);
+}
+
+__host__ void *GDAHostContext::shmem_ptr(const void *dest, int pe) {
+  // Not implemented for GDA; returning nullptr is spec-valid. `dest` and `pe` are
+  // not used.
+  // TODO: copy ipc handover from RO when IPC+GDA is implemented
+  return nullptr;
+}
+
+__host__ void GDAHostContext::sync_all() {  // delegated to the host interface for this context's window
+  host_interface->sync_all(context_window_info);
+}
+
+__host__ void GDAHostContext::barrier_all() {  // delegated to the host interface for this context's window
+  host_interface->barrier_all(context_window_info);
+}
+
+}  // namespace rocshmem
diff --git a/src/gda/context_gda_host.hpp b/src/gda/context_gda_host.hpp
new file mode 100644
index 0000000000..7f7f86b4d6
--- /dev/null
+++ b/src/gda/context_gda_host.hpp
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_CONTEXT_HOST_HPP_
+#define LIBRARY_SRC_GDA_CONTEXT_HOST_HPP_
+
+#include "context.hpp"
+
+namespace rocshmem {
+
+// Host-side GDA context. Its non-template members are defined in context_gda_host.cpp.
+class GDAHostContext : public Context {
+ public:
+  __host__ GDAHostContext(Backend *b, int64_t options);
+  __host__ ~GDAHostContext();
+  // Typed transfers (member templates; no definitions appear in this header).
+  template <typename T>
+  __host__ void p(T *dest, T value, int pe);
+
+  template <typename T>
+  __host__ T g(const T *source, int pe);
+
+  template <typename T>
+  __host__ void put(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __host__ void get(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __host__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
+
+  template <typename T>
+  __host__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
+  // Untyped transfers; context_gda_host.cpp forwards each one to host_interface.
+  __host__ void putmem(void *dest, const void *source, size_t nelems, int pe);
+
+  __host__ void getmem(void *dest, const void *source, size_t nelems, int pe);
+
+  __host__ void putmem_nbi(void *dest, const void *source, size_t nelems,
+                           int pe);
+
+  __host__ void getmem_nbi(void *dest, const void *source, size_t size, int pe);
+  // Atomic memory operations (member templates).
+  template <typename T>
+  __host__ void amo_add(void *dst, T value, int pe);
+
+  template <typename T>
+  __host__ void amo_cas(void *dst, T value, T cond, int pe);
+
+  template <typename T>
+  __host__ T amo_fetch_add(void *dst, T value, int pe);
+
+  template <typename T>
+  __host__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
+  // Remaining non-template operations, all defined in context_gda_host.cpp.
+  __host__ void fence();
+
+  __host__ void quiet();
+
+  __host__ void *shmem_ptr(const void *dest, int pe);
+
+  __host__ void barrier_all();
+
+  __host__ void sync_all();
+  // Collectives (member templates).
+  template <typename T>
+  __host__ void broadcast(T *dest, const T *source, int nelems, int pe_root,
+                          int pe_start, int log_pe_stride, int pe_size,
+                          long *p_sync);
+
+  template <typename T>
+  __host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
+                          int nelems, int pe_root);
+
+  template <typename T, ROCSHMEM_OP Op>
+  __host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
+                       int log_pe_stride, int pe_size, T *p_wrk,
+                       long *p_sync);
+
+  template <typename T, ROCSHMEM_OP Op>
+  __host__ int reduce(rocshmem_team_t team, T *dest, const T *source, int nreduce);
+  // wait_until / test family (member templates).
+  template <typename T>
+  __host__ void wait_until(T *ivars, int cmp, T val);
+
+  template <typename T>
+  __host__ size_t wait_until_any(T *ivars, size_t nelems,
+                                 const int *status,
+                                 int cmp, T val);
+
+  template <typename T>
+  __host__ void wait_until_all(T *ivars, size_t nelems,
+                               const int *status,
+                               int cmp, T val);
+
+  template <typename T>
+  __host__ size_t wait_until_some(T *ivars, size_t nelems,
+                                  size_t* indices,
+                                  const int *status,
+                                  int cmp, T val);
+
+  template <typename T>
+  __host__ void wait_until_all_vector(T *ivars, size_t nelems,
+                                      const int *status,
+                                      int cmp, T* vals);
+
+  template <typename T>
+  __host__ size_t wait_until_any_vector(T *ivars, size_t nelems,
+                                        const int *status,
+                                        int cmp, T* vals);
+
+  template <typename T>
+  __host__ size_t wait_until_some_vector(T *ivars, size_t nelems,
+                                         size_t* indices,
+                                         const int *status,
+                                         int cmp, T* vals);
+
+  template <typename T>
+  __host__ int test(T *ivars, int cmp, T val);
+
+ public:
+  /* Shared pointer to the backend's host interface (copied from the backend by the constructor) */
+  std::shared_ptr<HostInterface> host_interface{nullptr};
+
+  /* An MPI Window implements a context; acquired in the constructor, released in the destructor */
+  WindowInfo *context_window_info{nullptr};
+};
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_CONTEXT_HOST_HPP_
diff --git a/src/gda/context_gda_tmpl_device.hpp b/src/gda/context_gda_tmpl_device.hpp
new file mode 100644
index 0000000000..265afb232d
--- /dev/null
+++ b/src/gda/context_gda_tmpl_device.hpp
@@ -0,0 +1,636 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_CONTEXT_TMPL_DEVICE_HPP_
+#define LIBRARY_SRC_GDA_CONTEXT_TMPL_DEVICE_HPP_
+
+#include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
+#include "rocshmem/rocshmem.hpp"
+#include "util.hpp"
+#include "context_gda_device.hpp"
+#include "gda_team.hpp"
+#include "queue_pair.hpp"
+#include "rocshmem_calc.hpp"
+
+#include <hip/hip_runtime.h>
+
+namespace rocshmem {
+
+/******************************************************************************
+ ************************** TEMPLATE SPECIALIZATIONS **************************
+ *****************************************************************************/
+template <typename T>
+__device__ void GDAContext::p(T *dest, T value, int pe) {
+  printf("rocshmem::gda:p not implemented\n");  // stub: report ...
+  abort();                                       // ... and abort
+  // TODO: a direct putmem_nbi(dest, &value, sizeof(T), pe) would be incorrect because
+  // `value` (a by-value parameter) is not ibv registered memory.
+}
+
+template <typename T>
+__device__ void GDAContext::put(T *dest, const T *source, size_t nelems, int pe) {
+  putmem(dest, source, nelems * sizeof(T), pe);  // element count converted to a byte count
+}
+
+template <typename T>
+__device__ void GDAContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) {
+  putmem_nbi(dest, source, sizeof(T) * nelems, pe);  // element count converted to a byte count
+}
+
+template <typename T>
+__device__ T GDAContext::g(const T *source, int pe) {
+  T ret;  // never assigned; only named by the return statement that follows abort()
+  printf("rocshmem::gda:g not implemented\n");
+  abort();
+  // TODO: a direct getmem(&ret, source, sizeof(T), pe) would be incorrect because `ret`
+  // (a local variable) is not ibv registered memory.
+  return ret;
+}
+
+template <typename T>
+__device__ void GDAContext::get(T *dest, const T *source, size_t nelems, int pe) {
+  getmem(dest, source, sizeof(T) * nelems, pe);  // element count converted to a byte count
+}
+
+template <typename T>
+__device__ void GDAContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) {
+  getmem_nbi(dest, source, sizeof(T) * nelems, pe);  // element count converted to a byte count
+}
+
+// Atomics
+template <typename T>
+__device__ void GDAContext::amo_add(void *dst, T value, int pe) {
+  if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_add not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t
+  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];  // byte offset of dst from base_heap[my_pe]
+  bool need_turn {true};
+  uint64_t turns = __ballot(need_turn);  // mask of lanes that still have to issue
+  while (turns) {
+    uint8_t lane = __ffsll((unsigned long long)turns) - 1;  // lowest lane still waiting
+    int pe_turn = __shfl(pe, lane);  // that lane's target PE, read by every lane
+    if (pe_turn == pe) {  // lanes with the same target PE issue in the same round
+      qps[pe].atomic_nofetch(base_heap[pe] + L_offset, value, 0, pe, GDA_OP_ATOMIC_FA);
+      need_turn = false;
+    }
+    turns = __ballot(need_turn);  // loop until no lane is left waiting
+  }
+}
+
+// Atomic set built on compare-and-swap: retry, using the value the previous CAS fetched as the next compare value.
+template <typename T>
+__device__ void GDAContext::amo_set(void *dst, T value, int pe) {
+  if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_set not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t
+  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
+  T cond = 0;  // current guess for the remote value; replaced by the fetched value after every CAS
+  for (int i = 0; i < WF_SIZE; i++) { //TODO: this looks wrong (NOTE(review): repeats the whole set WF_SIZE times - confirm)
+    for (T expected = cond;; expected = cond) {
+      cond = qps[pe].atomic_fetch(base_heap[pe] + L_offset, value, expected, pe, GDA_OP_ATOMIC_CS);
+      if (cond == expected) { break; }  // fetched value matched the compare value (0 included), so the swap happened
+    }
+  }
+}
+
+template <typename T>
+__device__ T GDAContext::amo_swap(void *dst, T value, int pe) {
+  printf("rocshmem::gda:amo_swap not implemented\n");  // stub: report ...
+  abort();                                              // ... and abort
+  return 0;  // follows abort(); gives the non-void function a return statement
+}
+
+template <typename T>
+__device__ T GDAContext::amo_fetch_and(void *dst, T value, int pe) {
+  printf("rocshmem::gda:amo_fetch_and not implemented\n");  // stub: report ...
+  abort();                                                   // ... and abort
+  return 0;  // follows abort(); gives the non-void function a return statement
+}
+
+template <typename T>
+__device__ void GDAContext::amo_and(void *dst, T value, int pe) {
+  printf("rocshmem::gda:amo_and not implemented\n");  // stub: report ...
+  abort();                                             // ... and abort
+}
+
+template <typename T>
+__device__ T GDAContext::amo_fetch_or(void *dst, T value, int pe) {
+  printf("rocshmem::gda:amo_fetch_or not implemented\n");  // stub: report ...
+  abort();                                                  // ... and abort
+  return 0;  // follows abort(); gives the non-void function a return statement
+}
+
+template <typename T>
+__device__ void GDAContext::amo_or(void *dst, T value, int pe) {
+  printf("rocshmem::gda:amo_or not implemented\n");  // stub: report ...
+  abort();                                            // ... and abort
+}
+
+template <typename T>
+__device__ T GDAContext::amo_fetch_xor(void *dst, T value, int pe) {
+  printf("rocshmem::gda:amo_fetch_xor not implemented\n");  // stub: report ...
+  abort();                                                   // ... and abort
+  return 0;  // follows abort(); gives the non-void function a return statement
+}
+
+template <typename T>
+__device__ void GDAContext::amo_xor(void *dst, T value, int pe) {
+  printf("rocshmem::gda:amo_xor not implemented\n");  // stub: report ...
+  abort();                                             // ... and abort
+}
+
+template <typename T>
+__device__ void GDAContext::amo_cas(void *dst, T value, T cond, int pe) {
+  if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_cas not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t
+  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];  // byte offset of dst from base_heap[my_pe]
+  for (int i = 0; i < WF_SIZE; i++) { //TODO: this looks wrong. NOTE(review): the same CAS is issued WF_SIZE times per calling thread; amo_add uses __ballot turn-taking instead - confirm intent
+    qps[pe].atomic_nofetch(base_heap[pe] + L_offset, value, cond, pe, GDA_OP_ATOMIC_CS);
+  }
+}
+
+template <typename T>
+__device__ T GDAContext::amo_fetch_add(void *dst, T value, int pe) {
+  if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_fadd not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t
+  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];  // byte offset of dst from base_heap[my_pe]
+  T ret_val = 0;
+  bool need_turn {true};
+  uint64_t turns = __ballot(need_turn);  // mask of lanes that still have to issue
+  while (turns) {
+    uint8_t lane = __ffsll((unsigned long long)turns) - 1;  // lowest lane still waiting
+    int pe_turn = __shfl(pe, lane);  // that lane's target PE, read by every lane
+    if (pe_turn == pe) {  // lanes with the same target PE issue in the same round
+      ret_val =  qps[pe].atomic_fetch(base_heap[pe] + L_offset, value, 0, pe, GDA_OP_ATOMIC_FA);
+      need_turn = false;
+    }
+    turns = __ballot(need_turn);  // loop until no lane is left waiting
+  }
+  return ret_val;  // value handed back by atomic_fetch for this lane
+}
+
+template <typename T>
+__device__ T GDAContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
+  if constexpr (sizeof(T) != 8) { printf("rocshmem::gda:amo_fcas not implemented for non-64bit types.\n"); abort(); }//TODO:support for non-uint64t
+  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];  // byte offset of dst from base_heap[my_pe]
+  T ret_val;
+  for (int i = 0; i < WF_SIZE; i++) {  // NOTE(review): same WF_SIZE repetition that amo_set/amo_cas flag as "looks wrong" - confirm
+    ret_val = qps[pe].atomic_fetch(base_heap[pe] + L_offset, value, cond, pe, GDA_OP_ATOMIC_CS);
+  }
+  return ret_val;  // NOTE(review): this is the result of the LAST attempt, not the first - verify callers get the pre-swap value
+}
+
+// Collectives TODO: loosely adapted from IPC, needs review
+template <typename T, ROCSHMEM_OP Op>
+__device__ void compute_reduce(T *src, T *dst, int size, int wg_id, int wg_size) {
+  // Apply Op at indices wg_id, wg_id + wg_size, ... that lie below `size`.
+  for (int idx = wg_id; idx < size; idx += wg_size) { OpWrap<Op>::Calc(src, dst, idx); }
+  // Block-level barrier after this caller's share of the indices is done.
+  __syncthreads();
+}
+
+// All-to-all allreduce: every PE puts its src into every other PE's pWrk, then each PE
+// reduces the received buffers into dst. pWrk and pSync are indexed by world PE id here.
+template <typename T, ROCSHMEM_OP Op>
+__device__ void GDAContext::internal_direct_allreduce(
+    T *dst, const T *src, int nelems, GDATeam *team_obj) {  // NOLINT(runtime/int)
+  int stride = team_obj->tinfo_wrt_world->stride;
+  int PE_start = team_obj->tinfo_wrt_world->pe_start;
+  int PE_size = team_obj->tinfo_wrt_world->size;
+  long *pSync = team_obj->barrier_pSync;
+  T *pWrk = reinterpret_cast<T *>(team_obj->pWrk);
+
+  int finish = PE_start + stride * PE_size;
+  int pe = my_pe;
+
+  int wg_id = get_flat_block_id();
+  int wg_size = get_flat_block_size();
+  int64_t flag_val = 1;
+  // Start from the local contribution: each caller copies indices wg_id, wg_id + wg_size, ...
+  for (int i = wg_id; i < nelems; i += wg_size) {
+    dst[i] = src[i];
+  }
+  __syncthreads();
+  // Put src into slot `pe` of pWrk on every other PE of the team, then set pSync[pe] there.
+  for (int i = PE_start; i < finish; i += stride) {
+    if (i != pe) {
+      putmem_wg(&pWrk[pe * nelems], reinterpret_cast<const void *>(src),
+                nelems * sizeof(T), i);
+      // NOTE(review): &flag_val is a local variable; the TODOs in p()/g() say such memory is not ibv registered - confirm.
+      if (is_thread_zero_in_block()) {
+        fence();
+        putmem(&pSync[pe], &flag_val, sizeof(*pSync), i);
+      }
+    }
+  }
+  threadfence_system();
+  __syncthreads();
+
+  // Reduce each peer's buffer into dst once its flag has arrived (flags are reset further below).
+  for (int i = PE_start; i < finish; i += stride) {
+    if (i != pe) {
+      // Wait for leader thread to see that the buffer is ready.
+      if (is_thread_zero_in_block()) {
+        wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
+      }
+      __syncthreads();
+
+      T *ptr = &pWrk[i * nelems];
+      compute_reduce<T, Op>(ptr, dst, nelems, wg_id, wg_size);
+      threadfence_system();
+    }
+  }
+  __syncthreads();
+  // Reset the first num_pes flag slots to ROCSHMEM_SYNC_VALUE.
+  for (int i = wg_id; i < num_pes; i += wg_size) {
+    pSync[i] = ROCSHMEM_SYNC_VALUE;
+  }
+  threadfence_system();
+  __syncthreads();
+}
+
+/*
+ * Visual representation of the ring_allreduce algorithm below
+ * assuming 4 PEs and a single segment.
+ *
+ *         Initial state
+ *  PE#     0              1             2              3
+ *        [00]           [10]          [20]           [30]
+ *        [01]           [11]          [21]           [31]
+ *        [02]           [12]          [22]           [32]
+ *        [03]           [13]          [23]           [33]
+ *
+ * Loop 1:
+ *        iter 0
+ *  PE#     0              1             2              3
+ *        [00+30]        [10]          [20]           [30]
+ *        [01]           [01+11]       [21]           [31]
+ *        [02]           [12]          [12+22]        [32]
+ *        [03]           [13]          [23]           [23+33]
+ *
+ *        iter 1
+ *  PE#     0              1             2              3
+ *        [00+30]        [00+10+30]    [20]           [30]
+ *        [01]           [01+11]       [01+11+21]     [31]
+ *        [02]           [12]          [12+22]        [12+22+32]
+ *        [03+23+33]     [13]          [23]           [23+33]
+ *
+ *        iter 2
+ *  PE#     0              1             2              3
+ *        [00+30]        [00+10+30]    [00+10+20+30]  [30]
+ *        [01]           [01+11]       [01+11+21]     [01+11+21+31]
+ *        [02+12+22+32]  [12]          [12+22]        [12+22+32]
+ *        [03+23+33]     [03+13+23+33] [23]           [23+33]
+ *
+ * Loop 2:
+ *
+ *       iter 3
+ *  PE#     0              1             2              3
+ *        [00+30]        [00+10+30]    [00+10+20+30]  [00+10+20+30]
+ *        [01+11+21+31]  [01+11]       [01+11+21]     [01+11+21+31]
+ *        [02+12+22+32]  [02+12+22+32] [12+22]        [12+22+32]
+ *        [03+23+33]     [03+13+23+33] [03+13+23+33]  [23+33]
+ *
+ *       iter 4
+ *  PE#     0              1             2              3
+ *        [00+10+20+30]  [00+10+30]    [00+10+20+30]  [00+10+20+30]
+ *        [01+11+21+31]  [01+11+21+31] [01+11+21]     [01+11+21+31]
+ *        [02+12+22+32]  [02+12+22+32] [02+12+22+32]  [12+22+32]
+ *        [03+23+33]     [03+13+23+33] [03+13+23+33]  [03+13+23+33]
+ *
+ *        iter 5
+ *  PE#     0              1             2              3
+ *        [00+10+20+30]  [00+10+20+30] [00+10+20+30]  [00+10+20+30]
+ *        [01+11+21+31]  [01+11+21+31] [01+11+21+31]  [01+11+21+31]
+ *        [02+12+22+32]  [02+12+22+32] [02+12+22+32]  [02+12+22+32]
+ *        [03+13+23+33]  [03+13+23+33] [03+13+23+33]  [03+13+23+33]
+ */
+template <typename T, ROCSHMEM_OP Op>
+__device__ void GDAContext::internal_ring_allreduce(
+    T *dst, const T *src, int nelems, GDATeam *team_obj,  // NOLINT(runtime/int)
+    int n_seg, int seg_size, int chunk_size) {
+
+  int stride = team_obj->tinfo_wrt_world->stride;
+  int PE_start = team_obj->tinfo_wrt_world->pe_start;
+  int PE_size = team_obj->tinfo_wrt_world->size;
+  long *pSync = team_obj->barrier_pSync;
+  T *pWrk = reinterpret_cast<T *>(team_obj->pWrk);
+  int my_pe_in_team = team_obj->my_pe;
+
+  int off_seg, off_send, off_recv;
+  int send_pe = (my_pe_in_team + 1) % PE_size;
+  // send_pe is a team-relative rank at this point; convert it to the corresponding world PE
+  send_pe = team_obj->get_pe_in_world(send_pe);
+  long wait_val;  // NOLINT(runtime/int)
+
+  int wg_size = get_flat_block_size();
+  int wg_id = get_flat_block_id();
+  // Start from the local contribution: each caller copies indices wg_id, wg_id + wg_size, ...
+  for (int i = wg_id; i < nelems; i += wg_size) {
+    dst[i] = src[i];
+  }
+  __syncthreads();
+
+  for (int seg = 0; seg < n_seg; seg++) {
+    off_seg = seg * seg_size;
+    // Loop 1 in the algorithm above: put one chunk to send_pe, then reduce the chunk found in pWrk
+    for (int iter = 0; iter < PE_size - 1; iter++) {
+      off_send = (((my_pe_in_team + 1 - iter + 2 * PE_size) % PE_size) * chunk_size);
+      off_recv = (((my_pe_in_team - iter + 2 * PE_size) % PE_size) * chunk_size);
+      // The chunk is written into pWrk on send_pe (not into its dst).
+      putmem_wg(reinterpret_cast<void *>(&pWrk[off_send]),
+                reinterpret_cast<void *>(&dst[off_send + off_seg]),
+                chunk_size * sizeof(T), send_pe);
+
+      if (is_thread_zero_in_block()) {
+        fence();
+        // NOTE(review): &wait_val is a local variable used as put source; the TODOs in p()/g() call that incorrect - confirm.
+        wait_val = seg + 100;
+        putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe);
+#if defined(__gfx90a__)
+        __threadfence_system();
+#endif /* __gfx90a__ */
+        wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
+      }
+      __syncthreads();
+      compute_reduce<T, Op>(&pWrk[off_recv], &dst[off_seg + off_recv],
+                            chunk_size, wg_id, wg_size);
+    }
+
+    // Loop 2 in the algorithm above: chunks are now put straight into dst on send_pe, with no reduction
+    for (int iter = PE_size - 1; iter < 2 * PE_size - 2; iter++) {
+      off_send = (((my_pe_in_team + 1 - iter + 2 * PE_size) % PE_size) * chunk_size);
+      putmem_nbi_wg(reinterpret_cast<void *>(&dst[off_send + off_seg]),
+                    reinterpret_cast<void *>(&dst[off_send + off_seg]),
+                    chunk_size * sizeof(T), send_pe);
+
+      if (is_thread_zero_in_block()) {
+        fence();
+        wait_val = seg + 100;
+        putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe);
+#if defined(__gfx90a__)
+        __threadfence_system();
+#endif /* __gfx90a__ */
+        wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
+      }
+      __syncthreads();
+    }
+  }
+  __syncthreads();
+  // Reset flag slots; NOTE(review): the bound uses num_pes while the loops above index up to 2 * PE_size - 3 - confirm.
+  for (int i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
+    pSync[i] = ROCSHMEM_SYNC_VALUE;
+  }
+  __syncthreads();
+}
+
+/*
+ * Team reduction entry point: reduces nreduce elements of type T from source
+ * into dest with operation Op, dispatching to internal_direct_allreduce or
+ * internal_ring_allreduce depending on how much scratch space is available.
+ *
+ *   - direct: chosen when the work buffer can hold PE_size * nreduce elements
+ *     and the sync array has at least PE_size entries;
+ *   - ring:   chosen otherwise, as long as 2 * PE_size sync entries fit in
+ *     ROCSHMEM_REDUCE_SYNC_SIZE.
+ *
+ * Returns ROCSHMEM_SUCCESS, or ROCSHMEM_ERROR when neither algorithm fits.
+ */
+template <typename T, ROCSHMEM_OP Op>
+__device__ int GDAContext::reduce(rocshmem_team_t team, T *dest,
+                                  const T *source, int nreduce) {
+  GDATeam *team_obj = reinterpret_cast<GDATeam *>(team);
+
+  int PE_size = team_obj->tinfo_wrt_world->size;
+
+  // Scratch requirements of each algorithm versus what is provided.
+  // NOTE(review): PE_size * nreduce is evaluated in int before widening to
+  // size_t -- confirm it cannot overflow for the supported sizes.
+  size_t direct_pWrk = PE_size * nreduce;
+  size_t direct_pSync = PE_size;
+  size_t ring_pSync = 2 * PE_size;
+  size_t provided_pWrk = max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
+  size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE;
+
+  if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) {
+    internal_direct_allreduce<T, Op>(dest, source, nreduce, team_obj);
+  } else {
+    // NOTE(review): this bound is checked against ROCSHMEM_REDUCE_SYNC_SIZE,
+    // while internal_ring_allreduce indexes team_obj->barrier_pSync -- confirm
+    // that array is at least as large.
+    if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) {
+      size_t ring_pWrk = ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE;
+      // integer division truncating value
+      int chunk_size = ring_pWrk / PE_size;
+      // A segment is one chunk per PE, i.e. the largest multiple of PE_size
+      // that fits in the ring work buffer.
+      int seg_size = chunk_size * PE_size;
+
+      // integer division truncating value
+      int n_seg = nreduce / seg_size;
+      // integer division rounding up
+      int n_seg_up = (nreduce - 1) / seg_size + 1;
+      // recalculate chunk_size
+      chunk_size = seg_size / PE_size;
+      // NOTE(review): when nreduce < seg_size the ring below still runs one
+      // full seg_size segment -- confirm dest/source are large enough.
+      if (n_seg == 0) {
+        n_seg = 1;
+      }
+      // Reduce all whole segments with the ring algorithm.
+      internal_ring_allreduce<T, Op>(dest, source, nreduce, team_obj, n_seg,
+                                     seg_size, chunk_size);
+      // A partial trailing segment remains when nreduce is not a multiple of
+      // seg_size.
+      if (n_seg_up > n_seg) {
+        T *p_dst = (dest + (n_seg * seg_size));
+        const T *p_src = (source + (n_seg * seg_size));
+        int p_count = nreduce - (n_seg * seg_size);
+        // Largest per-PE chunk that divides the tail evenly (may be 0).
+        int p_chunk = p_count / PE_size;
+
+        internal_ring_allreduce<T, Op>(p_dst, p_src, p_count, team_obj, 1,
+                                      (p_chunk * PE_size), p_chunk);
+
+        if ((p_chunk * PE_size) < p_count) {
+          // Final elements need to use direct_allreduce
+          p_count -= (p_chunk * PE_size);
+          p_dst += (p_chunk * PE_size);
+          const T *p_src2 = p_src + (p_chunk * PE_size);
+
+          internal_direct_allreduce<T, Op>(p_dst, p_src2, p_count, team_obj);
+        }
+      }
+    } else {
+      // Team too large for the ring algorithm's sync array.
+      GPU_DPRINTF("Unsupported reduction size for GDA conduit.\n");
+      return ROCSHMEM_ERROR;
+    }
+  }
+  return ROCSHMEM_SUCCESS;
+}
+
+/*
+ * Put-based broadcast step: the root PE issues a non-blocking put of nelems
+ * elements from src to dst on every other PE of the active set
+ * {pe_start, pe_start + stride, ...} (pe_size members). Non-root PEs do
+ * nothing here; completion is left to the caller.
+ */
+template <typename T>
+__device__ void GDAContext::internal_put_broadcast(
+    T *dst, const T *src, int nelems, int pe_root, int pe_start,
+    int stride, int pe_size) {  // NOLINT(runtime/int)
+  // Only the root pushes data.
+  if (my_pe != pe_root) {
+    return;
+  }
+
+  const int pe_end = pe_start + stride * pe_size;
+  for (int target_pe = pe_start; target_pe < pe_end; target_pe += stride) {
+    // The root already holds the data locally.
+    if (target_pe == my_pe) {
+      continue;
+    }
+    put_nbi_wg(dst, src, nelems, target_pe);
+  }
+}
+
+/*
+ * Get-based broadcast step: every PE except the root fetches nelems elements
+ * from src on pe_root into its local dst. The root does nothing.
+ */
+template <typename T>
+__device__ void GDAContext::internal_get_broadcast(
+  T *dst, const T *src, int nelems, int pe_root) {  // NOLINT(runtime/int)
+  if (my_pe == pe_root) {
+    return;
+  }
+  get_wg(dst, src, nelems, pe_root);
+}
+
+/*
+ * Team broadcast: looks up the team's active-set description and broadcast
+ * sync array, translates the team-relative pe_root into a world PE and hands
+ * off to internal_broadcast.
+ */
+template <typename T>
+__device__ void GDAContext::broadcast(rocshmem_team_t team, T *dst,
+                                      const T *src, int nelems, int pe_root) {
+  GDATeam *gda_team = reinterpret_cast<GDATeam *>(team);
+  const auto &world_info = *(gda_team->tinfo_wrt_world);
+
+  const int team_stride = world_info.stride;
+  const int first_pe = world_info.pe_start;
+  const int team_size = world_info.size;
+  long *sync_array = gda_team->bcast_pSync;  // NOLINT(runtime/int)
+
+  // The caller names the root relative to the team; the internal routine
+  // works with world PE numbers.
+  const int root_in_world = gda_team->get_pe_in_world(pe_root);
+
+  internal_broadcast<T>(dst, src, nelems, root_in_world, first_pe,
+                        team_stride, team_size, sync_array);
+}
+
+/*
+ * Broadcast over an active set given in world PE numbers. Uses the put-based
+ * variant when num_pes is below 4 and the get-based variant otherwise, then
+ * synchronizes the active set through p_sync.
+ */
+template <typename T>
+__device__ void GDAContext::internal_broadcast(T *dst, const T *src, int nelems,
+                                      int pe_root, int pe_start,
+                                      int stride, int pe_size,
+                                      long *p_sync) {  // NOLINT(runtime/int)
+  const bool root_pushes = num_pes < 4;  // TODO: optimized for IPC
+
+  if (!root_pushes) {
+    internal_get_broadcast(dst, src, nelems, pe_root);
+  } else {
+    internal_put_broadcast(dst, src, nelems, pe_root, pe_start, stride,
+                           pe_size);
+  }
+
+  // Nobody leaves until the broadcast has completed everywhere.
+  internal_sync_wg(my_pe, pe_start, stride, pe_size, p_sync);
+}
+
+/*
+ * Team all-to-all entry point. Currently always forwards to the linear
+ * implementation, alltoall_linear.
+ */
+template <typename T>
+__device__ void GDAContext::alltoall(rocshmem_team_t team, T *dst,
+                                     const T *src, int nelems) {
+  alltoall_linear(team, dst, src, nelems);
+}
+
+/*
+ * Linear all-to-all: this PE sends the j-th block of nelems elements of src
+ * to team member j, where it lands in the block of dst indexed by this PE's
+ * team rank. One non-blocking put is issued per team member, followed by a
+ * quiet on thread zero and a team-wide sync.
+ */
+template <typename T>
+__device__ void GDAContext::alltoall_linear(rocshmem_team_t team, T *dst,
+                                            const T *src, int nelems) {
+  GDATeam *gda_team = reinterpret_cast<GDATeam *>(team);
+
+  const int first_pe = gda_team->tinfo_wrt_world->pe_start;
+  const int team_size = gda_team->num_pes;
+  const int team_stride = gda_team->tinfo_wrt_world->stride;
+  long *sync_array = gda_team->alltoall_pSync;  // NOLINT(runtime/int)
+  const int my_team_rank = gda_team->my_pe;
+
+  // Every peer stores our contribution in the slot indexed by our team rank.
+  T *remote_slot = &dst[my_team_rank * nelems];
+
+  for (int peer = 0; peer < team_size; ++peer) {
+    const int peer_in_world = gda_team->get_pe_in_world(peer);
+    const T *outgoing = &src[peer * nelems];
+    put_nbi_wg(remote_slot, outgoing, nelems, peer_in_world);
+  }
+
+  if (is_thread_zero_in_block()) {
+    quiet();
+  }
+
+  // Wait until every member has received its designated data.
+  internal_sync_wg(my_pe, first_pe, team_stride, team_size, sync_array);
+}
+
+/*
+ * Team fcollect entry point. Currently always forwards to the linear
+ * implementation, fcollect_linear.
+ */
+template <typename T>
+__device__ void GDAContext::fcollect(rocshmem_team_t team, T *dst,
+                                     const T *src, int nelems) {
+  fcollect_linear(team, dst, src, nelems);
+}
+
+/*
+ * Linear fcollect: this PE sends the same nelems elements of src to every
+ * team member, where they land in the block of dst indexed by this PE's team
+ * rank. One non-blocking put is issued per team member, followed by a quiet
+ * on thread zero and a team-wide sync (using the team's alltoall_pSync).
+ */
+template <typename T>
+__device__ void GDAContext::fcollect_linear(rocshmem_team_t team, T *dst,
+                                            const T *src, int nelems) {
+  GDATeam *gda_team = reinterpret_cast<GDATeam *>(team);
+
+  const int first_pe = gda_team->tinfo_wrt_world->pe_start;
+  const int team_size = gda_team->num_pes;
+  const int team_stride = gda_team->tinfo_wrt_world->stride;
+  long *sync_array = gda_team->alltoall_pSync;  // NOLINT(runtime/int)
+  const int my_team_rank = gda_team->my_pe;
+
+  // Every peer stores our contribution in the slot indexed by our team rank.
+  T *remote_slot = &dst[my_team_rank * nelems];
+
+  for (int peer = 0; peer < team_size; ++peer) {
+    const int peer_in_world = gda_team->get_pe_in_world(peer);
+    put_nbi_wg(remote_slot, src, nelems, peer_in_world);
+  }
+
+  if (is_thread_zero_in_block()) {
+    quiet();
+  }
+
+  // Wait until every member has received its designated data.
+  internal_sync_wg(my_pe, first_pe, team_stride, team_size, sync_array);
+}
+
+// Block/wave functions
+// Typed front end for putmem_wg: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::put_wg(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  putmem_wg(dest, source, nbytes, pe);
+}
+
+// Typed front end for putmem_nbi_wg: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::put_nbi_wg(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  putmem_nbi_wg(dest, source, nbytes, pe);
+}
+
+// Typed front end for putmem_wave: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::put_wave(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  putmem_wave(dest, source, nbytes, pe);
+}
+
+// Typed front end for putmem_nbi_wave: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::put_nbi_wave(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  putmem_nbi_wave(dest, source, nbytes, pe);
+}
+
+// Typed front end for getmem_wg: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::get_wg(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  getmem_wg(dest, source, nbytes, pe);
+}
+
+// Typed front end for getmem_nbi_wg: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::get_nbi_wg(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  getmem_nbi_wg(dest, source, nbytes, pe);
+}
+
+// Typed front end for getmem_wave: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::get_wave(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  getmem_wave(dest, source, nbytes, pe);
+}
+
+// Typed front end for getmem_nbi_wave: converts the element count to bytes.
+template <typename T>
+__device__ void GDAContext::get_nbi_wave(T *dest, const T *source, size_t nelems, int pe) {
+  const size_t nbytes = nelems * sizeof(T);
+  getmem_nbi_wave(dest, source, nbytes, pe);
+}
+
+/*
+ * Generates the typed put-with-signal members of GDAContext for one flavour.
+ * SUFFIX is pasted onto the function names (empty, _wg and _wave are
+ * instantiated below), so each expansion defines:
+ *
+ *   put_signal<SUFFIX>      -> putmem_signal<SUFFIX>
+ *   put_signal_nbi<SUFFIX>  -> putmem_signal_nbi<SUFFIX>
+ *
+ * Both convert nelems to a byte count before forwarding. The non-blocking
+ * variant forwards to the non-blocking untyped routine so that it does not
+ * silently inherit the blocking behaviour of putmem_signal<SUFFIX>.
+ */
+#define GDA_CONTEXT_PUT_SIGNAL_DEF(SUFFIX)                                                            \
+  template <typename T>                                                                               \
+  __device__ void GDAContext::put_signal##SUFFIX(T *dest, const T *source, size_t nelems,             \
+                                                 uint64_t *sig_addr, uint64_t signal, int sig_op,     \
+                                                 int pe) {                                            \
+    putmem_signal##SUFFIX(dest, source, nelems * sizeof(T), sig_addr, signal, sig_op, pe);            \
+  }                                                                                                   \
+                                                                                                      \
+  template <typename T>                                                                               \
+  __device__ void GDAContext::put_signal_nbi##SUFFIX(T *dest, const T *source, size_t nelems,         \
+                                                     uint64_t *sig_addr, uint64_t signal, int sig_op, \
+                                                     int pe) {                                        \
+    putmem_signal_nbi##SUFFIX(dest, source, nelems * sizeof(T), sig_addr, signal, sig_op, pe);        \
+  }
+
+GDA_CONTEXT_PUT_SIGNAL_DEF()
+GDA_CONTEXT_PUT_SIGNAL_DEF(_wg)
+GDA_CONTEXT_PUT_SIGNAL_DEF(_wave)
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_CONTEXT_TMPL_DEVICE_HPP_
diff --git a/src/gda/context_gda_tmpl_host.hpp b/src/gda/context_gda_tmpl_host.hpp
new file mode 100644
index 0000000000..b4006331b4
--- /dev/null
+++ b/src/gda/context_gda_tmpl_host.hpp
@@ -0,0 +1,169 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_CONTEXT_TMPL_HOST_HPP_
+#define LIBRARY_SRC_GDA_CONTEXT_TMPL_HOST_HPP_
+
+#include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
+#include "host/host_templates.hpp"
+
+namespace rocshmem {
+
+/*
+ * Host-side typed operations of GDAHostContext.
+ *
+ * Every member below is a thin forwarder to the routine of the same name on
+ * host_interface. The point-to-point, atomic, wait and test calls also pass
+ * this context's context_window_info; the collectives (broadcast, to_all,
+ * reduce) do not.
+ */
+
+// Forwards a single-value put of `value` to `dest` on `pe`.
+template <typename T>
+__host__ void GDAHostContext::p(T *dest, T value, int pe) {
+  host_interface->p<T>(dest, value, pe, context_window_info);
+}
+
+// Forwards a single-value get from `source` on `pe` and returns the result.
+template <typename T>
+__host__ T GDAHostContext::g(const T *source, int pe) {
+  return host_interface->g<T>(source, pe, context_window_info);
+}
+
+// Forwards a put of nelems elements (element count, not bytes).
+template <typename T>
+__host__ void GDAHostContext::put(T *dest, const T *source, size_t nelems, int pe) {
+  host_interface->put<T>(dest, source, nelems, pe, context_window_info);
+}
+
+// Forwards a get of nelems elements (element count, not bytes).
+template <typename T>
+__host__ void GDAHostContext::get(T *dest, const T *source, size_t nelems, int pe) {
+  host_interface->get<T>(dest, source, nelems, pe, context_window_info);
+}
+
+// Forwards to the put_nbi variant of the host interface.
+template <typename T>
+__host__ void GDAHostContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) {
+  host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
+}
+
+// Forwards to the get_nbi variant of the host interface.
+template <typename T>
+__host__ void GDAHostContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) {
+  host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
+}
+
+// Atomic operations: T is deduced from `value` by the host interface call.
+template <typename T>
+__host__ void GDAHostContext::amo_add(void *dst, T value, int pe) {
+  host_interface->amo_add(dst, value, pe, context_window_info);
+}
+
+template <typename T>
+__host__ void GDAHostContext::amo_cas(void *dst, T value, T cond, int pe) {
+  host_interface->amo_cas(dst, value, cond, pe, context_window_info);
+}
+
+// Fetching atomics return whatever the host interface returns.
+template <typename T>
+__host__ T GDAHostContext::amo_fetch_add(void *dst, T value, int pe) {
+  return host_interface->amo_fetch_add(dst, value, pe, context_window_info);
+}
+
+template <typename T>
+__host__ T GDAHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
+  return host_interface->amo_fetch_cas(dst, value, cond, pe, context_window_info);
+}
+
+// Active-set broadcast (pe_start / log_pe_stride / pe_size / p_sync form).
+template <typename T>
+__host__ void GDAHostContext::broadcast(
+    T *dest, const T *source, int nelems, int pe_root, int pe_start,
+    int log_pe_stride, int pe_size,
+    long *p_sync) {  // NOLINT(runtime/int)
+  host_interface->broadcast<T>(dest, source, nelems, pe_root, pe_start,
+                               log_pe_stride, pe_size, p_sync);
+}
+
+// Team-based broadcast.
+template <typename T>
+__host__ void GDAHostContext::broadcast(rocshmem_team_t team, T *dest,
+                                        const T *source, int nelems,
+                                        int pe_root) {
+  host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
+}
+
+// Active-set reduction with caller-provided p_wrk / p_sync scratch arrays.
+template <typename T, ROCSHMEM_OP Op>
+__host__ void GDAHostContext::to_all(T *dest, const T *source, int nreduce,
+                                     int pe_start, int log_pe_stride,
+                                     int pe_size, T *p_wrk,
+                                     long *p_sync) {  // NOLINT(runtime/int)
+  host_interface->to_all<T, Op>(dest, source, nreduce, pe_start, log_pe_stride,
+                                pe_size, p_wrk, p_sync);
+}
+
+// Team-based reduction; returns the status reported by the host interface.
+template <typename T, ROCSHMEM_OP Op>
+__host__ int GDAHostContext::reduce(rocshmem_team_t team, T *dest,
+                                    const T *source, int nreduce) {
+  return host_interface->reduce<T, Op>(team, dest, source, nreduce);
+}
+
+// Point-to-point synchronization: wait_until family. `cmp` selects the
+// comparison and `val` / `vals` the value(s) compared against.
+template <typename T>
+__host__ void GDAHostContext::wait_until(T *ivars, int cmp, T val) {
+  host_interface->wait_until<T>(ivars, cmp, val, context_window_info);
+}
+
+template <typename T>
+__host__ void GDAHostContext::wait_until_all(T *ivars, size_t nelems,
+                                             const int* status,
+                                             int cmp, T val) {
+  host_interface->wait_until_all<T>(ivars, nelems, status, cmp, val, context_window_info);
+}
+
+template <typename T>
+__host__ size_t GDAHostContext::wait_until_any(T *ivars, size_t nelems,
+                                               const int* status,
+                                               int cmp, T val) {
+  return host_interface->wait_until_any<T>(ivars, nelems, status, cmp, val, context_window_info);
+}
+
+template <typename T>
+__host__ size_t GDAHostContext::wait_until_some(T *ivars, size_t nelems, size_t* indices,
+                                                const int* status,
+                                                int cmp, T val) {
+  return host_interface->wait_until_some<T>(ivars, nelems, indices, status, cmp, val, context_window_info);
+}
+
+// Vector variants: take an array of comparison values (`vals`) instead of a
+// single value.
+template <typename T>
+__host__ void GDAHostContext::wait_until_all_vector(T *ivars, size_t nelems,
+                                                    const int* status,
+                                                    int cmp, T* vals) {
+  host_interface->wait_until_all_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
+}
+
+template <typename T>
+__host__ size_t GDAHostContext::wait_until_any_vector(T *ivars, size_t nelems,
+                                                      const int* status,
+                                                      int cmp, T* vals) {
+  return host_interface->wait_until_any_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
+}
+
+template <typename T>
+__host__ size_t GDAHostContext::wait_until_some_vector(T *ivars, size_t nelems,
+                                                       size_t* indices,
+                                                       const int* status,
+                                                       int cmp, T* vals) {
+  return host_interface->wait_until_some_vector<T>(ivars, nelems, indices, status, cmp, vals, context_window_info);
+}
+
+// Forwards to host_interface->test and returns its result.
+template <typename T>
+__host__ int GDAHostContext::test(T *ivars, int cmp, T val) {
+  return host_interface->test<T>(ivars, cmp, val, context_window_info);
+}
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_CONTEXT_TMPL_HOST_HPP_
diff --git a/src/gda/endian.cpp b/src/gda/endian.cpp
new file mode 100644
index 0000000000..c8baeb18d5
--- /dev/null
+++ b/src/gda/endian.cpp
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "endian.hpp"
+
+namespace rocshmem {
+
+/*
+ * Generic fallback: writes val to *dst with its bytes in reverse order.
+ *
+ * Implemented as a plain byte loop so that it needs no standard-library
+ * algorithm (std::reverse_copy is not guaranteed to be callable from device
+ * code and <algorithm> is not included here) and does not read an inactive
+ * union member.
+ *
+ * NOTE(review): this primary template is defined only in this translation
+ * unit; other files that need a type without an explicit specialization
+ * below will not see this definition -- confirm that is intended.
+ */
+template <typename T>
+__device__ void swap_endian_store(T *dst, const T val) {
+  const uint8_t *in = reinterpret_cast<const uint8_t *>(&val);
+  T reversed;
+  uint8_t *out = reinterpret_cast<uint8_t *>(&reversed);
+
+  for (size_t i = 0; i < sizeof(T); i++) {
+    out[i] = in[sizeof(T) - 1 - i];
+  }
+  *dst = reversed;
+}
+
+/*
+ * 64-bit byte swap in three steps: swap adjacent bytes, then adjacent 16-bit
+ * halves, then the two 32-bit halves.
+ */
+template <>
+__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) {
+  uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) |
+                     ((val >> 8) & 0x00FF00FF00FF00FFULL);
+
+  new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) |
+            ((new_val >> 16) & 0x0000FFFF0000FFFFULL);
+
+  *dst = (new_val << 32) | (new_val >> 32);
+}
+
+// Signed 64-bit: reuse the unsigned implementation on the same storage.
+template <>
+__device__ void swap_endian_store(int64_t *dst, const int64_t val) {
+  swap_endian_store(reinterpret_cast<uint64_t*>(dst),
+                    static_cast<uint64_t>(val));
+}
+
+/*
+ * 32-bit byte swap in two steps: swap adjacent bytes, then the two 16-bit
+ * halves.
+ */
+template <>
+__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) {
+  uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
+
+  *dst = (new_val << 16) | (new_val >> 16);
+}
+
+// Signed 32-bit: reuse the unsigned implementation on the same storage.
+template <>
+__device__ void swap_endian_store(int32_t *dst, const int32_t val) {
+  swap_endian_store(reinterpret_cast<uint32_t*>(dst),
+                    static_cast<uint32_t>(val));
+}
+
+// 16-bit byte swap: exchange the high and low byte.
+template <>
+__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) {
+  *dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF);
+}
+
+// Signed 16-bit: reuse the unsigned implementation on the same storage.
+template <>
+__device__ void swap_endian_store(int16_t *dst, const int16_t val) {
+  swap_endian_store(reinterpret_cast<uint16_t*>(dst),
+                    static_cast<uint16_t>(val));
+}
+
+}  // namespace rocshmem
diff --git a/src/gda/endian.hpp b/src/gda/endian.hpp
new file mode 100644
index 0000000000..61663cdbcb
--- /dev/null
+++ b/src/gda/endian.hpp
@@ -0,0 +1,62 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_ENDIAN_HPP_
+#define LIBRARY_SRC_GDA_ENDIAN_HPP_
+
+#include <hip/hip_runtime.h>
+
+namespace rocshmem {
+
+/*
+ * Stores val into *dst with its byte order reversed.
+ *
+ * Only the declaration of the primary template is visible here; explicit
+ * specializations for the 16-, 32- and 64-bit signed and unsigned integer
+ * types are declared below so that callers pick them up.
+ */
+template <typename T>
+__device__ void swap_endian_store(T *dst, const T val);
+
+template <>
+__device__ void swap_endian_store(uint64_t *dst, const uint64_t val);
+
+template <>
+__device__ void swap_endian_store(int64_t *dst, const int64_t val);
+
+template <>
+__device__ void swap_endian_store(uint32_t *dst, const uint32_t val);
+
+template <>
+__device__ void swap_endian_store(int32_t *dst, const int32_t val);
+
+template <>
+__device__ void swap_endian_store(uint16_t *dst, const uint16_t val);
+
+template <>
+__device__ void swap_endian_store(int16_t *dst, const int16_t val);
+
+/*
+ * Value-returning convenience wrapper: byte-swaps val through
+ * swap_endian_store into a local and returns that local.
+ */
+template <typename T>
+__device__ T swap_endian_val(const T val) {
+  T dst;
+  swap_endian_store(&dst, val);
+  return dst;
+}
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_ENDIAN_HPP_
diff --git a/src/gda/gda_context_proxy.hpp b/src/gda/gda_context_proxy.hpp
new file mode 100644
index 0000000000..14cac518f5
--- /dev/null
+++ b/src/gda/gda_context_proxy.hpp
@@ -0,0 +1,102 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_CONTEXT_PROXY_HPP_
+#define LIBRARY_SRC_GDA_CONTEXT_PROXY_HPP_
+
+
+#include "device_proxy.hpp"
+#include "backend_gda.hpp"
+
+namespace rocshmem {
+
+class GDABackend;
+
+/*
+ * Owns the memory for the default GDAContext (allocated through a
+ * DeviceProxy using ALLOCATOR) and the lifetime of the object placed in it.
+ *
+ * The context is created with placement new, so it has to be destroyed
+ * explicitly; constructed_ records whether this proxy currently owns a live
+ * object. The type is move-only, and ownership of the live object travels
+ * with the move so that it is destroyed exactly once.
+ */
+template <typename ALLOCATOR>
+class GDADefaultContextProxy {
+  using ProxyT = DeviceProxy<ALLOCATOR, GDAContext>;
+
+ public:
+  GDADefaultContextProxy() = default;
+
+  /*
+   * Placement new the memory which is allocated by proxy_, attach the team
+   * info and register the result as the internal context.
+   */
+  explicit GDADefaultContextProxy(GDABackend* backend, TeamInfo *tinfo,
+                                  size_t num_elems = 1)
+  : proxy_{num_elems}, constructed_{true} {
+    auto ctx{proxy_.get()};
+    new (ctx) GDAContext(reinterpret_cast<Backend*>(backend), 0);
+    ctx->tinfo = tinfo;
+    rocshmem_ctx_t local{ctx, tinfo};
+    set_internal_ctx(&local);
+  }
+
+  /*
+   * Since placement new is called in the constructor, then
+   * delete must be called manually.
+   */
+  ~GDADefaultContextProxy() {
+    destroy();
+  }
+
+  GDADefaultContextProxy(const GDADefaultContextProxy& other) = delete;
+
+  GDADefaultContextProxy& operator=(const GDADefaultContextProxy& other) = delete;
+
+  /*
+   * Move construction takes over the storage and the responsibility for
+   * destroying the context; the source is left non-owning so its destructor
+   * does not touch the moved-from storage.
+   */
+  GDADefaultContextProxy(GDADefaultContextProxy&& other)
+  : proxy_(std::move(other.proxy_)), constructed_{other.constructed_} {
+    other.constructed_ = false;
+  }
+
+  /*
+   * Move assignment destroys any context this proxy already owns, then takes
+   * over the storage and ownership state of the source.
+   */
+  GDADefaultContextProxy& operator=(GDADefaultContextProxy&& other) {
+    if (this != &other) {
+      destroy();
+      proxy_ = std::move(other.proxy_);
+      constructed_ = other.constructed_;
+      other.constructed_ = false;
+    }
+    return *this;
+  }
+
+  /*
+   * @brief Provide access to the memory referenced by the proxy
+   */
+  __host__ __device__ Context* get() { return proxy_.get(); }
+
+ private:
+  /*
+   * @brief Run the context destructor if this proxy owns a live object
+   */
+  void destroy() {
+    if (constructed_) {
+      proxy_.get()->~GDAContext();
+      constructed_ = false;
+    }
+  }
+
+  /*
+   * @brief Memory managed by the lifetime of this object
+   */
+  ProxyT proxy_{};
+
+  /*
+   * @brief denotes if an object was constructed in proxy
+   */
+  bool constructed_{false};
+};
+
+using GDADefaultContextProxyT = GDADefaultContextProxy<HIPAllocator>;
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_CONTEXT_PROXY_HPP_
diff --git a/src/gda/gda_team.cpp b/src/gda/gda_team.cpp
new file mode 100644
index 0000000000..d419ada7a7
--- /dev/null
+++ b/src/gda/gda_team.cpp
@@ -0,0 +1,54 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "gda_team.hpp"
+
+#include "constants.hpp"
+#include "backend_type.hpp"
+#include "backend_gda.hpp"
+
+namespace rocshmem {
+
+/*
+ * Builds a GDA team on top of the generic Team state and points its sync and
+ * work buffers at the pool_index-th slice of the pools owned by the
+ * GDABackend passed in as `backend`.
+ */
+GDATeam::GDATeam(Backend *backend, TeamInfo *team_info_parent,
+                 TeamInfo *team_info_world, int num_pes, int my_pe,
+                 MPI_Comm mpi_comm, int pool_index)
+    : Team(backend, team_info_parent, team_info_world, num_pes, my_pe,
+           mpi_comm) {
+  type = BackendType::GDA_BACKEND;
+  const auto *gda_backend = static_cast<const GDABackend *>(backend);
+
+  pool_index_ = pool_index;
+
+  // First entry of this team's slice in each sync pool.
+  const auto barrier_slot = pool_index * ROCSHMEM_BARRIER_SYNC_SIZE;
+  const auto reduce_slot = pool_index * ROCSHMEM_REDUCE_SYNC_SIZE;
+  const auto bcast_slot = pool_index * ROCSHMEM_BCAST_SYNC_SIZE;
+  const auto alltoall_slot = pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE;
+
+  barrier_pSync = &(gda_backend->barrier_pSync_pool[barrier_slot]);
+  reduce_pSync = &(gda_backend->reduce_pSync_pool[reduce_slot]);
+  bcast_pSync = &(gda_backend->bcast_pSync_pool[bcast_slot]);
+  alltoall_pSync = &(gda_backend->alltoall_pSync_pool[alltoall_slot]);
+
+  // The work pools are addressed in bytes; each team's slice is sized for
+  // the largest element type used (double).
+  const auto wrk_offset =
+      ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
+  const auto ata_offset =
+      ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
+
+  pWrk = reinterpret_cast<char *>(gda_backend->pWrk_pool) + wrk_offset;
+  pAta = reinterpret_cast<char *>(gda_backend->pAta_pool) + ata_offset;
+}
+
+// The team does not own the pool slices, so there is nothing to release.
+GDATeam::~GDATeam() = default;
+
+}  // namespace rocshmem
diff --git a/src/gda/gda_team.hpp b/src/gda/gda_team.hpp
new file mode 100644
index 0000000000..4d4a4e54b0
--- /dev/null
+++ b/src/gda/gda_team.hpp
@@ -0,0 +1,52 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_TEAM_HPP_
+#define LIBRARY_SRC_GDA_TEAM_HPP_
+
+#include "team.hpp"
+
+namespace rocshmem {
+
+/*
+ * Team specialization for the GDA backend. On top of the generic Team state
+ * it carries pointers to the per-team synchronization arrays and work
+ * buffers, plus the index of the pool slice they were taken from.
+ */
+class GDATeam : public Team {
+ public:
+  GDATeam(Backend* handle, TeamInfo* team_info_wrt_parent,
+          TeamInfo* team_info_wrt_world, int num_pes, int my_pe,
+          MPI_Comm team_comm, int pool_index);
+
+  virtual ~GDATeam();
+
+  // Per-collective synchronization arrays for this team.
+  long* barrier_pSync{nullptr};
+  long* reduce_pSync{nullptr};
+  long* bcast_pSync{nullptr};
+  long* alltoall_pSync{nullptr};
+  // Scratch buffers: pWrk is used by the reductions; pAta is presumably the
+  // all-to-all work area -- TODO confirm against its users.
+  void* pWrk{nullptr};
+  void* pAta{nullptr};
+
+  // Index of this team's slice in the backend pools; -1 until assigned.
+  int pool_index_{-1};
+};
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_TEAM_HPP_
diff --git a/src/gda/queue_pair.cpp b/src/gda/queue_pair.cpp
new file mode 100644
index 0000000000..e44292d364
--- /dev/null
+++ b/src/gda/queue_pair.cpp
@@ -0,0 +1,623 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "queue_pair.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include "backend_gda.hpp"
+#include "endian.hpp"
+#if !defined(GDA_IONIC) && !defined(GDA_BNXT)
+#include "segment_builder.hpp"
+#endif
+#include "util.hpp"
+#include "constants.hpp"
+
+namespace rocshmem {
+
+QueuePair::QueuePair(struct ibv_pd* pd) {
+  // Registers two zeroed device buffers for atomics with pd and builds the fetch-slot free list.
+  allocator.allocate((void**)&nonfetching_atomic, 8);
+  CHECK_HIP(hipMemset(nonfetching_atomic, 0, 8));
+  const int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
+  ibv_mr *mr = ibv_reg_mr(pd, nonfetching_atomic, 8, access);
+  CHECK_NNULL(mr, "ibv_reg_mr");
+
+#if defined(GDA_IONIC) || defined(GDA_BNXT)
+  nonfetching_atomic_lkey = mr->lkey;
+#else
+  nonfetching_atomic_lkey = htobe32(mr->lkey);
+#endif
+  // NOTE(review): mr is overwritten below and never stored, so neither region can be deregistered later.
+  allocator.allocate((void**)&fetching_atomic, 8 * FETCHING_ATOMIC_CNT);
+  CHECK_HIP(hipMemset(fetching_atomic, 0, 8 * FETCHING_ATOMIC_CNT));
+  mr = ibv_reg_mr(pd, fetching_atomic, 8 * FETCHING_ATOMIC_CNT, access);
+  CHECK_NNULL(mr, "ibv_reg_mr");
+#if defined(GDA_IONIC) || defined(GDA_BNXT)
+  fetching_atomic_lkey = mr->lkey;
+#else
+  fetching_atomic_lkey = htobe32(mr->lkey);
+#endif
+
+  // Placement-new needs storage for the whole FreeListT object, not just for a pointer to one.
+  allocator.allocate((void**)&fetching_atomic_freelist, sizeof(FreeListT));
+  new (fetching_atomic_freelist) FreeListT();
+  for(int i{0}; i < FETCHING_ATOMIC_CNT; i+=WF_SIZE) {
+    fetching_atomic_freelist->push_back(fetching_atomic + i);
+  }
+}
+
+
+/******************************************************************************
+ ************************ PROVIDER-SPECIFIC HELPERS ***************************
+ *****************************************************************************/
+#ifdef GDA_IONIC
+__device__ uint64_t QueuePair::get_same_qp_lane_mask() {
+  uint64_t lane_mask = get_active_lane_mask();
+  uintptr_t this_val = reinterpret_cast<uintptr_t>(this);
+  // Returns the active-lane mask restricted to lanes whose `this` matches the calling lane's.
+  // Compare this lane's QueuePair address against each active lane's and clear the bits that differ.
+  #pragma unroll
+  for (int i = 0; i < 64; ++i) {
+    uint64_t bit_i = 1ull << i;
+    if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) {
+      lane_mask &= ~bit_i;
+    }
+  }
+
+  return lane_mask;
+}
+
+__device__ bool QueuePair::cq_lock_try_acquire(uint64_t activemask) {
+  uint32_t cq_lock_val = SPIN_LOCK_INVALID;
+  // One lane attempts UNLOCKED -> LOCKED; if the exchange fails, cq_lock_val receives the observed value.
+  if (is_first_active_lane(activemask)) {
+    cq_lock_val = SPIN_LOCK_UNLOCKED;
+    __hip_atomic_compare_exchange_strong(&cq_lock, &cq_lock_val, SPIN_LOCK_LOCKED,
+                                         __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  cq_lock_val = __shfl(cq_lock_val, get_first_active_lane_id(activemask));
+  // Every lane now holds that lane's result: still UNLOCKED means the exchange succeeded.
+  return (cq_lock_val == SPIN_LOCK_UNLOCKED);
+}
+// Counterpart of cq_lock_try_acquire: one lane stores UNLOCKED back with release ordering.
+__device__ void QueuePair::cq_lock_release(uint64_t activemask) {
+  if (is_first_active_lane(activemask)) {
+    __hip_atomic_store(&cq_lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
+  }
+}
+
+__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) {
+  uint32_t my_sq_prod = 0;
+  // Returns the pre-increment sq_prod, i.e. the position of the first wqe claimed by this call.
+  // one lane advances sq_prod by num_wqes; the old value is then broadcast with __shfl
+  if (is_first_active_lane(activemask)) {
+    my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask));
+
+  // wait for that space to be available: sq_msn must reach (end of reservation - sq_mask)
+  quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask);
+
+  return my_sq_prod;
+}
+
+__device__ uint32_t QueuePair::commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe) {
+  uint32_t dbprod = my_sq_prod + num_wqes;
+  // Every caller returns dbprod; only the caller with last == true touches the doorbell state.
+  if (last) {
+    // set the SIG flag on the last wqe before ringing the doorbell
+    wqe->base.flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
+    // wait until sq_dbprod equals our start position, i.e. every earlier reservation has rung
+    while (__hip_atomic_load(&sq_dbprod, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT) != my_sq_prod) {
+      // spin
+    }
+
+    ring_doorbell(dbprod);
+    // publish the new position so the waiter of the next reservation can proceed
+    __hip_atomic_exchange(&sq_dbprod, dbprod, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
+  }
+
+  return dbprod;
+}
+
+__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
+  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
+  uint32_t my_cq_pos = cq_pos + my_logical_lane_id;
+  /* Each lane examines one cqe: the slot at cq_pos offset by my_logical_lane_id. */
+  /* Look at the cqe at the current position in the cq buffer */
+  struct ionic_v1_cqe *cqe = &cq_buf[my_cq_pos & cq_mask];
+
+  /* Expected color alternates per pass over the ring: 0 when bit (cq_mask + 1) of the position is set */
+  uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
+  uint32_t qtf_color_exp = qtf_color_bit;
+  if (my_cq_pos & (cq_mask + 1)) {
+    qtf_color_exp = 0;
+  }
+
+  /* Wait for at least one thread cqe color == expected color */
+  uint32_t qtf_be;
+  bool ready;
+  uint64_t ballot_ready;
+  do {
+    qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
+    ready = (qtf_be & qtf_color_bit) == qtf_color_exp;
+    ballot_ready = __ballot(ready);
+  } while (!ballot_ready);
+
+  /* Other threads saw a ready cqe, but not this thread */
+  if (!ready) {
+    return;
+  }
+
+  uint32_t msn = swap_endian_val<uint32_t>(cqe->send.msg_msn);
+
+  /* Report if the completion indicates an error. */
+  if (!!(qtf_be & swap_endian_val<uint32_t>(IONIC_V1_CQE_ERROR))) {
+#ifdef DEBUG
+    uint32_t qtf = swap_endian_val<uint32_t>(qtf_be);
+    uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT;
+    uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
+    uint32_t flag = qtf & 0xf;
+    uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
+    uint64_t npg = swap_endian_val<uint64_t>(cqe->send.npg_wqe_id);
+
+    printf("QUIET ERROR: qid %u type %u flag %#x status %u msn %u npg %lu\n",
+        qid, type, flag, status, msn, npg);
+#endif
+    /* No other way to signal an error, so just crash. */
+    abort();
+  }
+
+  /* Only the highest lane that saw a ready cqe (the furthest ahead one) updates the shared cq/sq state */
+  uint64_t my_lane_mask = 1ull << __lane_id();
+  uint64_t lesser_lane_mask = my_lane_mask - 1;
+  if (my_lane_mask != (ballot_ready & ~lesser_lane_mask)) {
+    return;
+  }
+
+  /* update position in the cq */
+  cq_pos = my_cq_pos + 1;
+
+  /*
+   * Ring cq doorbell frequently enough to avoid cq full.
+   *
+   * NB: IONIC_CQ_GRACE is 100
+   */
+  if (((cq_pos - cq_dbpos) & cq_mask) >= 100) {
+    cq_dbpos = cq_pos;
+    __atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); // TODO: maybe relaxed?
+  }
+  /* Record the msn carried by this cqe; quiet_internal() compares sq_msn against its target. */
+  sq_msn = msn;
+}
+
+__device__ void QueuePair::quiet_internal(uint64_t activemask, uint32_t cons) {
+  /* Wait until sq_msn has caught up with or passed cons.                                   */
+  /* 0x800000 is the sign bit of a 24-bit difference: it is set while sq_msn is behind cons. */
+  while ((sq_msn - cons) & 0x800000) {
+    if (!cq_lock_try_acquire(activemask)) {
+      continue;
+    }
+
+    /* with lock acquired, this wave polls cqes until caught up */
+    while ((sq_msn - cons) & 0x800000) {
+      poll_wave_cqes(activemask);
+    }
+    /* target reached: release the lock and leave the outer loop */
+    cq_lock_release(activemask);
+    break;
+  }
+}
+#endif // GDA_IONIC
+
+#ifndef GDA_BNXT
+#ifdef GDA_IONIC
+__device__ void QueuePair::ring_doorbell(uint32_t pos) {
+  // TODO When threads write at once to the same address, not all writes reach the bus; hence one lane id is matched per iteration.
+  for (int i = 0; i < 64; ++i) {
+    if (__lane_id() == i) {
+      __threadfence();
+      __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST);
+    }
+  }
+  __threadfence();
+}
+#else // !GDA_IONIC
+__device__ void QueuePair::ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) {
+  swap_endian_store(const_cast<uint32_t*>(dbrec), (uint32_t)my_sq_counter);
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+  // dbrec is updated first; then db_val goes to the current doorbell address and bit 0x100 of that address is flipped for the next call.
+  __hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
+  uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  db_uint ^= 0x100;
+  __hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+#endif // !GDA_IONIC
+#endif // !GDA_BNXT
+
+#ifndef GDA_BNXT
+#ifdef GDA_IONIC
+__device__ void QueuePair::quiet() {
+  quiet_internal(get_same_qp_lane_mask(), sq_prod);
+}
+#else // !GDA_IONIC
+__device__ void QueuePair::quiet() {
+  constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE;
+  __shared__ uint64_t wqe_broadcast[BROADCAST_SIZE];
+  uint8_t wavefront_id = get_flat_block_id() / WF_SIZE;
+  wqe_broadcast[wavefront_id] = 0;
+  // wqe_broadcast is in __shared__ memory; this wavefront only uses the slot indexed by wavefront_id.
+  uint64_t activemask = get_active_lane_mask();
+  uint8_t num_active_lanes = get_active_lane_count(activemask);
+  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+  // Keep claiming and draining cqes; the only exit is the return taken when posted == completed.
+  while (true) {
+    bool done{false};
+    uint64_t quiet_amount{0};
+    uint64_t wave_cq_consumer{0};
+    while (!done) {
+      uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      if (!(posted - completed)) {
+        return;
+      }
+      int64_t quiet_val = posted - active;
+      if (quiet_val <= 0) {
+        continue;
+      }
+      quiet_amount = min(num_active_lanes, quiet_val);
+      if (is_leader) {
+        done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+        if (done) {
+          wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+        }
+      }
+      done = __shfl(done, leader_phys_lane_id);
+    }
+    wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id);
+    uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id;
+    uint64_t my_cq_index = my_cq_consumer % cq_cnt;
+    // Lanes below quiet_amount each consume one cqe; the others skip to the leader section.
+    if (my_logical_lane_id < quiet_amount) {
+      volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index];
+      uint16_t be_wqe_counter{0};
+      uint8_t op_own{0};
+      uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1;
+      bool vote_failed{true};
+      // Spin until quiet_amount lanes all see a cqe with the expected owner bit and a valid opcode.
+      while (vote_failed) {
+        op_own = *((volatile uint8_t*)&cqe_entry->op_own);
+        bool my_ownership_vote = (op_own & 1) == owner_bit;
+        bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID;
+        uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote);
+        vote_failed = __popcll(votes) < quiet_amount;
+        if (!vote_failed) {
+          be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter);
+        }
+      }
+      // Byte-swap the counter, look up the recorded wqe id, then mark the cqe invalid again.
+      uint16_t wqe_counter;
+      swap_endian_store(const_cast<uint16_t*>(&wqe_counter), reinterpret_cast<uint16_t>(be_wqe_counter));
+      uint64_t wqe_id =  outstanding_wqes[wqe_counter];
+      __hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+      uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit;
+      *((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits;
+      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    }
+    if (is_leader) {
+      uint64_t completed {0};
+      do {
+        completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      } while (completed != wave_cq_consumer);
+      // Earlier claims have retired (quiet_completed == wave_cq_consumer): update the cq doorbell record.
+      swap_endian_store(const_cast<uint32_t*>(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount));
+      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+      // Publish the highest wqe id recorded by this wavefront, then retire this claim.
+      uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id];
+      __hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      __hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+  }
+}
+#endif // !GDA_IONIC
+#endif // !GDA_BNXT
+
+#ifndef GDA_BNXT
+#ifdef GDA_IONIC
+__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
+  uint64_t activemask = get_same_qp_lane_mask();
+  uint32_t num_wqes = get_active_lane_count(activemask);
+  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
+  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
+  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
+  struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask];
+  // Each lane fills one wqe, at the reserved start position offset by my_logical_lane_id.
+  // TODO why is this needed?
+  if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) {
+    size = 1;
+  }
+
+  wqe->base.wqe_id = my_sq_pos;
+  wqe->base.op = opcode;
+  wqe->base.num_sge_key = size ? 1 : 0;
+  wqe->base.flags = swap_endian_val<uint16_t>(0);
+  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
+
+  wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
+  wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
+  wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
+  wqe->common.length = swap_endian_val<uint32_t>(size);
+  // Writes of at most inline_threshold bytes are copied into the wqe; anything else uses one sge.
+  if (size) {
+    if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
+      wqe->base.flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
+      wqe->base.num_sge_key = 0;
+      if (!laddr) {
+        // TODO why is this needed?
+        wqe->common.pld.data[0] = 1;
+      } else {
+        memcpy(wqe->common.pld.data, laddr, size);
+      }
+    } else {
+      wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(laddr));
+      wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
+      wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
+    }
+  }
+  // Only the last active lane sets SIG and rings the doorbell inside commit_sq.
+  commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe);
+}
+#else // !GDA_IONIC
+__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
+  uint64_t activemask = get_active_lane_mask();
+  uint8_t num_active_lanes = get_active_lane_count(activemask);
+  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+  uint8_t num_wqes{num_active_lanes};
+  uint64_t wave_sq_counter{0};
+  // Leader reserves num_wqes consecutive sq_posted values; each lane takes start + my_logical_lane_id.
+  if (is_leader) {
+    wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
+  uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
+  uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
+  // Wait until the ring has room for this wave's last entry, calling quiet() to drain completions if not.
+  while (true) {
+    uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    int64_t num_active_sq_entries = db_touched - sunk;
+    if (num_active_sq_entries < 0) {
+      continue;
+    }
+    uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
+    uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
+    if (num_free_entries > num_entries_until_wave_last_entry) {
+      break;
+    }
+    quiet();
+  }
+  // Record the full 64-bit counter; quiet() reads this table back using the cqe's 16-bit wqe_counter.
+  outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
+
+  SegmentBuilder seg_build(my_sq_index, sq_buf);
+  seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0);
+  seg_build.update_raddr_seg(raddr, rkey);
+  seg_build.update_data_seg(laddr, size, lkey);
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+  // Leader rings the doorbell once sq_db_touched == wave_sq_counter, i.e. earlier reservations are done.
+  if (is_leader) {
+    uint64_t db_touched {0};
+    do {
+      db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    } while (db_touched != wave_sq_counter);
+    // The doorbell value is the first 8 bytes of the wave's last wqe (wqes are indexed at a 64-byte stride).
+    uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
+    uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
+    ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
+
+    __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+}
+#endif // !GDA_IONIC
+#endif // !GDA_BNXT
+
+#ifndef GDA_BNXT
+#ifdef GDA_IONIC
+__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
+                                            int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
+  uint64_t activemask = get_same_qp_lane_mask();
+  uint32_t num_wqes = get_active_lane_count(activemask);
+  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
+  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
+  struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask];
+  uint32_t cons;
+  // Fetching ops: the leader pops one free-list entry and each lane uses word [my_logical_lane_id] of it.
+  uint64_t* wave_fetch_atomic{nullptr};
+  if (fetching) {
+    if (is_leader) {
+      auto res = fetching_atomic_freelist->pop_front();
+      while (!res.success) {
+        res = fetching_atomic_freelist->pop_front();
+      }
+      wave_fetch_atomic = res.value;
+    }
+    wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
+  }
+
+  wqe->base.wqe_id = my_sq_pos;
+  wqe->base.op = opcode;
+  wqe->base.num_sge_key = 1;
+  wqe->base.flags = swap_endian_val<uint16_t>(0);
+  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
+
+  wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
+  wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
+  wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
+  wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
+  wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
+  wqe->atomic_v2.compare_high = swap_endian_val<uint32_t>(atomic_cmp >> 32);
+  wqe->atomic_v2.compare_low = swap_endian_val<uint32_t>(atomic_cmp);
+  // local_va points at this lane's fetch word, or at the shared nonfetching_atomic word otherwise.
+  if (fetching) {
+    wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(wave_fetch_atomic + my_logical_lane_id));
+    wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(fetching_atomic_lkey);
+  } else {
+    wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(nonfetching_atomic));
+    wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
+  }
+
+  cons = commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe);
+  // Fetching ops wait until sq_msn reaches cons, read their word, and the leader returns the entry.
+  uint64_t ret{0};
+  if (fetching) {
+    quiet_internal(activemask, cons);
+    ret = wave_fetch_atomic[my_logical_lane_id];
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    if (is_leader) {
+      fetching_atomic_freelist->push_back(wave_fetch_atomic);
+    }
+  }
+  return ret;
+}
+#else // !GDA_IONIC
+__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
+                                            int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
+  uint64_t activemask = get_active_lane_mask();
+  uint8_t num_active_lanes = get_active_lane_count(activemask);
+  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+  uint8_t num_wqes{num_active_lanes};
+  uint64_t wave_sq_counter{0};
+  // Leader reserves num_wqes consecutive sq_posted values; each lane takes start + my_logical_lane_id.
+  if (is_leader) {
+    wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
+  uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
+  uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
+  // Wait until the ring has room for this wave's last entry, calling quiet() to drain completions if not.
+  while (true) {
+    uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    int64_t num_active_sq_entries = db_touched - sunk;
+    if (num_active_sq_entries < 0) {
+      continue;
+    }
+    uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
+    uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
+    if (num_free_entries > num_entries_until_wave_last_entry) {
+      break;
+    }
+    quiet();
+  }
+  // Fetching ops: once sq_db_touched == wave_sq_counter, the leader pops one free-list entry for the wave.
+  uint64_t* wave_fetch_atomic{nullptr};
+  if (fetching) {
+    if (is_leader) {
+      uint64_t db_touched {0};
+      do {
+        db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      } while (db_touched != wave_sq_counter);
+
+      auto res = fetching_atomic_freelist->pop_front();
+      while (!res.success) {
+        res = fetching_atomic_freelist->pop_front();
+      }
+      wave_fetch_atomic = res.value;
+    }
+    wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
+  }
+  // Record the full 64-bit counter; quiet() reads this table back using the cqe's 16-bit wqe_counter.
+  outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
+
+  SegmentBuilder seg_build(my_sq_index, sq_buf);
+  seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0);
+  seg_build.update_raddr_seg(raddr, rkey);
+  seg_build.update_atomic_seg(atomic_data, atomic_cmp);
+  if (fetching) {
+    seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey);
+  } else {
+    seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey);
+  }
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+  // Leader rings the doorbell once sq_db_touched == wave_sq_counter, i.e. earlier reservations are done.
+  if (is_leader) {
+    uint64_t db_touched {0};
+    do {
+      db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    } while (db_touched != wave_sq_counter);
+    // The doorbell value is the first 8 bytes of the wave's last wqe (wqes are indexed at a 64-byte stride).
+    uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
+    uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
+    ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
+
+    __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  // Fetching ops call quiet(), read this lane's word, and the leader returns the entry to the free list.
+  uint64_t ret{0};
+  if (fetching) {
+    quiet();
+    ret = wave_fetch_atomic[my_logical_lane_id];
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    if (is_leader) {
+      fetching_atomic_freelist->push_back(wave_fetch_atomic);
+    }
+  }
+  return ret;
+}
+#endif // !GDA_IONIC
+#endif // !GDA_BNXT
+
+/******************************************************************************
+ ****************************** SHMEM INTERFACE *******************************
+ *****************************************************************************/
+__device__ void QueuePair::put_nbi(void *dest, const void *source, size_t nelems, int pe) {
+  // NOTE(review): nelems (size_t) narrows to post_wqe_rma's int32_t size parameter - confirm callers stay below 2 GiB.
+  auto *local_addr = reinterpret_cast<uintptr_t*>(const_cast<void*>(source));
+  post_wqe_rma(pe, nelems, local_addr, reinterpret_cast<uintptr_t*>(dest), GDA_OP_RDMA_WRITE);
+}
+
+__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe, uint8_t atomic_op) {
+  // Posts an 8-byte atomic with fetching == true and hands post_wqe_amo's result back to the caller.
+  return post_wqe_amo(pe, sizeof(int64_t), reinterpret_cast<uintptr_t*>(dest), atomic_op, atomic_data, atomic_cmp, true);
+}
+
+__device__ void QueuePair::atomic_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe, uint8_t atomic_op) {
+  // Posts an 8-byte atomic with fetching == false; post_wqe_amo's return value is discarded.
+  post_wqe_amo(pe, sizeof(int64_t), reinterpret_cast<uintptr_t*>(dest), atomic_op, atomic_data, atomic_cmp, false);
+}
+
+}  // namespace rocshmem
diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp
new file mode 100644
index 0000000000..eac8107648
--- /dev/null
+++ b/src/gda/queue_pair.hpp
@@ -0,0 +1,310 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_
+#define LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_
+
+/**
+ * @file queue_pair.hpp
+ *
+ * @section DESCRIPTION
+ * An IB QueuePair (SQ and CQ) that the device can use to perform network
+ * operations. Most important rocSHMEM operations are performed by this
+ * class.
+ */
+
+#include "rocshmem_config.h"
+#include "endian.h"
+#include "constants.hpp"
+#ifdef GDA_IONIC
+extern "C" {
+#include <infiniband/ionic_dv.h>
+#include <infiniband/ionic_fw.h>
+}
+#elif defined(GDA_BNXT)
+#include "bnxt/provider_gda_bnxt.hpp"
+#else
+#include <infiniband/mlx5dv.h>
+#endif
+
+#include "containers/free_list.hpp"
+#include "memory/hip_allocator.hpp"
+
+#ifdef GDA_IONIC
+#define GDA_MAX_ATOMIC     15
+#define GDA_OP_RDMA_WRITE  IONIC_V2_OP_RDMA_WRITE
+#define GDA_OP_ATOMIC_FA   IONIC_V2_OP_ATOMIC_FA
+#define GDA_OP_ATOMIC_CS   IONIC_V2_OP_ATOMIC_CS
+#elif !defined(GDA_BNXT)
+#define GDA_MAX_ATOMIC     1
+#define GDA_OP_RDMA_WRITE  MLX5_OPCODE_RDMA_WRITE
+#define GDA_OP_ATOMIC_FA   MLX5_OPCODE_ATOMIC_FA
+#define GDA_OP_ATOMIC_CS   MLX5_OPCODE_ATOMIC_CS
+#endif
+
+namespace rocshmem {
+
+class GDABackend;
+
+typedef union db_reg {
+  uint64_t *ptr;
+  uintptr_t uint;
+} db_reg_t;
+
+#define SPIN_LOCK_INVALID  0xdead
+#define SPIN_LOCK_UNLOCKED 0x1234
+#define SPIN_LOCK_LOCKED   0xabcd
+
+class QueuePair {
+ public:
+  friend GDABackend;
+
+  /**
+   * @brief Constructor.
+   */
+  explicit QueuePair(struct ibv_pd* pd);
+
+  /**
+   * @brief Create and enqueue a non-blocking put work queue entry (wqe).
+   *
+   * @param[in] dest Destination address for data transmission.
+   * @param[in] source Source address for data transmission.
+   * @param[in] nelems Size in bytes of data transmission.
+   * @param[in] pe Destination processing element of data transmission.
+   */
+  __device__ void put_nbi(void *dest, const void *source, size_t nelems, int pe);
+
+  /**
+   * @brief Empty all completions from the completion queue.
+   */
+  __device__ void quiet();
+
+  /**
+   * @brief Create and enqueue an atomic fetch work queue entry (wqe).
+   *
+   * @param[in] dest Destination address for data transmission.
+   * @param[in] value Data value for the atomic operation.
+   * @param[in] cond Used in atomic comparisons.
+   * @param[in] pe Destination processing element of data transmission.
+   * @param[in] atomic_op The atomic operation to perform.
+   *
+   * @return The value fetched by the atomic operation.
+   */
+  __device__ int64_t atomic_fetch(void *dest, int64_t value, int64_t cond, int pe, uint8_t atomic_op);
+
+  /**
+   * @brief Create and enqueue a non-fetching atomic work queue entry (wqe); no value is returned.
+   *
+   * @param[in] dest Destination address for data transmission.
+   * @param[in] value Data value for the atomic operation.
+   * @param[in] cond Used in atomic comparisons.
+   * @param[in] pe Destination processing element of data transmission.
+   * @param[in] atomic_op The atomic operation to perform.
+   */
+  __device__ void atomic_nofetch(void *dest, int64_t value, int64_t cond, int pe, uint8_t atomic_op);
+
+  char *const *base_heap{nullptr};
+
+ private:
+  /**
+   * @brief Helper method to build work requests for the send queue.
+   *
+   * @param[in] pe Destination processing element of data transmission.
+   * @param[in] size Size in bytes of data transmission.
+   * @param[in] raddr Remote address.
+   * @param[in] opcode Operation to be performed.
+   * @param[in] atomic_data An atomic data value to be used.
+   * @param[in] atomic_cmp Compare operand for the atomic operation.
+   * @param[in] fetch True if the operation returns a value; the method returns the fetched word, or 0 when false.
+   */
+  __device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
+
+  /**
+   * @brief Helper method to build work requests for the send queue.
+   *
+   * @param[in] pe Destination processing element of data transmission.
+   * @param[in] size Size in bytes of data transmission.
+   * @param[in] laddr Local address.
+   * @param[in] raddr Remote address.
+   * @param[in] opcode Operation to be performed.
+   */
+  __device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
+
+  /**
+   * @brief Helper methods to ring the doorbell; the signature depends on the provider.
+   *
+   * @param[in] db_val Doorbell value written by the default (non-IONIC, non-BNXT) variant.
+   */
+#if defined(GDA_IONIC)
+  __device__ void ring_doorbell(uint32_t pos);
+#elif defined(GDA_BNXT)
+  __device__ void ring_sq_doorbell(uint32_t slot_idx);
+  __device__ void ring_cq_doorbell(uint32_t slot_idx);
+#else
+  __device__ void ring_doorbell(uint64_t db_val, uint64_t my_sq_counter);
+#endif
+
+#ifdef GDA_IONIC
+  __device__ uint64_t get_same_qp_lane_mask();
+
+  __device__ bool cq_lock_try_acquire(uint64_t active_lane_mask);
+  __device__ void cq_lock_release(uint64_t active_lane_mask);
+
+  /**
+   * @brief Reserve space in the sq to post this many wqes.
+   * @param active_lane_mask mask of the lanes taking part in this reservation.
+   * @param num_wqes number of sq wqes to reserve for this wave.
+   * @return position of the first wqe in the reservation.
+   */
+  __device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes);
+
+  /**
+   * @brief Ring the sq doorbell maintaining order between waves.
+   * @param last this is the last wqe posted in this wave.
+   * @param my_sq_prod position of the first wqe in the reservation (the value returned by reserve_sq).
+   * @param num_wqes number of sq wqes posted in this wave.
+   * @param wqe this thread's wqe.
+   * @return doorbell producer index.
+   */
+  __device__ uint32_t commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe);
+
+  /**
+   * @brief Helper method to poll the next completion queue entry.
+   */
+  __device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask);
+
+  /**
+   * @brief Helper method to drain completion queue entries on behalf of the lanes in active_lane_mask.
+   * @param cons wait for sq_msn to catch up to this position.
+   */
+  __device__ __attribute__((noinline)) void quiet_internal(uint64_t active_lane_mask, uint32_t cons);
+
+  uint64_t *cq_dbreg{nullptr};
+  uint64_t cq_dbval{0};
+  uint64_t cq_mask{0};
+  struct ionic_v1_cqe *cq_buf{nullptr};
+  uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
+  uint32_t cq_pos{0};
+  uint32_t cq_dbpos{0};
+
+  uint64_t *sq_dbreg{nullptr};
+  uint64_t sq_dbval{0};
+  uint64_t sq_mask{0};
+  struct ionic_v1_wqe *sq_buf{nullptr};
+  uint32_t sq_dbprod{0};
+  uint32_t sq_prod{0};
+  uint32_t sq_msn{0};
+
+  uint32_t inline_threshold{0};
+
+#elif defined(GDA_BNXT)
+  uint64_t *dbr;
+  struct bnxt_device_cq cq;
+  struct bnxt_device_sq sq;
+
+  __device__ int poll_cq();
+#else // !GDA_IONIC && !GDA_BNXT
+
+  db_reg_t db{};
+
+  uint64_t cq_consumer{0};
+  uint64_t quiet_posted{0};
+  uint64_t quiet_active{0};
+  uint64_t quiet_completed{0};
+
+  /*
+   * struct mlx5dv_cq {
+   *   void                    *buf;
+   *   __be32                  *dbrec;
+   *   uint32_t                cqe_cnt;
+   *   uint32_t                cqe_size;
+   *   void                    *cq_uar;
+   *   uint32_t                cqn;
+   *   uint64_t                comp_mask;
+   * };
+  */
+  mlx5_cqe64 *cq_buf{nullptr};
+  volatile uint32_t *cq_dbrec{nullptr};
+  uint32_t cq_cnt{0};
+  uint32_t cq_log_cnt{0};
+
+  /*
+   * struct mlx5dv_qp {
+   *   __be32 *dbrec;
+   *   struct {
+   *     void *buf;
+   *     uint32_t wqe_cnt;
+   *     uint32_t stride;
+   *   } sq;
+   *   struct {
+   *     void *buf;
+   *     uint32_t wqe_cnt;
+   *     uint32_t stride;
+   *   } rq;
+   *   struct {
+   *     void *reg;
+   *     uint32_t size;
+   *   } bf;
+   *   uint64_t comp_mask;
+   *   off_t uar_mmap_offset;
+   *   uint32_t tirn;
+   *   uint32_t tisn;
+   *   uint32_t rqn;
+   *   uint32_t sqn;
+   *   uint64_t tir_icm_addr;
+   * };
+   */
+  volatile uint32_t *dbrec{nullptr};
+  uint64_t *sq_buf{nullptr};
+  uint16_t sq_wqe_cnt{0};
+  uint64_t sq_posted{0};
+  uint64_t sq_db_touched{0};
+  uint64_t sq_sunk{0};
+
+  static constexpr size_t OUTSTANDING_TABLE_SIZE = 65536;
+  uint64_t outstanding_wqes[OUTSTANDING_TABLE_SIZE]{0};
+
+#endif // GDA_IONIC
+
+  uint32_t qp_num{0};
+  uint32_t rkey{0};
+  uint32_t lkey{0};
+
+  uint64_t* nonfetching_atomic{nullptr};
+  uint32_t nonfetching_atomic_lkey{0};
+
+  uint64_t* fetching_atomic{nullptr};
+  uint32_t fetching_atomic_lkey{0};
+
+  static const uint32_t FETCHING_ATOMIC_CNT{1024};
+  static_assert(FETCHING_ATOMIC_CNT % WF_SIZE == 0);
+  using FreeListT = FreeList<uint64_t*, HIPAllocator>;
+  FreeListT* fetching_atomic_freelist{nullptr};
+
+  HIPAllocator allocator{};
+};
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_
diff --git a/src/gda/segment_builder.cpp b/src/gda/segment_builder.cpp
new file mode 100644
index 0000000000..ca3033d103
--- /dev/null
+++ b/src/gda/segment_builder.cpp
@@ -0,0 +1,118 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "segment_builder.hpp"
+
+#include "util.hpp"
+#include "endian.hpp"
+
+namespace rocshmem {
+
+/*
+ * Points the builder's cursor at the first segment of WQE slot wqe_idx.
+ *
+ * base is treated as an array of mlx5_segment entries in which every WQE
+ * slot occupies SEGMENTS_PER_WQE consecutive entries.
+ */
+__device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) {
+  auto *first_segment = static_cast<mlx5_segment*>(base);
+  size_t first_segment_of_wqe = wqe_idx * SEGMENTS_PER_WQE;
+  segp = first_segment + first_segment_of_wqe;
+}
+
+/*
+ * Control segment - contains some control information for the current WQE.
+ *
+ * Output:
+ *      seg       - control segment to be filled
+ * Input:
+ *      pi        - WQEBB number of the first block of this WQE.
+ *                  This number should wrap at 0xffff, regardless of
+ *                  size of the WQ.
+ *      opcode    - Opcode of this WQE. Encodes the type of operation
+ *                  to be executed on the QP.
+ *      opmod     - Opcode modifier.
+ *      qp_num    - QP/SQ number this WQE is posted to.
+ *      fm_ce_se  - FM (fence mode), CE (completion and event mode)
+ *                  and SE (solicited event).
+ *      ds        - WQE size in octowords (16-byte units). DS accounts for all
+ *                  the segments in the WQE as summarized in WQE construction.
+ *      signature - WQE signature.
+ *      imm       - Immediate data/Invalidation key/UMR mkey.
+ */
+/*
+ * static MLX5DV_ALWAYS_INLINE
+ * void mlx5dv_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *seg, uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm)
+ * {
+ *   seg->opmod_idx_opcode   = htobe32(((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode);
+ *   seg->qpn_ds             = htobe32((qp_num << 8) | ds);
+ *   seg->fm_ce_se           = fm_ce_se;
+ *   seg->signature          = signature;
+ *   // The caller should prepare "imm" in advance based on WR opcode.
+ *   // For IBV_WR_SEND_WITH_IMM and IBV_WR_RDMA_WRITE_WITH_IMM,
+ *   // the "imm" should be assigned as is.
+ *   // For the IBV_WR_SEND_WITH_INV, it should be htobe32(imm).
+ *   seg->imm                = imm;
+ * }
+ */
+__device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm) {
+  // Clear the whole segment first so that fields not written below are zero.
+  segp->ctrl_seg = {0};
+
+  // Pack the two composite words in host order; swap_endian_store converts
+  // them while writing into the segment.
+  uint32_t opmod_idx_opcode = (static_cast<uint32_t>(opmod) << 24) | (static_cast<uint32_t>(pi) << 8) | opcode;
+  uint32_t qpn_ds = (qp_num << 8) | ds;
+  swap_endian_store(&segp->ctrl_seg.opmod_idx_opcode, opmod_idx_opcode);
+  swap_endian_store(&segp->ctrl_seg.qpn_ds, qpn_ds);
+
+  // Single-byte fields and the caller-prepared immediate are stored unchanged.
+  segp->ctrl_seg.signature = signature;
+  segp->ctrl_seg.fm_ce_se = fm_ce_se;
+  segp->ctrl_seg.imm = imm;
+
+  // Advance the cursor to the next segment of this WQE.
+  ++segp;
+}
+
+/*
+ * Remote address segment - remote virtual address and remote key of the
+ * target memory.
+ *
+ * Input:
+ *      raddr - remote address; stored byte-swapped as a 64-bit value.
+ *      rkey  - remote key; stored exactly as passed in.
+ *
+ * The parameter type is spelled uint64_t* so that this definition matches the
+ * declaration in segment_builder.hpp on every platform, not only where
+ * uintptr_t and uint64_t happen to be the same type.
+ */
+__device__ void SegmentBuilder::update_raddr_seg(uint64_t *raddr, uint32_t rkey) {
+  segp->raddr_seg = {0};
+  swap_endian_store(reinterpret_cast<uint64_t*>(&segp->raddr_seg.raddr), reinterpret_cast<uint64_t>(raddr));
+  // NOTE(review): unlike raddr, rkey is not byte-swapped here; presumably the
+  // caller supplies it already in the byte order the NIC expects - confirm.
+  segp->raddr_seg.rkey = rkey;
+  // Advance the cursor to the next segment of this WQE.
+  segp++;
+}
+
+/*
+ * Data Segments - contain pointers and a byte count for the scatter/gather list.
+ * They can optionally contain data, which will save a memory read access for
+ * gather Work Requests.
+ */
+/*
+ * static MLX5DV_ALWAYS_INLINE
+ * void mlx5dv_set_data_seg(struct mlx5_wqe_data_seg *seg, uint32_t length, uint32_t lkey, uintptr_t address) {
+ *   seg->byte_count = htobe32(length);
+ *   seg->lkey       = htobe32(lkey);
+ *   seg->addr       = htobe64(address);
+ * }
+ */
+/*
+ * Fills the data segment under the cursor and advances the cursor.
+ *
+ * Input:
+ *      address - local buffer address; stored byte-swapped as a 64-bit value.
+ *      length  - byte count; stored byte-swapped.
+ *      lkey    - local key; stored exactly as passed in.
+ *
+ * The parameter type is spelled uint64_t* so that this definition matches the
+ * declaration in segment_builder.hpp on every platform, not only where
+ * uintptr_t and uint64_t happen to be the same type.
+ */
+__device__ void SegmentBuilder::update_data_seg(uint64_t *address, uint32_t length, uint32_t lkey) {
+  segp->data_seg = {0};
+  swap_endian_store(&segp->data_seg.byte_count, length);
+  // NOTE(review): lkey is not byte-swapped here although mlx5dv_set_data_seg
+  // uses htobe32(lkey); presumably the caller passes a pre-swapped key - confirm.
+  segp->data_seg.lkey = lkey;
+  swap_endian_store(reinterpret_cast<uint64_t*>(&segp->data_seg.addr), reinterpret_cast<uint64_t>(address));
+  // Advance the cursor to the next segment of this WQE.
+  segp++;
+}
+
+/*
+ * Fills the atomic segment under the cursor and advances the cursor.
+ *
+ * Input:
+ *      atomic_data - value written to the swap_add field (byte-swapped).
+ *      atomic_cmp  - value written to the compare field (byte-swapped).
+ */
+__device__ void SegmentBuilder::update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp) {
+  // Zero the segment before filling in the two operands.
+  segp->atomic_seg = {0};
+
+  uint64_t *swap_add_field = reinterpret_cast<uint64_t*>(&segp->atomic_seg.swap_add);
+  uint64_t *compare_field = reinterpret_cast<uint64_t*>(&segp->atomic_seg.compare);
+  swap_endian_store(swap_add_field, atomic_data);
+  swap_endian_store(compare_field, atomic_cmp);
+
+  ++segp;
+}
+
+}  // namespace rocshmem
diff --git a/src/gda/segment_builder.hpp b/src/gda/segment_builder.hpp
new file mode 100644
index 0000000000..130a9604f3
--- /dev/null
+++ b/src/gda/segment_builder.hpp
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_GDA_SEGMENT_BUILDER_HPP_
+#define LIBRARY_SRC_GDA_SEGMENT_BUILDER_HPP_
+
+#include <infiniband/mlx5dv.h>
+
+#include "util.hpp"
+
+namespace rocshmem {
+
+/*
+ * Writes the segments of one mlx5 work queue entry (WQE) one after another.
+ *
+ * The builder keeps a cursor (segp) into the queue buffer. Each update_*
+ * method fills the segment under the cursor and then moves the cursor to the
+ * next segment, so the methods must be called in the order in which the
+ * segments are laid out in the WQE.
+ */
+class SegmentBuilder {
+ public:
+  /*
+   * wqe_idx - index of the WQE slot to build.
+   * base    - start of the queue buffer holding the WQE slots.
+   */
+  __device__ SegmentBuilder(uint64_t wqe_idx, void *base);
+
+  /*
+   * struct mlx5_wqe_ctrl_seg {
+   *   __be32 opmod_idx_opcode;
+   *   __be32 qpn_ds;
+   *   uint8_t signature;
+   *   __be16 dci_stream_channel_id;
+   *   uint8_t fm_ce_se;
+   *   __be32 imm;
+   * } __attribute__((__packed__)) __attribute__((__aligned__(4)));
+   */
+  __device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm);
+
+  /*
+   * struct mlx5_wqe_raddr_seg {
+   *   __be64 raddr;
+   *   __be32 rkey;
+   *   __be32 reserved;
+   * };
+   */
+  __device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey);
+
+  /*
+   * struct mlx5_wqe_data_seg {
+   *   __be32 byte_count;
+   *   __be32 lkey;
+   *   __be64 addr;
+   * };
+   */
+  __device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey);
+
+  /*
+   * struct mlx5_wqe_atomic_seg {
+   *   __be64 swap_add;
+   *   __be64 compare;
+   * };
+   */
+  __device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp);
+
+ private:
+  /*
+   * Number of segment entries reserved for each WQE slot.
+   * Declared static constexpr so it is a class-wide compile-time constant:
+   * it takes no storage in each builder object and does not disable the
+   * implicitly generated copy assignment the way a const data member would.
+   */
+  static constexpr int SEGMENTS_PER_WQE = 4;
+
+  /* One segment-sized entry, viewed as whichever segment type is being written. */
+  union mlx5_segment {
+    mlx5_wqe_ctrl_seg ctrl_seg;
+    mlx5_wqe_raddr_seg raddr_seg;
+    mlx5_wqe_data_seg data_seg;
+    mlx5_wqe_atomic_seg atomic_seg;
+  };
+
+  /* Cursor: the next segment to be written. */
+  mlx5_segment *segp;
+};
+
+}  // namespace rocshmem
+
+#endif  // LIBRARY_SRC_GDA_SEGMENT_BUILDER_HPP_
diff --git a/src/gda/topology.cpp b/src/gda/topology.cpp
new file mode 100644
index 0000000000..a49312bb99
--- /dev/null
+++ b/src/gda/topology.cpp
@@ -0,0 +1,884 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *****************************************************************************/
+
+#include "topology.hpp"
+
+using namespace rocshmem;
+
+namespace rocshmem
+{
+
+  // Human-readable description for each GID priority class.
+  // GetGidIndex indexes this table with a GidPriority value to fill in the
+  // GID descriptor string.
+  // NOTE(review): the entry order presumably mirrors the GidPriority enumerators
+  // (excluding UNKNOWN) declared elsewhere - confirm when changing either one.
+  const char* GidPriorityStr[] = {
+    "RoCEv1 Link-local",
+    "RoCEv2 Link-local",
+    "RoCEv1 IPv6",
+    "RoCEv2 IPv6",
+    "RoCEv1 IPv4-mapped IPv6",
+    "RoCEv2 IPv4-mapped IPv6"
+  };
+
+  // Check that CPU memory array of numBytes has been allocated on targetId NUMA node.
+  //
+  // array    - start of the allocation to inspect
+  // numBytes - size of the allocation in bytes
+  // targetId - NUMA node every page is expected to reside on
+  //
+  // Returns ROCSHMEM_SUCCESS when every page is on targetId; otherwise prints a
+  // message to stderr and returns -1.
+  static int CheckPages(char* array, size_t numBytes, int targetId)
+  {
+    size_t const pageSize = getpagesize();
+    size_t const numPages = (numBytes + pageSize - 1) / pageSize;
+
+    std::vector<void *> pages(numPages);
+    std::vector<int> status(numPages);
+
+    // One address per page. Every entry is computed from the base address so that
+    // nothing is written when numPages is 0 (the vectors are empty in that case).
+    for (size_t i = 0; i < numPages; i++) {
+      pages[i] = array + i * pageSize;
+    }
+
+    // With a NULL node list, move_pages does not migrate anything; it only reports
+    // in status[] the node each page currently resides on.
+    long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
+    if (retCode) {
+      fprintf(stderr,"Unable to collect page table information for allocated memory. "
+              "Ensure NUMA library is installed properly");
+      return -1;
+    }
+
+    size_t mistakeCount = 0;
+    for (size_t i = 0; i < numPages; i++) {
+      // Negative status values are error codes rather than node ids
+      if (status[i] < 0) {
+        fprintf(stderr, "Unexpected page status (%d) for page %zu", status[i], i);
+        return -1;
+      }
+      if (status[i] != targetId) mistakeCount++;
+    }
+    if (mistakeCount > 0) {
+      // size_t values are printed with %zu so the format is correct on every ABI
+      fprintf(stderr, "%zu out of %zu pages for memory allocation were not on NUMA node %d."
+              " This could be due to hardware memory issues, or the use of numa-rebalancing daemons such as numad",
+              mistakeCount, numPages, targetId);
+      return -1;
+    }
+    return ROCSHMEM_SUCCESS;
+  }
+
+  // Allocate numBytes of zero-filled memory on the CPU NUMA node or GPU described
+  // by memDevice.
+  //
+  // memDevice - memory type plus device index (NUMA node or GPU index)
+  // numBytes  - number of bytes to allocate; must be non-zero
+  // memPtr    - receives the allocation (set to nullptr before allocating)
+  //
+  // Returns ROCSHMEM_SUCCESS on success, -1 for a zero-byte request or an
+  // unsupported memory type.
+  static int AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
+  {
+    if (numBytes == 0) {
+      fprintf(stderr, "Unable to allocate 0 bytes");
+      return -1;
+    }
+    *memPtr = nullptr;
+
+    MemType const& memType = memDevice.memType;
+
+    if (IsCpuMemType(memType)) {
+      // Set numa policy prior to call to hipHostMalloc
+      numa_set_preferred(memDevice.memIndex);
+
+      // Allocate host-pinned memory (should respect NUMA mem policy)
+      CHECK_HIP(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
+
+      // Touch every page so it is physically placed before its location is checked
+      memset(*memPtr, 0, numBytes);
+
+      // Reset to default numa mem policy before the page check: the pages are
+      // already placed, and resetting first guarantees the preferred-node policy
+      // does not stay in effect if the check below bails out of this function.
+      numa_set_preferred(-1);
+
+      // Check that the allocated pages are actually on the correct NUMA node
+      ERR_CHECK(CheckPages((char*)*memPtr, numBytes, memDevice.memIndex));
+    } else if (IsGpuMemType(memType)) {
+      int prev_dev;
+      CHECK_HIP(hipGetDevice(&prev_dev));
+
+      // Switch to the appropriate GPU
+      CHECK_HIP(hipSetDevice(memDevice.memIndex));
+
+      // Allocate GPU memory on appropriate device
+      CHECK_HIP(hipMalloc((void**)memPtr, numBytes));
+
+      // Clear the memory
+      CHECK_HIP(hipMemset(*memPtr, 0, numBytes));
+      CHECK_HIP(hipDeviceSynchronize());
+
+      // Reset to original GPU
+      CHECK_HIP(hipSetDevice(prev_dev));
+    } else {
+      // Reported on stderr like every other error in this file
+      fprintf(stderr, "Unsupported memory type (%d)", memType);
+      return -1;
+    }
+    return ROCSHMEM_SUCCESS;
+  }
+
+  // Deallocate memory previously obtained for the given memory type.
+  //
+  // memType - MEM_CPU (released with hipHostFree) or MEM_GPU (released with hipFree)
+  // memPtr  - allocation to release; must not be nullptr
+  // bytes   - size of the allocation, used only in the error message
+  //
+  // Returns ROCSHMEM_SUCCESS on success, -1 for a null pointer or an
+  // unrecognized memory type.
+  static int DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
+  {
+    // Avoid deallocating nullptr
+    if (memPtr == nullptr) {
+      // %zu is the matching conversion for size_t
+      fprintf(stderr, "Attempted to free null pointer for %zu bytes", bytes);
+      return -1;
+    }
+
+    switch (memType) {
+    case MEM_CPU:
+      CHECK_HIP(hipHostFree(memPtr));
+      break;
+    case MEM_GPU:
+      CHECK_HIP(hipFree(memPtr));
+      break;
+    default:
+      fprintf(stderr, "Attempting to deallocate unrecognized memory type (%d)", memType);
+      return -1;
+    }
+    return ROCSHMEM_SUCCESS;
+  }
+
+
+  // HSA-related functions
+  //========================================================================================
+
+  // Get the hsa_agent_t associated with an executor (CPU NUMA node or GPU).
+  //
+  // The per-device agent tables are built on first use: a small buffer is
+  // allocated on every CPU node / GPU and HSA is asked which agent owns it.
+  //
+  // Returns ROCSHMEM_SUCCESS and sets agent on success; returns -1 when the index
+  // is out of range or the executor type is not EXE_CPU / EXE_GPU.
+  static int GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent)
+  {
+    static bool isInitialized = false;
+    static std::vector<hsa_agent_t> cpuAgents;
+    static std::vector<hsa_agent_t> gpuAgents;
+
+    int const& exeIndex = exeDevice.exeIndex;
+    int const numCpus   = GetNumDevices(EXE_CPU);
+    int const numGpus   = GetNumDevices(EXE_GPU);
+
+    // Initialize results on first use
+    if (!isInitialized) {
+      hsa_amd_pointer_info_t info;
+      info.size = sizeof(info);
+
+      int32_t* tempBuffer = nullptr;
+
+      // Index CPU agents
+      // (tables are cleared first so an earlier, partially completed attempt
+      //  cannot leave stale entries behind)
+      cpuAgents.clear();
+      for (int i = 0; i < numCpus; i++) {
+        ERR_CHECK(AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer));
+        CHECK_HSA(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
+        cpuAgents.push_back(info.agentOwner);
+        ERR_CHECK(DeallocateMemory(MEM_CPU, tempBuffer, 1024));
+      }
+
+      // Index GPU agents
+      gpuAgents.clear();
+      for (int i = 0; i < numGpus; i++) {
+        ERR_CHECK(AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer));
+        CHECK_HSA(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
+        gpuAgents.push_back(info.agentOwner);
+        ERR_CHECK(DeallocateMemory(MEM_GPU, tempBuffer, 1024));
+      }
+      isInitialized = true;
+    }
+
+    switch (exeDevice.exeType) {
+    case EXE_CPU:
+      if (exeIndex < 0 || exeIndex >= numCpus) {
+        fprintf(stderr, "CPU index must be between 0 and %d inclusively", numCpus - 1);
+        return -1;
+      }
+      agent = cpuAgents[exeIndex];
+      break;
+    case EXE_GPU:
+      if (exeIndex < 0 || exeIndex >= numGpus) {
+        fprintf(stderr, "GPU index must be between 0 and %d inclusively", numGpus - 1);
+        return -1;
+      }
+      agent = gpuAgents[exeIndex];
+      break;
+    default:
+      fprintf(stderr, "Attempting to get HSA agent of unknown or unsupported executor type (%d)",
+             exeDevice.exeType);
+      return -1;
+    }
+    return ROCSHMEM_SUCCESS;
+  }
+
+  // Resolve the hsa_agent_t for a MemDevice by forwarding to the executor
+  // overload with the matching executor type (CPU or GPU) and the same index.
+  // Returns -1 when the memory type is neither a CPU nor a GPU type.
+  static int GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
+  {
+    if (IsCpuMemType(memDevice.memType)) {
+      ExeDevice const cpuExecutor = {EXE_CPU, memDevice.memIndex};
+      return GetHsaAgent(cpuExecutor, agent);
+    }
+
+    if (IsGpuMemType(memDevice.memType)) {
+      ExeDevice const gpuExecutor = {EXE_GPU, memDevice.memIndex};
+      return GetHsaAgent(gpuExecutor, agent);
+    }
+
+    fprintf(stderr, "Unable to get HSA agent for memDevice (%d,%d)",
+           memDevice.memType, memDevice.memIndex);
+    return -1;
+  }
+
+  // One node of the PCIe topology tree.
+  // Nodes are ordered (and therefore unique within a parent) by address only.
+  struct PCIeNode
+  {
+    std::string        address;                   ///< PCIe address for this PCIe node
+    std::string        description;               ///< Description for this PCIe node
+    std::set<PCIeNode> children;                  ///< Children PCIe nodes
+
+    // Node with empty address and description
+    PCIeNode() = default;
+
+    // Node with an address and an empty description
+    PCIeNode(std::string const& addr)
+      : address(addr), description() {}
+
+    // Node with an address and a description
+    PCIeNode(std::string const& addr, std::string const& desc)
+      : address(addr), description(desc) {}
+
+    // Strict weak ordering on address, required by std::set
+    bool operator<(PCIeNode const& rhs) const {
+      return address.compare(rhs.address) < 0;
+    }
+  };
+
+  // Structure to track information about IBV devices.
+  // Every scalar member has a default so that a freshly declared IbvDevice never
+  // carries indeterminate values for fields that a probing path did not reach
+  // (for example isRoce on a non-Ethernet port, or gidIndex when the device could
+  // not be opened).
+  struct IbvDevice
+  {
+    ibv_device* devicePtr     = nullptr;  ///< Entry from the ibverbs device list
+    std::string name;                     ///< Device name
+    std::string busId;                    ///< PCIe bus id ("" when unknown)
+    bool        hasActivePort = false;    ///< True when a port in IBV_PORT_ACTIVE state was found
+    int         numaNode      = -1;       ///< NUMA node of the device (-1 when unknown)
+    int         gidIndex      = -1;       ///< Selected GID index (-1 when none was selected)
+    std::string gidDescriptor;            ///< Description of the selected GID
+    bool        isRoce        = false;    ///< True when the active port uses the Ethernet link layer
+  };
+
+  // Function to collect information about IBV devices
+  //========================================================================================
+  // Returns false for a GID that is all zeros, or whose first 32-bit word is
+  // fe80:0000 with the remaining three words all zero; true for anything else.
+  static bool IsConfiguredGid(union ibv_gid const& gid)
+  {
+    const struct in6_addr *addr = (struct in6_addr *) gid.raw;
+    uint32_t const head = addr->s6_addr32[0];
+    uint32_t const tail = addr->s6_addr32[1] | addr->s6_addr32[2] | addr->s6_addr32[3];
+
+    // Any bit set past the first word means the entry has been populated
+    if (tail != 0) return true;
+
+    // Only the first word is left to inspect: empty or bare link-local prefix
+    bool const isEmpty      = (head == 0);
+    bool const isBarePrefix = (head == htonl(0xfe800000));
+    return !(isEmpty || isBarePrefix);
+  }
+
+  // True when the GID starts with the fe80:0000 prefix followed by a zero
+  // second 32-bit word.
+  static bool LinkLocalGid(union ibv_gid const& gid)
+  {
+    const struct in6_addr *addr = (struct in6_addr *) gid.raw;
+    bool const hasLinkLocalPrefix = (addr->s6_addr32[0] == htonl(0xfe800000));
+    bool const secondWordIsZero   = (addr->s6_addr32[1] == 0UL);
+    return hasLinkLocalPrefix && secondWordIsZero;
+  }
+
+  // Reads the RoCE version of one GID table entry from sysfs.
+  //
+  // context  - opened device whose GID entry is queried
+  // portNum  - port number used in the sysfs path
+  // gidIndex - GID table index used in the sysfs path
+  // version  - set to 1 or 2 when the type string is recognized; left untouched
+  //            when the string is empty or not recognized
+  //
+  // Returns ROCSHMEM_SUCCESS when the file could be read (even if the content was
+  // not recognized), -1 when it could not be opened or read.
+  static int GetRoceVersionNumber(struct ibv_context* const& context,
+                                  int const&  portNum,
+                                  int const&  gidIndex,
+                                  int&        version)
+  {
+    char const* deviceName = ibv_get_device_name(context->device);
+    char gidRoceVerStr[16]      = {};
+    char roceTypePath[PATH_MAX] = {};
+    // Bounded formatting: the device name comes from the driver, so never write
+    // past the end of the path buffer
+    snprintf(roceTypePath, sizeof(roceTypePath), "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
+             deviceName, portNum, gidIndex);
+
+    int fd = open(roceTypePath, O_RDONLY);
+    if (fd == -1) {
+      fprintf(stderr, "Failed while opening RoCE file path (%s)", roceTypePath);
+      return -1;
+    }
+
+    // Read at most 15 bytes so the zero-initialized buffer stays NUL-terminated
+    ssize_t const ret = read(fd, gidRoceVerStr, sizeof(gidRoceVerStr) - 1);
+    close(fd);
+
+    if (ret == -1) {
+      fprintf(stderr, "Failed while reading RoCE version");
+      return -1;
+    }
+
+    if (strlen(gidRoceVerStr)) {
+      if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
+          || strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
+        version = 1;
+      }
+      else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
+        version = 2;
+      }
+    }
+    return ROCSHMEM_SUCCESS;
+  }
+
+  // Detects the ::ffff:x.x.x.x form, i.e. an IPv4 address mapped into the IPv6
+  // address space: 80 zero bits, then 16 one bits (0xffff), then the 32-bit IPv4
+  // address. Description taken from the Broadcom documentation:
+  // https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html
+  static bool IsIPv4MappedIPv6(const union ibv_gid &gid)
+  {
+    // First 64 bits must be zero
+    if (gid.global.subnet_prefix != 0) return false;
+
+    // Next 16 bits must be zero
+    if (gid.raw[8] != 0 || gid.raw[9] != 0) return false;
+
+    // Followed by the 0xffff marker
+    return gid.raw[10] == 0xff && gid.raw[11] == 0xff;
+  }
+
+  // Picks the GID table entry with the highest GidPriority for a port.
+  //
+  // context   - opened device to query
+  // gidTblLen - number of GID table entries to scan
+  // portNum   - port whose GID table is scanned
+  // gidInfo   - in: first >= 0 means a GID index was already chosen and is kept
+  //             out: first = selected index, second = its description
+  //
+  // Returns ROCSHMEM_SUCCESS when an index is available, -1 (with gidInfo.first
+  // set to -1) when no usable entry was found.
+  static int GetGidIndex(struct ibv_context*          context,
+                         int const&                   gidTblLen,
+                         int const&                   portNum,
+                         std::pair<int, std::string>& gidInfo)
+  {
+    if(gidInfo.first >= 0) return ROCSHMEM_SUCCESS; // honor user choice
+    union ibv_gid gid;
+
+    GidPriority highestPriority = GidPriority::UNKNOWN;
+    int gidIndex = -1;
+
+    for (int i = 0; i < gidTblLen; ++i) {
+      IBV_CALL(ibv_query_gid, context, portNum, i, &gid);
+      if (!IsConfiguredGid(gid)) continue;
+
+      // GetRoceVersionNumber leaves its output untouched when the sysfs type
+      // string is not recognized, so start from a defined value; anything other
+      // than 2 is classified as RoCEv1 below.
+      int gidCurrRoceVersion = 0;
+      if(GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion) != ROCSHMEM_SUCCESS) continue;
+
+      // Classify the entry: IPv4-mapped, then global IPv6, then link-local
+      GidPriority currPriority;
+      if (IsIPv4MappedIPv6(gid)) {
+        currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4 : GidPriority::ROCEV1_IPV4;
+      } else if (!LinkLocalGid(gid)) {
+        currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6 : GidPriority::ROCEV1_IPV6;
+      } else {
+        currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL : GidPriority::ROCEV1_LINK_LOCAL;
+      }
+
+      // Strictly greater: among equal priorities the lowest index is kept
+      if(currPriority > highestPriority) {
+        highestPriority = currPriority;
+        gidIndex = i;
+      }
+    }
+
+    if (highestPriority == GidPriority::UNKNOWN) {
+      gidInfo.first = -1;
+      fprintf(stderr, "Failed to auto-detect a valid GID index. Try setting it manually through IB_GID_INDEX");
+      return -1;
+    }
+    gidInfo.first = gidIndex;
+    gidInfo.second = GidPriorityStr[highestPriority];
+    return ROCSHMEM_SUCCESS;
+  }
+
+  // Returns the cached list of IBV devices, building it on first use.
+  //
+  // For every device reported by ibv_get_device_list this records the name, whether
+  // a port is active, RoCE/GID information for the first active port, the PCIe bus
+  // id and the NUMA node. Fields that cannot be determined keep their "unknown"
+  // value (gidIndex = -1, busId = "", numaNode = -1, isRoce = false).
+  static vector<IbvDevice>& GetIbvDeviceList()
+  {
+    static bool isInitialized = false;
+    static vector<IbvDevice> ibvDeviceList = {};
+
+    // Build list on first use
+    if (!isInitialized) {
+
+      // Query the number of IBV devices
+      int numIbvDevices = 0;
+      ibv_device** deviceList = ibv_get_device_list(&numIbvDevices);
+
+      if (deviceList && numIbvDevices > 0) {
+        // Loop over each device to collect information
+        for (int i = 0; i < numIbvDevices; i++) {
+          IbvDevice ibvDevice;
+          ibvDevice.devicePtr     = deviceList[i];
+          ibvDevice.name          = deviceList[i]->name;
+          // Give every field a defined "unknown" value up front; the probing
+          // below only overwrites what it manages to find out
+          ibvDevice.hasActivePort = false;
+          ibvDevice.isRoce        = false;
+          ibvDevice.gidIndex      = -1;
+          ibvDevice.numaNode      = -1;
+          ibvDevice.busId         = "";
+
+          // Inspect ports; stop at the first active one
+          struct ibv_context *context = ibv_open_device(ibvDevice.devicePtr);
+          if (context) {
+            struct ibv_device_attr deviceAttr;
+            if (!ibv_query_device(context, &deviceAttr)) {
+              for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
+                struct ibv_port_attr portAttr;
+                if (ibv_query_port(context, port, &portAttr)) continue;
+                if (portAttr.state != IBV_PORT_ACTIVE) continue;
+
+                ibvDevice.hasActivePort = true;
+                if(portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+                  ibvDevice.isRoce = true;
+                  std::pair<int, std::string> gidInfo (-1, "");
+                  auto res = GetGidIndex(context, portAttr.gid_tbl_len, port, gidInfo);
+                  if (res == ROCSHMEM_SUCCESS) {
+                    ibvDevice.gidIndex = gidInfo.first;
+                    ibvDevice.gidDescriptor = gidInfo.second;
+                  }
+                }
+                break;
+              }
+            }
+            ibv_close_device(context);
+          }
+
+          // PCIe bus id = last component of the resolved ".../device" link.
+          // The error_code overload is used so a missing sysfs entry leaves the
+          // bus id empty instead of throwing std::filesystem::filesystem_error.
+          {
+            std::string device_path(ibvDevice.devicePtr->dev_path);
+            std::error_code ec;
+            std::string pciPath = std::filesystem::canonical(device_path + "/device", ec).string();
+            if (!ec) {
+              std::size_t pos = pciPath.find_last_of('/');
+              if (pos != std::string::npos) {
+                ibvDevice.busId = pciPath.substr(pos + 1);
+              }
+            }
+          }
+
+          // Get nearest numa node for this device. canonical() fails (without
+          // throwing) when the path does not exist, e.g. when busId is empty;
+          // numaNode then stays -1.
+          {
+            std::filesystem::path devicePath = "/sys/bus/pci/devices/" + ibvDevice.busId + "/numa_node";
+            std::error_code ec;
+            std::string canonicalPath = std::filesystem::canonical(devicePath, ec).string();
+            if (!ec) {
+              std::ifstream file(canonicalPath);
+              if (file.is_open()) {
+                std::string numaNodeStr;
+                std::getline(file, numaNodeStr);
+                int numaNodeVal;
+                if (sscanf(numaNodeStr.c_str(), "%d", &numaNodeVal) == 1)
+                  ibvDevice.numaNode = numaNodeVal;
+              }
+            }
+          }
+          ibvDeviceList.push_back(ibvDevice);
+        }
+      }
+
+      // NOTE(review): ibv_get_device_list(3) states that pointers to devices that
+      // are not open become invalid once the list is freed, yet devicePtr is kept
+      // in the cached entries - confirm that later users do not dereference it.
+      if (deviceList) {
+        ibv_free_device_list(deviceList);
+      }
+      isInitialized = true;
+    }
+    return ibvDeviceList;
+  }
+
+  // PCIe-related functions
+  //========================================================================================
+
+  // Prints the PCIe tree rooted at node to stdout, one node per line.
+  // prefix is the indentation accumulated from the ancestors; isLast tells whether
+  // node is the last child of its parent (selects the branch glyph).
+  // Nodes with an empty address print nothing themselves; their children are
+  // still visited.
+  static void PrintPCIeTree(PCIeNode    const& node,
+                            std::string const& prefix = "",
+                            bool               isLast = true)
+  {
+    bool const hasAddress = !node.address.empty();
+    if (hasAddress) {
+      char const* const branch = isLast ? "└── " : "├── ";
+      printf("%s%s%s", prefix.c_str(), branch, node.address.c_str());
+      if (!node.description.empty()) {
+        printf("(%s)", node.description.c_str());
+      }
+      printf("\n");
+    }
+
+    // Children are indented one level deeper than this node
+    std::string const childPrefix = prefix + (isLast ? "    " : "│   ");
+    auto const lastPos = node.children.end();
+    for (auto child = node.children.begin(); child != lastPos; ++child) {
+      bool const childIsLast = (std::next(child) == lastPos);
+      PrintPCIeTree(*child, childPrefix, childIsLast);
+    }
+  }
+
+  // Inserts nodes along pcieAddress down a tree starting from root.
+  //
+  // The sysfs entry /sys/bus/pci/devices/<pcieAddress> is resolved to its canonical
+  // path and one tree node is created (or reused) for every '/'-separated
+  // component of that path. The node for the final component receives description.
+  //
+  // Returns ROCSHMEM_SUCCESS on success, -1 when the device path cannot be
+  // resolved (for example because it does not exist).
+  static int InsertPCIePathToTree(std::string const& pcieAddress,
+                                  std::string const& description,
+                                  PCIeNode&          root)
+  {
+    std::filesystem::path devicePath = "/sys/bus/pci/devices/" + pcieAddress;
+
+    // Resolve with the error_code overload: canonical() fails for a path that
+    // does not exist, and the throwing overload would raise an exception before
+    // this function could report the problem through its return value.
+    std::error_code ec;
+    std::string canonicalPath = std::filesystem::canonical(devicePath, ec).string();
+    if (ec) {
+      fprintf(stderr, "Device path %s does not exist", devicePath.c_str());
+      return -1;
+    }
+
+    std::istringstream iss(canonicalPath);
+    std::string token;
+
+    PCIeNode* currNode = &root;
+    while (std::getline(iss, token, '/')) {
+      auto it = (currNode->children.insert(PCIeNode(token))).first;
+      // std::set only hands out const elements; children and description do not
+      // take part in the ordering (address only), so modifying them is safe
+      currNode = const_cast<PCIeNode*>(&(*it));
+    }
+    currNode->description = description;
+
+    return ROCSHMEM_SUCCESS;
+  }
+
+  // Returns root node for PCIe tree.  Constructed on first use from the NICs that
+  // have an active port and a known bus id, plus every GPU whose PCI bus id can
+  // be queried. Devices whose path cannot be inserted are left out of the tree.
+  static PCIeNode* GetPCIeTreeRoot()
+  {
+    static bool isInitialized = false;
+    static PCIeNode pcieRoot;
+
+    // Build PCIe tree on first use
+    if (!isInitialized) {
+      // Add NICs to the tree
+      auto const& ibvDeviceList = rocshmem::GetIbvDeviceList();
+      for (IbvDevice const& ibvDevice : ibvDeviceList) {
+        if (!ibvDevice.hasActivePort || ibvDevice.busId == "") continue;
+        InsertPCIePathToTree(ibvDevice.busId, ibvDevice.name, pcieRoot);
+      }
+
+      // Add GPUs to the tree
+      int numGpus = rocshmem::GetNumDevices(rocshmem::EXE_GPU);
+      for (int i = 0; i < numGpus; ++i) {
+        char hipPciBusId[64];
+        if (hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i) == hipSuccess) {
+          InsertPCIePathToTree(hipPciBusId, "GPU " + std::to_string(i), pcieRoot);
+        }
+      }
+#ifdef VERBS_DEBUG
+      PrintPCIeTree(pcieRoot);
+#endif
+      isInitialized = true;
+    }
+    return &pcieRoot;
+  }
+
+  // Finds the lowest common ancestor in the PCIe tree of the nodes with addresses
+  // node1Address and node2Address, searching the subtree rooted at root.
+  // Returns nullptr when neither address occurs in the subtree; when only one of
+  // them occurs, the node carrying that address is returned.
+  static PCIeNode const* GetLcaBetweenNodes(PCIeNode    const* root,
+                                            std::string const& node1Address,
+                                            std::string const& node2Address)
+  {
+    if (root == nullptr) return nullptr;
+
+    // A node that is itself one of the targets is the answer for its subtree
+    if (root->address == node1Address || root->address == node2Address)
+      return root;
+
+    PCIeNode const* firstHit = nullptr;
+    for (auto const& child : root->children) {
+      PCIeNode const* hit = GetLcaBetweenNodes(&child, node1Address, node2Address);
+      if (hit == nullptr) continue;
+
+      // A second child subtree with a hit makes this node the common ancestor
+      if (firstHit != nullptr) return root;
+      firstHit = hit;
+    }
+
+    // Zero or one subtree matched: pass the single result (or nullptr) upwards
+    return firstHit;
+  }
+
+  // Gets the depth of the node with address targetBusID in the tree rooted at
+  // node, where node itself is at the given depth (0 by default).
+  // Returns -1 when node is null or no node in the subtree has that address.
+  static int GetLcaDepth(std::string const&     targetBusID,
+                         PCIeNode const* const& node,
+                         int                    depth = 0)
+  {
+    if (node == nullptr) return -1;
+    if (node->address == targetBusID) return depth;
+
+    // Depth-first search; the first subtree that contains the target wins
+    int const childDepth = depth + 1;
+    for (auto const& child : node->children) {
+      int const found = GetLcaDepth(targetBusID, &child, childDepth);
+      if (found != -1) {
+        return found;
+      }
+    }
+    return -1;
+  }
+
+  // Extracts the bus number from a PCIe address of the form
+  // domain:bus:device.function, with every field read as hexadecimal.
+  // Returns the bus number, or -1 when the address cannot be parsed.
+  static int ExtractBusNumber(std::string const& pcieAddress)
+  {
+    int domain   = 0;
+    int bus      = 0;
+    int device   = 0;
+    int function = 0;
+    char separator = '\0';   // consumes the ':' and '.' between the fields
+
+    std::istringstream parser(pcieAddress);
+    parser >> std::hex;
+    parser >> domain >> separator;
+    parser >> bus >> separator;
+    parser >> device >> separator;
+    parser >> function;
+
+    if (!parser.fail()) {
+      return bus;
+    }
+#ifdef VERBS_DEBUG
+    printf("Invalid PCIe address format: %s\n", pcieAddress.c_str());
+#endif
+    return -1;
+  }
+
+  // Computes the distance between two PCIe addresses as the absolute difference
+  // of their bus numbers. Returns -1 when either address cannot be parsed.
+  static int GetBusIdDistance(std::string const& pcieAddress1,
+                              std::string const& pcieAddress2)
+  {
+    int const firstBus  = ExtractBusNumber(pcieAddress1);
+    int const secondBus = ExtractBusNumber(pcieAddress2);
+
+    if (firstBus < 0 || secondBus < 0) {
+      return -1;
+    }
+    return std::abs(firstBus - secondBus);
+  }
+
+  // Given a target busID and a set of candidate devices, returns a set of indices
+  // that is "closest" to the target.
+  //
+  // Closeness is decided first by the depth of the lowest common ancestor (LCA)
+  // in the PCIe tree (deeper is closer) and, among equal depths, by the smallest
+  // bus number difference. Candidates with an empty bus id are ignored. The
+  // returned indices refer to positions in candidateBusIdList.
+  static std::set<int> GetNearestDevicesInTree(std::string              const& targetBusId,
+                                               std::vector<std::string> const& candidateBusIdList)
+  {
+    int maxDepth = -1;
+    int minDistance = std::numeric_limits<int>::max();
+    std::set<int> matches = {};
+
+    // The tree root does not change between iterations, so look it up once
+    PCIeNode const* const treeRoot = GetPCIeTreeRoot();
+    int const numCandidates = static_cast<int>(candidateBusIdList.size());
+
+    // Loop over the candidates to find the ones with the lowest common ancestor (LCA)
+    for (int i = 0; i < numCandidates; i++) {
+      std::string const& candidateBusId = candidateBusIdList[i];
+      if (candidateBusId == "") continue;
+      PCIeNode const* lca = GetLcaBetweenNodes(treeRoot, targetBusId, candidateBusId);
+      if (!lca) continue;
+
+      int depth = GetLcaDepth(lca->address, treeRoot);
+      int currDistance = GetBusIdDistance(targetBusId, candidateBusId);
+
+      // When more than one LCA match is found, choose the one with smallest busId difference
+      // NOTE: currDistance could be -1, which signals problem with parsing, however still
+      //       remains a valid "closest" candidate, so is included
+      if (depth > maxDepth || (depth == maxDepth && depth >= 0 && currDistance < minDistance)) {
+        maxDepth = depth;
+        matches.clear();
+        matches.insert(i);
+        minDistance = currDistance;
+      } else if (depth == maxDepth && depth >= 0 && currDistance == minDistance) {
+        matches.insert(i);
+      }
+    }
+    return matches;
+  }
+
+  int GetNumDevices(DeviceType exeType)
+  {
+    // CPU devices are counted as configured NUMA nodes
+    if (exeType == rocshmem::EXE_CPU) {
+      return numa_num_configured_nodes();
+    }
+
+    // GPU devices: a failed HIP query is reported as zero devices
+    if (exeType == rocshmem::EXE_GPU) {
+      int gpuCount = 0;
+      if (hipGetDeviceCount(&gpuCount) != hipSuccess) gpuCount = 0;
+      return gpuCount;
+    }
+
+    // NIC devices: one per entry in the IB Verbs device list
+    if (exeType == rocshmem::EXE_NIC) {
+      return GetIbvDeviceList().size();
+    }
+    return 0;  // Any other device type
+  }
+
+  int GetClosestCpuNumaToGpu(int gpuIndex)
+  {
+    hsa_agent_t gpuAgent;
+    ERR_CHECK(GetHsaAgent({EXE_GPU, gpuIndex}, gpuAgent));
+    // Ask HSA for the CPU agent nearest to this GPU; report -1 if the query fails
+    hsa_agent_t nearestCpu;
+    hsa_status_t const status = hsa_agent_get_info(
+      gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, &nearestCpu);
+    if (status != HSA_STATUS_SUCCESS) return -1;
+    // Map the returned agent back to a CPU index by comparing agent handles
+    for (int cpuIdx = 0, cpuCount = GetNumDevices(EXE_CPU); cpuIdx < cpuCount; cpuIdx++) {
+      hsa_agent_t candidate;
+      ERR_CHECK(GetHsaAgent({EXE_CPU, cpuIdx}, candidate));
+      if (candidate.handle == nearestCpu.handle) return cpuIdx;
+    }
+    return -1;
+  }
+
+  int GetClosestCpuNumaToNic(int nicIndex)
+  {
+    int const nicCount = GetNumDevices(rocshmem::EXE_NIC);
+    bool const isValidNic = (0 <= nicIndex && nicIndex < nicCount);
+    return isValidNic ? GetIbvDeviceList()[nicIndex].numaNode : -1;  // -1: no such NIC
+  }
+
+
+  // Returns the index of the NIC closest to GPU gpuIndex, or -1 if gpuIndex is out of range or no NIC was found
+  int GetClosestNicToGpu(int gpuIndex, char** dev_name)
+  {
+    static bool isInitialized = false;
+    static std::vector<int> closestNicId;
+    static auto const& ibvDeviceList = GetIbvDeviceList();
+
+    int numGpus = GetNumDevices(rocshmem::EXE_GPU);
+    if (gpuIndex < 0 || gpuIndex >= numGpus) return -1;
+
+    // Build closest NICs per GPU on first use
+    if (!isInitialized) {
+      closestNicId.resize(numGpus, -1);
+
+      // Build up list of NIC bus addresses
+      std::vector<std::string> ibvAddressList;
+      for (auto const& ibvDevice : ibvDeviceList)
+        ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : "");
+
+      // Track how many times a device has been assigned as "closest"
+      // This allows distributed work across devices using multiple ports (sharing the same busID)
+      // NOTE: This isn't necessarily optimal, but likely to work in most cases involving multi-port
+      // Counter example:
+      //
+      //  G0 prefers (N0,N1), picks N0
+      //  G1 prefers (N1,N2), picks N1
+      //  G2 prefers N0,      picks N0
+      //
+      //  instead of G0->N1, G1->N2, G2->N0
+
+      std::vector<int> assignedCount(ibvDeviceList.size(), 0);
+
+      // Loop over each GPU to find the closest NIC(s) based on PCIe address
+      for (int i = 0; i < numGpus; i++) {
+        // Collect PCIe address for the GPU
+        char hipPciBusId[64];
+        hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i);
+        if (err != hipSuccess) {
+#ifdef VERBS_DEBUG
+          printf("Failed to get PCI Bus ID for HIP device %d: %s\n", i, hipGetErrorString(err));
+#endif
+          closestNicId[i] = -1;
+          continue;
+        }
+
+        // Find closest NICs
+        std::set<int> closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList);
+
+        // Pick the least-used NIC to assign as closest
+        int closestIdx = -1;
+        for (auto idx : closestNicIdxs) {
+          if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx])
+            closestIdx = idx;
+        }
+
+        // The following will only use distance between bus IDs
+        // to determine the closest NIC to GPU if the PCIe tree approach fails
+        if (closestIdx < 0) {
+#ifdef VERBS_DEBUG
+          printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
+#endif
+          int minDistance = std::numeric_limits<int>::max();
+          for (int j = 0; j < ibvDeviceList.size(); j++) {
+            if (ibvDeviceList[j].busId != "") {
+              int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[j].busId);
+              if (distance < minDistance && distance >= 0) {
+                minDistance = distance;
+                closestIdx = j;
+              }
+            }
+          }
+        }
+        closestNicId[i] = closestIdx;
+        if (closestIdx != -1) assignedCount[closestIdx]++;
+      }
+      isInitialized = true;
+    }
+    // A GPU may have no NIC assigned (-1): never index ibvDeviceList with that sentinel.
+    // *dev_name receives a strdup()'d copy of the NIC name, or nullptr when there is none
+    int const nicIdx = closestNicId[gpuIndex];
+    char const* nicName = (nicIdx >= 0) ? ibvDeviceList[nicIdx].name.c_str() : nullptr;
+    DPRINTF("GPU Device id: %d closest NIC id : %d name: %s\n", gpuIndex, nicIdx,
+           nicName ? nicName : "(none)");
+    if (dev_name != NULL) *dev_name = nicName ? strdup(nicName) : nullptr;
+    return nicIdx;
+  }
+
+  static int RemappedCpuIndex(int origIdx)
+  {
+    static std::vector<int> remappingCpu;
+    // Maps a dense CPU index to its NUMA node id (-1 if out of range). The table is built on first use
+    // from one numa_get_mems_allowed() query; that call allocates a bitmask, so it is freed after use
+    if (remappingCpu.empty()) {
+      struct bitmask* allowedNodes = numa_get_mems_allowed();
+      for (int node = 0; allowedNodes && node <= numa_max_node(); node++)
+        if (numa_bitmask_isbitset(allowedNodes, node)) remappingCpu.push_back(node);
+      if (allowedNodes) numa_bitmask_free(allowedNodes);
+    }
+    return (origIdx >= 0 && origIdx < (int)remappingCpu.size()) ? remappingCpu[origIdx] : -1;
+  }
+
+  // Prints the NIC table: one row per IB Verbs device with its closest GPU(s) and GID details
+  static void PrintNicToGPUTopo(bool outputToCsv)
+  {
+    printf(" NIC | Device Name | Active | PCIe Bus ID  | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n");
+    if (!outputToCsv) {
+      printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n");
+    }
+
+    int const gpuCount = rocshmem::GetNumDevices(rocshmem::EXE_GPU);
+    auto const& nicList = rocshmem::GetIbvDeviceList();
+    for (int nicIdx = 0; nicIdx < nicList.size(); nicIdx++) {
+      auto const& nic = nicList[nicIdx];
+      // Comma-separated list of the GPUs whose closest NIC is this one
+      std::string gpuList;
+      for (int gpuIdx = 0; gpuIdx < gpuCount; gpuIdx++) {
+        if (rocshmem::GetClosestNicToGpu(gpuIdx, nullptr) != nicIdx) continue;
+        if (!gpuList.empty()) gpuList += ",";
+        gpuList += std::to_string(gpuIdx);
+      }
+
+      // GID columns are only filled in for RoCE devices with an active port
+      bool const showGid = nic.isRoce && nic.hasActivePort;
+      std::string const gidIndexStr = showGid ? std::to_string(nic.gidIndex) : "N/A";
+      printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
+             nicIdx, nic.name.c_str(), nic.hasActivePort ? "Yes" : "No",
+             nic.busId.c_str(), nic.numaNode, gpuList.c_str(),
+             gidIndexStr.c_str(), showGid ? nic.gidDescriptor.c_str() : "N/A");
+    }
+    printf("\n");
+  }
+
+  // Prints the detected CPU / GPU / NIC topology to stdout (CSV-style or table layout)
+  void DisplayTopology(bool outputToCsv)
+  {
+    int const cpuCount = rocshmem::GetNumDevices(rocshmem::EXE_CPU);
+    int const gpuCount = rocshmem::GetNumDevices(rocshmem::EXE_GPU);
+    int const nicCount = rocshmem::GetNumDevices(rocshmem::EXE_NIC);
+    char const sep = outputToCsv ? ',' : '|';
+
+    // Summary of device counts: key,value pairs for CSV, a short banner otherwise
+    if (!outputToCsv) {
+      printf("\nDetected Topology:\n");
+      printf("==================\n");
+      printf("  %d configured CPU NUMA node(s) [%d total]\n", cpuCount, numa_max_node() + 1);
+      printf("  %d GPU device(s)\n", gpuCount);
+      printf("  %d Supported NIC device(s)\n", nicCount);
+    } else {
+      printf("NumCpus,%d\n", cpuCount);
+      printf("NumGpus,%d\n", gpuCount);
+      printf("NumNics,%d\n", nicCount);
+    }
+
+    // Header row of the NUMA distance matrix
+    printf("\n            %c", sep);
+    for (int col = 0; col < cpuCount; col++) printf("NUMA %02d%c", col, sep);
+    printf(" #Cpus %c Closest GPU(s)\n", sep);
+
+    // Horizontal rule (table layout only): one segment per NUMA column plus the #Cpus column
+    if (!outputToCsv) {
+      printf("------------+");
+      for (int col = 0; col <= cpuCount; col++) printf("-------+");
+      printf("---------------\n");
+    }
+
+    // One row per CPU index, labelled with its remapped NUMA node
+    for (int row = 0; row < cpuCount; row++) {
+      int const rowNode = RemappedCpuIndex(row);
+      printf("NUMA %02d (%02d)%c", row, rowNode, sep);
+
+      // NUMA distance from this node to every other listed node
+      for (int col = 0; col < cpuCount; col++)
+        printf(" %5d %c", numa_distance(rowNode, RemappedCpuIndex(col)), sep);
+
+      // Number of configured CPUs that belong to this node
+      int coreCount = 0;
+      for (int cpu = 0; cpu < numa_num_configured_cpus(); cpu++)
+        coreCount += (numa_node_of_cpu(cpu) == rowNode) ? 1 : 0;
+      printf(" %5d %c", coreCount, sep);
+
+      // GPUs whose closest CPU NUMA value matches this row's node
+      for (int gpu = 0; gpu < gpuCount; gpu++)
+        if (rocshmem::GetClosestCpuNumaToGpu(gpu) == rowNode) printf(" %d", gpu);
+      printf("\n");
+    }
+    printf("\n");
+
+    // NIC table follows the CPU table
+    PrintNicToGPUTopo(outputToCsv);
+  }
+}
diff --git a/src/gda/topology.hpp b/src/gda/topology.hpp
new file mode 100644
index 0000000000..2dfbfbbce9
--- /dev/null
+++ b/src/gda/topology.hpp
@@ -0,0 +1,247 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *****************************************************************************/
+
+#pragma once
+#include <algorithm>
+#include <cstring>
+#include <future>
+#include <map>
+#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
+#include <numaif.h>
+#include <random>
+#include <set>
+#include <sstream>
+#include <stdarg.h>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+#include <iostream>
+
+#include <infiniband/verbs.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <filesystem>
+#include <fstream>
+
+#include <hip/hip_ext.h>
+#include <hip/hip_runtime.h>
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+
+#include "util.hpp"
+
+namespace rocshmem
+{
+  using std::map;
+  using std::pair;
+  using std::set;
+  using std::vector;
+
+  /**
+   * Enumeration of GID priority
+   *
+   * @note GID types are ordered by priority from lowest (0) to highest (5); UNKNOWN (-1) sorts below all
+   */
+  enum GidPriority
+  {
+    UNKNOWN           = -1,                      ///< Default
+    ROCEV1_LINK_LOCAL = 0,                       ///< RoCEv1 Link-local
+    ROCEV2_LINK_LOCAL = 1,                       ///< RoCEv2 Link-local fe80::/10
+    ROCEV1_IPV6       = 2,                       ///< RoCEv1 IPv6
+    ROCEV2_IPV6       = 3,                       ///< RoCEv2 IPv6
+    ROCEV1_IPV4       = 4,                       ///< RoCEv1 IPv4-mapped IPv6
+    ROCEV2_IPV4       = 5,                       ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x
+  };
+
+
+  /**
+   * Enumeration of supported memory types
+   *
+   * @note Possible types of memory used as sources/destinations (paired with a device index in MemDevice)
+   */
+  enum MemType
+  {
+    MEM_CPU          = 0,                       ///< Coarse-grained pinned CPU memory
+    MEM_GPU          = 1,                       ///< Coarse-grained global GPU memory
+  };
+
+  /**
+   * Enumeration of supported Executor types
+   *
+   * @note The Executor is the device used to perform a Transfer
+   * @note NOTE(review): the original remark "IBVerbs executor is currently not implemented yet" may be stale, since GetNumDevices handles EXE_NIC - confirm
+   */
+
+  enum DeviceType
+  {
+    EXE_CPU          = 0,                       ///< CPU executor (GetNumDevices counts configured NUMA nodes)
+    EXE_GPU          = 1,                       ///< GPU executor (GetNumDevices counts HIP devices)
+    EXE_NIC          = 2                        ///< NIC executor (GetNumDevices counts IB Verbs devices)
+  };
+
+  inline bool IsCpuExeType(DeviceType e){ return e == EXE_CPU; }
+  inline bool IsGpuExeType(DeviceType e){ return e == EXE_GPU; }
+  inline bool IsNicExeType(DeviceType e){ return e == EXE_NIC; }
+
+  /**
+   * An ExeDevice identifies a specific Executor: a device type plus an index within that type
+   */
+  struct ExeDevice
+  {
+    DeviceType exeType;                         ///< Device type
+    int32_t exeIndex;                           ///< Device index
+    /// Orders by device type first, then by device index
+    bool operator<(ExeDevice const& other) const {
+      return (exeType != other.exeType) ? (exeType < other.exeType) : (exeIndex < other.exeIndex);
+    }
+  };
+
+
+  /**
+   * A MemDevice names a memory location: a memory type plus the index of the device holding it
+   */
+  struct MemDevice
+  {
+    MemType memType;                            ///< Memory type
+    int32_t memIndex;                           ///< Device index
+    /// Orders by memory type first, then by device index
+    bool operator<(MemDevice const& other) const {
+      return (memType != other.memType) ? (memType < other.memType) : (memIndex < other.memIndex);
+    }
+  };
+
+  inline bool IsCpuMemType(MemType m) { return m == MEM_CPU; }
+  inline bool IsGpuMemType(MemType m) { return m == MEM_GPU; }
+
+  /**
+   * Returns the index of the NUMA node closest to the given GPU
+   *
+   * @param[in] gpuIndex Index of the GPU to query
+   * @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect
+   */
+  int GetClosestCpuNumaToGpu(int gpuIndex);
+
+  /**
+   * Returns the index of the NUMA node closest to the given NIC
+   *
+   * @param[in] nicIndex Index of the NIC to query
+   * @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect
+   */
+  int GetClosestCpuNumaToNic(int nicIndex);
+
+  /**
+   * Returns the index of the NIC closest to the given GPU
+   *
+   * @param[in] gpuIndex Index of the GPU to query
+   * @param[out] dev_name If non-NULL, receives a strdup()-allocated name of the IB Verbs capable NIC closest to GPU gpuIndex (caller must free() it)
+   * @returns index of IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect
+   */
+  int GetClosestNicToGpu(int gpuIndex, char **dev_name);
+
+  /**
+   * Returns information about number of available Devices
+   *
+   * @param[in]  Type    Hardware Device type to query
+   * @returns    Number of detected Devices of type Type
+   */
+  int GetNumDevices(DeviceType Type);
+
+  void DisplayTopology(bool outputToCsv);  ///< Prints the detected CPU/GPU/NIC topology to stdout; outputToCsv selects ',' instead of '|' as column separator
+
+};
+
+//==========================================================================================
+// End of rocshmem API
+//==========================================================================================
+
+// Error check macros
+#define ROCSHMEM_SUCCESS 0
+
+// Evaluates cmd once as an int; a non-zero result is printed with file/line and the process exits
+#define ERR_CHECK(cmd) do {                                                   \
+    int errCheckStatus_ = (cmd);                                              \
+    if (errCheckStatus_ != 0) {                                               \
+      fprintf(stderr, "error: %d at %s:%d\n", errCheckStatus_, __FILE__, __LINE__); \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  } while (0)
+
+// Same as ERR_CHECK, but for calls that return hsa_status_t (success is HSA_STATUS_SUCCESS)
+#define CHECK_HSA(cmd) do {                                                   \
+    hsa_status_t checkHsaStatus_ = (cmd);                                     \
+    if (checkHsaStatus_ != HSA_STATUS_SUCCESS) {                              \
+      fprintf(stderr, "error: %d at %s:%d\n", checkHsaStatus_, __FILE__, __LINE__); \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  } while (0)
+
+
+// Helper macros for calling RDMA functions: a failure is reported on stderr and the process exits
+#ifdef VERBS_DEBUG
+#define IBV_CALL(ibvFunc, ...)                                          \
+  do {                                                                  \
+    int ibvCallStatus_ = ibvFunc(__VA_ARGS__);                          \
+    if (ibvCallStatus_ != 0) {                                          \
+      fprintf(stderr,"Encountered IbVerbs error (%d) at line (%d) "     \
+              "and function (%s)\n", ibvCallStatus_, __LINE__, #ibvFunc); \
+      exit(EXIT_FAILURE);                                               \
+    }                                                                   \
+  } while (0)
+
+#define IBV_PTR_CALL(outPtr, ibvFunc, ...)                              \
+  do {                                                                  \
+    outPtr = ibvFunc(__VA_ARGS__);                                      \
+    if (outPtr == nullptr) {                                            \
+      fprintf(stderr, "Encountered IbVerbs nullptr error at line (%d) " \
+              "and function (%s)\n", __LINE__, #ibvFunc);               \
+      exit(EXIT_FAILURE);                                               \
+    }                                                                   \
+  } while (0)
+#else
+#define IBV_CALL(ibvFunc, ...)                                          \
+  do {                                                                  \
+    int ibvCallStatus_ = ibvFunc(__VA_ARGS__);                          \
+    if (ibvCallStatus_ != 0) {                                          \
+      fprintf(stderr, "Encountered IbVerbs error (%d) in func (%s)\n",  \
+              ibvCallStatus_, #ibvFunc);                                \
+      exit(EXIT_FAILURE);                                               \
+    }                                                                   \
+  } while (0)
+
+#define IBV_PTR_CALL(outPtr, ibvFunc, ...)                              \
+  do {                                                                  \
+    outPtr = ibvFunc(__VA_ARGS__);                                      \
+    if (outPtr == nullptr) {                                            \
+      fprintf(stderr, "Encountered IbVerbs nullptr error in func (%s)\n", \
+               #ibvFunc);                                               \
+      exit(EXIT_FAILURE);                                               \
+    }                                                                   \
+  } while (0)
+#endif
+
diff --git a/src/host/host.cpp b/src/host/host.cpp
index 87e8f351af..4cdc5f33a8 100644
--- a/src/host/host.cpp
+++ b/src/host/host.cpp
@@ -28,8 +28,8 @@
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "host_helpers.hpp"
-#include "../memory/window_info.hpp"
-#include "../util.hpp"
+#include "memory/window_info.hpp"
+#include "util.hpp"
 
 #include <cassert>
 
diff --git a/src/host/host.hpp b/src/host/host.hpp
index ef1ef32563..9c777d2e0b 100644
--- a/src/host/host.hpp
+++ b/src/host/host.hpp
@@ -39,10 +39,10 @@
 #include <map>
 
 #include "rocshmem/rocshmem.hpp"
-#include "../hdp_policy.hpp"
-#include "../memory/symmetric_heap.hpp"
-#include "../memory/window_info.hpp"
-#include "../bootstrap/bootstrap.hpp"
+#include "hdp_policy.hpp"
+#include "memory/symmetric_heap.hpp"
+#include "memory/window_info.hpp"
+#include "bootstrap/bootstrap.hpp"
 
 namespace rocshmem {
 
diff --git a/src/host/host_helpers.hpp b/src/host/host_helpers.hpp
index d6d450a38c..4490c7a9da 100644
--- a/src/host/host_helpers.hpp
+++ b/src/host/host_helpers.hpp
@@ -26,7 +26,7 @@
 #define LIBRARY_SRC_HOST_HOST_HELPERS_HPP_
 
 #include "host.hpp"
-#include "../memory/window_info.hpp"
+#include "memory/window_info.hpp"
 
 #include <cassert>
 
diff --git a/src/host/host_templates.hpp b/src/host/host_templates.hpp
index 79c837fa52..f95ef62e57 100644
--- a/src/host/host_templates.hpp
+++ b/src/host/host_templates.hpp
@@ -27,8 +27,8 @@
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "host_helpers.hpp"
-#include "../memory/window_info.hpp"
-#include "../team.hpp"
+#include "memory/window_info.hpp"
+#include "team.hpp"
 
 #include <utility>
 #include <cassert>
diff --git a/src/ipc/backend_ipc.cpp b/src/ipc/backend_ipc.cpp
index 6aea27b784..895bf924d1 100644
--- a/src/ipc/backend_ipc.cpp
+++ b/src/ipc/backend_ipc.cpp
@@ -61,8 +61,7 @@ int get_ls_non_zero_bit(char *bitmask, int mask_length) {
   return position;
 }
 
-IPCBackend::IPCBackend(MPI_Comm comm)
-    :  Backend(comm) {
+IPCBackend::IPCBackend(MPI_Comm comm):  Backend(comm) {
   type = BackendType::IPC_BACKEND;
 
   initIPC();
@@ -83,8 +82,7 @@ IPCBackend::IPCBackend(MPI_Comm comm)
   init();
 }
 
-IPCBackend::IPCBackend(TcpBootstrap *bootstrap)
-    :  Backend(bootstrap) {
+IPCBackend::IPCBackend(TcpBootstrap *bootstrap):  Backend(bootstrap) {
   type = BackendType::IPC_BACKEND;
 
   initIPC(bootstrap); // no MPI involved
@@ -115,7 +113,7 @@ void IPCBackend::init() {
 
   setup_team_world();
 
-  init_wrk_sync_buffer();
+  setup_wrk_sync_buffers();
 
   rocshmem_collective_init();
 
@@ -203,14 +201,14 @@ void IPCBackend::team_destroy(rocshmem_team_t team) {
   /* Mark the pool as available */
   int bit = team_obj->pool_index_;
   int byte_i = bit / CHAR_BIT;
-  pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT);
+  team_pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT);
 
   team_obj->~IPCTeam();
   CHECK_HIP(hipFree(team_obj));
 }
 
 void IPCBackend::Allreduce_char_BAND (char* inbuf, char *outbuf, size_t num_bytes,
-				      Team *team) {
+                                      Team *team) {
 
   // Implement an Allreduce outside of MPI. This is specialized for the scenario
   // required for the team creation, i.e. assuming bytes and using BAND operation.
@@ -251,16 +249,16 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team,
    * the pool of available work arrays.
    */
   if (team_comm != MPI_COMM_NULL) {
-    NET_CHECK(MPI_Allreduce(pool_bitmask_, reduced_bitmask_, bitmask_size_,
-			    MPI_CHAR, MPI_BAND, team_comm));
+    NET_CHECK(MPI_Allreduce(team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_,
+                            MPI_CHAR, MPI_BAND, team_comm));
   } else {
-    Allreduce_char_BAND (pool_bitmask_, reduced_bitmask_, bitmask_size_, parent_team);
+    Allreduce_char_BAND (team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_, parent_team);
   }
 
   /* Pick the least significant non-zero bit (logical layout) in the reduced
    * bitmask */
   auto max_num_teams{team_tracker.get_max_num_teams()};
-  int common_index = get_ls_non_zero_bit(reduced_bitmask_, max_num_teams);
+  int common_index = get_ls_non_zero_bit(team_reduced_bitmask_, max_num_teams);
   if (common_index < 0) {
     /* No team available */
     printf("Could not create team, all bits in use. Aborting.\n");
@@ -269,7 +267,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team,
 
   /* Mark the team as taken (by unsetting the bit in the pool bitmask) */
   int byte = common_index / CHAR_BIT;
-  pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT));
+  team_pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT));
 
   /**
    * Allocate device-side memory for team_world and
@@ -329,11 +327,11 @@ void IPCBackend::global_exit(int status) {
 }
 
 void IPCBackend::teams_destroy() {
-  free(pool_bitmask_);
-  free(reduced_bitmask_);
+  free(team_pool_bitmask_);
+  free(team_reduced_bitmask_);
 }
 
-void IPCBackend::init_wrk_sync_buffer() {
+void IPCBackend::setup_wrk_sync_buffers() {
   /**
    * calcualte work/sync buffer size
    */
@@ -342,12 +340,12 @@ void IPCBackend::init_wrk_sync_buffer() {
   /**
    * size of barrier sync
    */
-  Wrk_Sync_buffer_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE;
+  wrk_sync_pool_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE;
 
   /**
    * Size of sync arrays for the teams
   */
-  Wrk_Sync_buffer_size_ += sizeof(long) * max_num_teams *
+  wrk_sync_pool_size_ += sizeof(long) * max_num_teams *
                            (ROCSHMEM_BARRIER_SYNC_SIZE +
                             ROCSHMEM_REDUCE_SYNC_SIZE +
                             ROCSHMEM_BCAST_SYNC_SIZE +
@@ -357,23 +355,23 @@ void IPCBackend::init_wrk_sync_buffer() {
    * Size of work arrays for the teams
    * Accommodate largest possible data type for pWrk
   */
-  Wrk_Sync_buffer_size_ += sizeof(double) * max_num_teams *
+  wrk_sync_pool_size_ += sizeof(double) * max_num_teams *
                            (ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE +
                             ROCSHMEM_ATA_MAX_WRKDATA_SIZE);
 
   /**
    * Size of fence array
   */
-  Wrk_Sync_buffer_size_ += sizeof(int) * num_pes;
+  wrk_sync_pool_size_ += sizeof(int) * num_pes;
 
   /**
-   * Allocate a buffer of size Wrk_Sync_buffer_size_, using fine-grained
+   * Allocate a buffer of size wrk_sync_pool_size_, using fine-grained
    * memory allocator
   */
-  fine_grained_allocator_.allocate((void**)&Wrk_Sync_buffer_ptr_,
-                                   Wrk_Sync_buffer_size_);
-  assert(Wrk_Sync_buffer_ptr_);
-  temp_Wrk_Sync_buff_ptr_ = Wrk_Sync_buffer_ptr_;
+  fine_grained_allocator_.allocate((void**)&wrk_sync_pool_,
+                                   wrk_sync_pool_size_);
+  assert(wrk_sync_pool_);
+  wrk_sync_pool_top_ = wrk_sync_pool_;
 
   /*
    * Allocate a c-array to hold the IPC handles
@@ -383,16 +381,16 @@ void IPCBackend::init_wrk_sync_buffer() {
 
   /*
    * Call into the hip runtime to get an IPC handle for the allocated
-   * Wrk_Sync_buffer_ and store that IPC handle
+   * wrk_sync_pool_ buffer and store that IPC handle
    */
-  CHECK_HIP(hipIpcGetMemHandle(&ipc_handle[my_pe], Wrk_Sync_buffer_ptr_));
+  CHECK_HIP(hipIpcGetMemHandle(&ipc_handle[my_pe], wrk_sync_pool_));
 
   /*
    * all-to-all exchange with each PE to share the IPC handles.
    */
   if (backend_comm != MPI_COMM_NULL) {
     MPI_Allgather(MPI_IN_PLACE, sizeof(hipIpcMemHandle_t), MPI_CHAR,
-		  ipc_handle, sizeof(hipIpcMemHandle_t), MPI_CHAR, backend_comm);
+                  ipc_handle, sizeof(hipIpcMemHandle_t), MPI_CHAR, backend_comm);
   } else {
     assert (backend_bootstr != nullptr);
     backend_bootstr->allGather(ipc_handle, sizeof(hipIpcMemHandle_t));
@@ -403,9 +401,9 @@ void IPCBackend::init_wrk_sync_buffer() {
    * work/sync buffers
    */
   fine_grained_allocator_.allocate(
-    reinterpret_cast<void**>(&Wrk_Sync_buffer_bases_),
+    reinterpret_cast<void**>(&wrk_sync_pool_bases_),
     num_pes * sizeof(char*));
-  assert(Wrk_Sync_buffer_bases_);
+  assert(wrk_sync_pool_bases_);
 
   /*
    * For all local processing elements, initialize the device-side array
@@ -414,11 +412,11 @@ void IPCBackend::init_wrk_sync_buffer() {
   for (int i = 0; i < num_pes; i++) {
     if (i != my_pe) {
       CHECK_HIP(hipIpcOpenMemHandle(
-          reinterpret_cast<void**>(&Wrk_Sync_buffer_bases_[i]),
+          reinterpret_cast<void**>(&wrk_sync_pool_bases_[i]),
           ipc_handle[i],
           hipIpcMemLazyEnablePeerAccess));
     } else {
-      Wrk_Sync_buffer_bases_[i] = Wrk_Sync_buffer_ptr_;
+      wrk_sync_pool_bases_[i] = wrk_sync_pool_;
     }
   }
 }
@@ -426,19 +424,19 @@ void IPCBackend::init_wrk_sync_buffer() {
 void IPCBackend::cleanup_wrk_sync_buffer() {
   for (int i = 0; i < num_pes; i++) {
     if (i != my_pe) {
-      CHECK_HIP(hipIpcCloseMemHandle(Wrk_Sync_buffer_bases_[i]));
+      CHECK_HIP(hipIpcCloseMemHandle(wrk_sync_pool_bases_[i]));
     }
   }
-  fine_grained_allocator_.deallocate(Wrk_Sync_buffer_bases_);
-  fine_grained_allocator_.deallocate(Wrk_Sync_buffer_ptr_);
+  fine_grained_allocator_.deallocate(wrk_sync_pool_bases_);
+  fine_grained_allocator_.deallocate(wrk_sync_pool_);
 }
 
 void IPCBackend::setup_fence_buffer() {
   /*
   * Allocate memory for fence
   */
-  fence_pool = reinterpret_cast<int *>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sizeof(int) * num_pes;
+  fence_pool = reinterpret_cast<int *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(int) * num_pes;
 }
 
 void IPCBackend::rocshmem_collective_init() {
@@ -448,8 +446,8 @@ void IPCBackend::rocshmem_collective_init() {
   size_t one_sync_size_bytes {sizeof(*barrier_sync)};
   size_t sync_size_bytes {one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
 
-  barrier_sync = reinterpret_cast<int64_t*>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sync_size_bytes;
+  barrier_sync = reinterpret_cast<int64_t*>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sync_size_bytes;
 
   /*
    * Initialize the barrier synchronization array with default values.
@@ -475,30 +473,30 @@ void IPCBackend::teams_init() {
    */
   auto max_num_teams{team_tracker.get_max_num_teams()};
 
-  barrier_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE
+  barrier_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE
                             * max_num_teams;
 
-  reduce_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE
+  reduce_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE
                             * max_num_teams;
 
-  bcast_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
+  bcast_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
                             * max_num_teams;
 
-  alltoall_pSync_pool = reinterpret_cast<long *>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
+  alltoall_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
                             * max_num_teams;
 
   /* Accommodating for largest possible data type for pWrk */
-  pWrk_pool = reinterpret_cast<void *>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
+  pWrk_pool = reinterpret_cast<void *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
                             * max_num_teams;
 
 
-  pAta_pool = reinterpret_cast<void *>(temp_Wrk_Sync_buff_ptr_);
-  temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE
+  pAta_pool = reinterpret_cast<void *>(wrk_sync_pool_top_);
+  wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE
                             * max_num_teams;
 
   /**
@@ -540,18 +538,18 @@ void IPCBackend::teams_init() {
    * Description shows only a 2-byte long mask but idea extends to any
    * arbitrary size.
    */
-  bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1)
+  team_bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1)
                                              : (max_num_teams / CHAR_BIT);
-  pool_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
-  reduced_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
+  team_pool_bitmask_ = reinterpret_cast<char *>(malloc(team_bitmask_size_));
+  team_reduced_bitmask_ = reinterpret_cast<char *>(malloc(team_bitmask_size_));
 
-  memset(pool_bitmask_, 0, bitmask_size_);
-  memset(reduced_bitmask_, 0, bitmask_size_);
+  memset(team_pool_bitmask_, 0, team_bitmask_size_);
+  memset(team_reduced_bitmask_, 0, team_bitmask_size_);
   /* Set all to available except the 0th one (reserved for TEAM_WORLD) */
   for (int bit_i = 1; bit_i < max_num_teams; bit_i++) {
     int byte_i = bit_i / CHAR_BIT;
 
-    pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT);
+    team_pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT);
   }
 
   /**
diff --git a/src/ipc/backend_ipc.hpp b/src/ipc/backend_ipc.hpp
index b783fc6dc4..3036fcfa28 100644
--- a/src/ipc/backend_ipc.hpp
+++ b/src/ipc/backend_ipc.hpp
@@ -25,14 +25,14 @@
 #ifndef LIBRARY_SRC_IPC_BACKEND_HPP_
 #define LIBRARY_SRC_IPC_BACKEND_HPP_
 
-#include "../backend_bc.hpp"
-#include "../containers/free_list_impl.hpp"
-#include "../hdp_proxy.hpp"
-#include "../memory/hip_allocator.hpp"
-#include "../context_incl.hpp"
+#include "backend_bc.hpp"
+#include "containers/free_list_impl.hpp"
+#include "hdp_proxy.hpp"
+#include "memory/hip_allocator.hpp"
+#include "context_incl.hpp"
 #include "ipc_context_proxy.hpp"
-#include "../ipc_policy.hpp"
-#include "../bootstrap/bootstrap.hpp"
+#include "ipc_policy.hpp"
+#include "bootstrap/bootstrap.hpp"
 
 namespace rocshmem {
 
@@ -113,7 +113,7 @@ class IPCBackend : public Backend {
    *
    * @return Vector containing the addresses of the work/sync bases
    */
-  char** get_wrk_sync_bases() { return Wrk_Sync_buffer_bases_; }
+  char** get_wrk_sync_bases() { return wrk_sync_pool_bases_; }
 
   /**
    * @brief The host-facing interface that will be used
@@ -244,7 +244,7 @@ class IPCBackend : public Backend {
   /**
    * @brief The bitmask representing the availability of teams in the pool
    */
-  char *pool_bitmask_{nullptr};
+  char *team_pool_bitmask_{nullptr};
 
   /**
    * @brief Bitmask to store the reduced result of bitmasks on pariticipating
@@ -253,12 +253,12 @@ class IPCBackend : public Backend {
    * With no thread-safety for this bitmask, multithreaded creation of teams is
    * not supported.
    */
-  char *reduced_bitmask_{nullptr};
+  char *team_reduced_bitmask_{nullptr};
 
   /**
    * @brief Size of the bitmask
    */
-  int bitmask_size_{-1};
+  int team_bitmask_size_{-1};
 
   /**
    * Fine grained memory allocator for buffers used in collectives Routines
@@ -268,31 +268,31 @@ class IPCBackend : public Backend {
   /**
    * @brief Collective routines work/sync buffer size
    */
-  size_t Wrk_Sync_buffer_size_{};
+  size_t wrk_sync_pool_size_{};
 
   /**
    * @brief Collective routines work/sync buffer base ptr
    */
-  char* const Wrk_Sync_buffer_ptr_{nullptr};
+  char* const wrk_sync_pool_{nullptr};
 
   /**
    * @brief Temporary buffer pointer pointing to the same address as
-   * Wrk_Sync_buffer_ptr_, used to calculate the starting addresses of
+   * wrk_sync_pool_, used to calculate the starting addresses of
    * different work and sync buffers.
   */
-  char *temp_Wrk_Sync_buff_ptr_{nullptr};
+  char *wrk_sync_pool_top_{nullptr};
 
   /**
    * @brief Array containing the addresses of the work/sync buffer bases
    * of other PEs
   */
-  char** Wrk_Sync_buffer_bases_{nullptr};
+  char** wrk_sync_pool_bases_{nullptr};
 
   /**
    * @brief Initialize memory required for work/sync buffers and open IPC
-   * handle on PE's Wrk_Sync_buffer_ptr.
+   * handle on PE's wrk_sync_pool_.
    */
-  void init_wrk_sync_buffer();
+  void setup_wrk_sync_buffers();
 
   /**
    * @brief Close IPC memory handles for work/sync buffers and deallocate
diff --git a/src/ipc/context_ipc_device.cpp b/src/ipc/context_ipc_device.cpp
index f68144ceae..d6454c58ce 100644
--- a/src/ipc/context_ipc_device.cpp
+++ b/src/ipc/context_ipc_device.cpp
@@ -22,19 +22,14 @@
  * IN THE SOFTWARE.
  *****************************************************************************/
 
-#include "context_ipc_device.hpp"
-#include "context_ipc_tmpl_device.hpp"
-
 #include <hip/hip_runtime.h>
 #include <hip/amd_detail/amd_device_functions.h>
-#include <unistd.h>
-
-#include <cstdio>
-#include <cstdlib>
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "rocshmem/rocshmem.hpp"
 #include "backend_ipc.hpp"
+#include "context_ipc_device.hpp"
+#include "context_ipc_tmpl_device.hpp"
 
 namespace rocshmem {
 
@@ -46,7 +41,7 @@ __host__ IPCContext::IPCContext(Backend *b, unsigned int ctx_id)
 
   barrier_sync = backend->barrier_sync;
   fence_pool = backend->fence_pool;
-  Wrk_Sync_buffer_bases_ = backend->get_wrk_sync_bases();
+  wrk_sync_pool_bases_ = backend->get_wrk_sync_bases();
   ctx_id_ = ctx_id;
 
   orders_.store = detail::atomic::rocshmem_memory_order::memory_order_seq_cst;
@@ -64,18 +59,15 @@ __device__ void IPCContext::ctx_destroy(){
 
 __device__ void IPCContext::putmem(void *dest, const void *source, size_t nelems,
                                   int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[pe] + L_offset,
-                   const_cast<void *>(source), nelems);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[pe] + L_offset, const_cast<void *>(source), nelems);
   ipcImpl_.ipcFence();
 }
 
 __device__ void IPCContext::getmem(void *dest, const void *source, size_t nelems,
                                   int pe) {
   const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset =
-      const_cast<char *>(src_typed) - ipcImpl_.ipc_bases[my_pe];
+  uint64_t L_offset = const_cast<char *>(src_typed) - ipcImpl_.ipc_bases[my_pe];
   ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[pe] + L_offset, nelems);
   ipcImpl_.ipcFence();
 }
@@ -107,26 +99,22 @@ __device__ void IPCContext::quiet() {
 __device__ void *IPCContext::shmem_ptr(const void *dest, int pe) {
   void *ret = nullptr;
   void *dst = const_cast<void *>(dest);
-    uint64_t L_offset =
-        reinterpret_cast<char *>(dst) - ipcImpl_.ipc_bases[my_pe];
-    ret = ipcImpl_.ipc_bases[pe] + L_offset;
+  uint64_t L_offset = reinterpret_cast<char *>(dst) - ipcImpl_.ipc_bases[my_pe];
+  ret = ipcImpl_.ipc_bases[pe] + L_offset;
   return ret;
 }
 
 __device__ void IPCContext::putmem_wg(void *dest, const void *source,
                                      size_t nelems, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[pe] + L_offset,
-                      const_cast<void *>(source), nelems);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[pe] + L_offset, const_cast<void *>(source), nelems);
   __syncthreads();
 }
 
 __device__ void IPCContext::getmem_wg(void *dest, const void *source,
                                      size_t nelems, int pe) {
   const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset =
-      const_cast<char *>(src_typed) - ipcImpl_.ipc_bases[my_pe];
+  uint64_t L_offset = const_cast<char *>(src_typed) - ipcImpl_.ipc_bases[my_pe];
   ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[pe] + L_offset, nelems);
   __syncthreads();
 }
@@ -143,20 +131,16 @@ __device__ void IPCContext::getmem_nbi_wg(void *dest, const void *source,
 
 __device__ void IPCContext::putmem_wave(void *dest, const void *source,
                                        size_t nelems, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[pe] + L_offset,
-                        const_cast<void *>(source), nelems);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[pe] + L_offset, const_cast<void *>(source), nelems);
   ipcImpl_.ipcFence();
 }
 
 __device__ void IPCContext::getmem_wave(void *dest, const void *source,
                                        size_t nelems, int pe) {
   const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset =
-      const_cast<char *>(src_typed) - ipcImpl_.ipc_bases[my_pe];
-  ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[pe] + L_offset,
-                        nelems);
+  uint64_t L_offset = const_cast<char *>(src_typed) - ipcImpl_.ipc_bases[my_pe];
+  ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[pe] + L_offset, nelems);
   ipcImpl_.ipcFence();
 }
 
@@ -172,56 +156,46 @@ __device__ void IPCContext::getmem_nbi_wave(void *dest, const void *source,
 
 __device__ void IPCContext::internal_putmem(void *dest, const void *source,
                                             size_t nelems, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - Wrk_Sync_buffer_bases_[my_pe];
-  memcpy(Wrk_Sync_buffer_bases_[pe] + L_offset,
-                   const_cast<void *>(source), nelems);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - wrk_sync_pool_bases_[my_pe];
+  memcpy(wrk_sync_pool_bases_[pe] + L_offset, const_cast<void *>(source), nelems);
   ipcImpl_.ipcFence();
 }
 
 __device__ void IPCContext::internal_getmem(void *dest, const void *source,
                                             size_t nelems, int pe) {
   const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset =
-      const_cast<char *>(src_typed) - Wrk_Sync_buffer_bases_[my_pe];
-  memcpy(dest, Wrk_Sync_buffer_bases_[pe] + L_offset, nelems);
+  uint64_t L_offset = const_cast<char *>(src_typed) - wrk_sync_pool_bases_[my_pe];
+  memcpy(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems);
   ipcImpl_.ipcFence();
 }
 
 __device__ void IPCContext::internal_putmem_wg(void *dest, const void *source,
                                      size_t nelems, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - Wrk_Sync_buffer_bases_[my_pe];
-  memcpy_wg(Wrk_Sync_buffer_bases_[pe] + L_offset,
-                      const_cast<void *>(source), nelems);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - wrk_sync_pool_bases_[my_pe];
+  memcpy_wg(wrk_sync_pool_bases_[pe] + L_offset, const_cast<void *>(source), nelems);
   __syncthreads();
 }
 
 __device__ void IPCContext::internal_getmem_wg(void *dest, const void *source,
                                      size_t nelems, int pe) {
   const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset =
-      const_cast<char *>(src_typed) - Wrk_Sync_buffer_bases_[my_pe];
-  memcpy_wg(dest, Wrk_Sync_buffer_bases_[pe] + L_offset, nelems);
+  uint64_t L_offset = const_cast<char *>(src_typed) - wrk_sync_pool_bases_[my_pe];
+  memcpy_wg(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems);
   __syncthreads();
 }
 
 __device__ void IPCContext::internal_putmem_wave(void *dest,
                         const void *source, size_t nelems, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - Wrk_Sync_buffer_bases_[my_pe];
-  memcpy_wave(Wrk_Sync_buffer_bases_[pe] + L_offset,
-                        const_cast<void *>(source), nelems);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - wrk_sync_pool_bases_[my_pe];
+  memcpy_wave(wrk_sync_pool_bases_[pe] + L_offset, const_cast<void *>(source), nelems);
   ipcImpl_.ipcFence();
 }
 
 __device__ void IPCContext::internal_getmem_wave(void *dest,
                         const void *source, size_t nelems, int pe) {
   const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset =
-      const_cast<char *>(src_typed) - Wrk_Sync_buffer_bases_[my_pe];
-  memcpy_wave(dest, Wrk_Sync_buffer_bases_[pe] + L_offset,
-                        nelems);
+  uint64_t L_offset = const_cast<char *>(src_typed) - wrk_sync_pool_bases_[my_pe];
+  memcpy_wave(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems);
   ipcImpl_.ipcFence();
 }
 
diff --git a/src/ipc/context_ipc_device.hpp b/src/ipc/context_ipc_device.hpp
index 03450a4924..046bc1d31e 100644
--- a/src/ipc/context_ipc_device.hpp
+++ b/src/ipc/context_ipc_device.hpp
@@ -25,9 +25,9 @@
 #ifndef LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_
 #define LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_
 
-#include "../context.hpp"
-#include "../atomic.hpp"
-#include "../team.hpp"
+#include "context.hpp"
+#include "atomic.hpp"
+#include "team.hpp"
 
 namespace rocshmem {
 
@@ -235,8 +235,8 @@ class IPCContext : public Context {
   //internal functions used by collective operations
   template <typename T>
   __device__ void internal_broadcast(T *dest, const T *source, int nelems, int pe_root,
-                            int pe_start, int stride, int pe_size,
-                            long *p_sync);  // NOLINT(runtime/int)
+                                     int pe_start, int stride, int pe_size,
+                                     long *p_sync);  // NOLINT(runtime/int)
 
   template <typename T>
   __device__ void internal_put_broadcast(T *dst, const T *src, int nelems,
@@ -311,7 +311,7 @@ class IPCContext : public Context {
    * @brief Array containing the addresses of the work/sync buffer bases
    * of other PEs
   */
-  char **Wrk_Sync_buffer_bases_{nullptr};
+  char **wrk_sync_pool_bases_{nullptr};
 
   /**
    * @brief Decive context Id
diff --git a/src/ipc/context_ipc_device_coll.cpp b/src/ipc/context_ipc_device_coll.cpp
index 72223aa0d5..78547f701c 100644
--- a/src/ipc/context_ipc_device_coll.cpp
+++ b/src/ipc/context_ipc_device_coll.cpp
@@ -23,16 +23,16 @@
  *****************************************************************************/
 
 #include "rocshmem/rocshmem.hpp"
-#include "../context_incl.hpp"
+#include "context_incl.hpp"
 #include "context_ipc_tmpl_device.hpp"
-#include "../util.hpp"
+#include "util.hpp"
 #include "ipc_team.hpp"
 
 namespace rocshmem {
 
 __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
-                                                      int stride, int n_pes,
-                                                      int64_t *pSync) {
+                                                    int stride, int n_pes,
+                                                    int64_t *pSync) {
   int64_t flag_val = 1;
   if (pe == PE_start) {
     // Go through all PE offsets (except current offset = 0)
@@ -67,8 +67,8 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
 }
 
 __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start,
-                                                      int stride, int n_pes,
-                                                      int64_t *pSync) {
+                                                    int stride, int n_pes,
+                                                    int64_t *pSync) {
   int64_t flag_val = 1;
   if (pe == PE_start) {
     wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
@@ -96,7 +96,7 @@ __device__ void IPCContext::internal_sync(int pe, int PE_start, int stride,
 }
 
 __device__ void IPCContext::internal_sync_wave(int pe, int PE_start, int stride,
-                                          int PE_size, int64_t *pSync) {
+                                               int PE_size, int64_t *pSync) {
   if (is_thread_zero_in_wave()) {
     if (PE_size < 64) {
       internal_direct_barrier(pe, PE_start, stride, PE_size, pSync);
@@ -108,7 +108,7 @@ __device__ void IPCContext::internal_sync_wave(int pe, int PE_start, int stride,
 
 // Uses PE values that are relative to world
 __device__ void IPCContext::internal_sync_wg(int pe, int PE_start, int stride,
-                                          int PE_size, int64_t *pSync) {
+                                             int PE_size, int64_t *pSync) {
   __syncthreads();
   if (is_thread_zero_in_block()) {
     if (PE_size < 64) {
diff --git a/src/ipc/context_ipc_host.cpp b/src/ipc/context_ipc_host.cpp
index 7c459df1ba..e6e630355d 100644
--- a/src/ipc/context_ipc_host.cpp
+++ b/src/ipc/context_ipc_host.cpp
@@ -27,15 +27,15 @@
 #include <mpi.h>
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../backend_type.hpp"
-#include "../context_incl.hpp"
+#include "backend_type.hpp"
+#include "context_incl.hpp"
 #include "backend_ipc.hpp"
-#include "../host/host.hpp"
+#include "host/host.hpp"
 
 namespace rocshmem {
 
 __host__ IPCHostContext::IPCHostContext(Backend *backend,
-                                            [[maybe_unused]] int64_t options)
+                                        [[maybe_unused]] int64_t options)
     : Context(backend, true) {
   IPCBackend *b{static_cast<IPCBackend *>(backend)};
 
@@ -60,22 +60,22 @@ __host__ IPCHostContext::~IPCHostContext() {
 }
 
 __host__ void IPCHostContext::putmem_nbi(void *dest, const void *source,
-                                           size_t nelems, int pe) {
+                                         size_t nelems, int pe) {
   host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info);
 }
 
 __host__ void IPCHostContext::getmem_nbi(void *dest, const void *source,
-                                           size_t nelems, int pe) {
+                                         size_t nelems, int pe) {
   host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info);
 }
 
 __host__ void IPCHostContext::putmem(void *dest, const void *source,
-                                       size_t nelems, int pe) {
+                                     size_t nelems, int pe) {
   host_interface->putmem(dest, source, nelems, pe, context_window_info);
 }
 
 __host__ void IPCHostContext::getmem(void *dest, const void *source,
-                                       size_t nelems, int pe) {
+                                     size_t nelems, int pe) {
   host_interface->getmem(dest, source, nelems, pe, context_window_info);
 }
 
diff --git a/src/ipc/context_ipc_host.hpp b/src/ipc/context_ipc_host.hpp
index ddec120299..e14f905035 100644
--- a/src/ipc/context_ipc_host.hpp
+++ b/src/ipc/context_ipc_host.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_IPC_CONTEXT_HOST_HPP_
 #define LIBRARY_SRC_IPC_CONTEXT_HOST_HPP_
 
-#include "../context.hpp"
+#include "context.hpp"
 
 namespace rocshmem {
 
@@ -116,9 +116,9 @@ class IPCHostContext : public Context {
 
   template <typename T>
   __host__ size_t wait_until_some(T *ivars, size_t nelems,
-                                size_t* indices,
-                                const int *status,
-                                int cmp, T val);
+                                  size_t* indices,
+                                  const int *status,
+                                  int cmp, T val);
 
   template <typename T>
   __host__ void wait_until_all_vector(T *ivars, size_t nelems,
diff --git a/src/ipc/context_ipc_tmpl_device.hpp b/src/ipc/context_ipc_tmpl_device.hpp
index 13094d8091..d35cec7f44 100644
--- a/src/ipc/context_ipc_tmpl_device.hpp
+++ b/src/ipc/context_ipc_tmpl_device.hpp
@@ -28,9 +28,9 @@
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "rocshmem/rocshmem.hpp"
 #include "context_ipc_device.hpp"
-#include "../util.hpp"
+#include "util.hpp"
 #include "ipc_team.hpp"
-#include "../rocshmem_calc.hpp"
+#include "rocshmem_calc.hpp"
 
 #include <hip/hip_runtime.h>
 
@@ -45,14 +45,12 @@ __device__ void IPCContext::p(T *dest, T value, int pe) {
 }
 
 template <typename T>
-__device__ void IPCContext::put(T *dest, const T *source, size_t nelems,
-                                int pe) {
+__device__ void IPCContext::put(T *dest, const T *source, size_t nelems, int pe) {
   putmem(dest, source, nelems * sizeof(T), pe);
 }
 
 template <typename T>
-__device__ void IPCContext::put_nbi(T *dest, const T *source, size_t nelems,
-                                    int pe) {
+__device__ void IPCContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) {
   putmem_nbi(dest, source, sizeof(T) * nelems, pe);
 }
 
@@ -64,32 +62,26 @@ __device__ T IPCContext::g(const T *source, int pe) {
 }
 
 template <typename T>
-__device__ void IPCContext::get(T *dest, const T *source, size_t nelems,
-                                int pe) {
+__device__ void IPCContext::get(T *dest, const T *source, size_t nelems, int pe) {
   getmem(dest, source, sizeof(T) * nelems, pe);
 }
 
 template <typename T>
-__device__ void IPCContext::get_nbi(T *dest, const T *source, size_t nelems,
-                                    int pe) {
+__device__ void IPCContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) {
   getmem_nbi(dest, source, sizeof(T) * nelems, pe);
 }
 
 // Atomics
 template <typename T>
 __device__ void IPCContext::amo_add(void *dest, T value, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  ipcImpl_.ipcAMOAdd(
-      reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  ipcImpl_.ipcAMOAdd(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
 }
 
 template <typename T>
 __device__ void IPCContext::amo_set(void *dest, T value, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  ipcImpl_.ipcAMOSet(
-      reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  ipcImpl_.ipcAMOSet(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
 }
 
 template <typename T>
@@ -140,34 +132,25 @@ __device__ void IPCContext::amo_xor(void *dst, T value, int pe) {
 
 template <typename T>
 __device__ void IPCContext::amo_cas(void *dest, T value, T cond, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  ipcImpl_.ipcAMOCas(
-      reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), cond,
-      value);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  ipcImpl_.ipcAMOCas(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), cond, value);
 }
 
 template <typename T>
 __device__ T IPCContext::amo_fetch_add(void *dest, T value, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  return ipcImpl_.ipcAMOFetchAdd(
-      reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  return ipcImpl_.ipcAMOFetchAdd(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
 }
 
 template <typename T>
 __device__ T IPCContext::amo_fetch_cas(void *dest, T value, T cond, int pe) {
-  uint64_t L_offset =
-      reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
-  return ipcImpl_.ipcAMOFetchCas(
-      reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), cond,
-      value);
+  uint64_t L_offset = reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
+  return ipcImpl_.ipcAMOFetchCas(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), cond, value);
 }
 
 // Collectives
 template <typename T, ROCSHMEM_OP Op>
-__device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
-                               int wg_size) {
+__device__ void compute_reduce(T *src, T *dst, int size, int wg_id, int wg_size) {
   for (int i = wg_id; i < size; i += wg_size) {
     OpWrap<Op>::Calc(src, dst, i);
   }
@@ -543,50 +526,42 @@ __device__ void IPCContext::fcollect_linear(rocshmem_team_t team, T *dst,
 
 // Block/wave functions
 template <typename T>
-__device__ void IPCContext::put_wg(T *dest, const T *source, size_t nelems,
-                                   int pe) {
+__device__ void IPCContext::put_wg(T *dest, const T *source, size_t nelems, int pe) {
   putmem_wg(dest, source, nelems * sizeof(T), pe);
 }
 
 template <typename T>
-__device__ void IPCContext::put_nbi_wg(T *dest, const T *source,
-                                       size_t nelems, int pe) {
+__device__ void IPCContext::put_nbi_wg(T *dest, const T *source, size_t nelems, int pe) {
   putmem_nbi_wg(dest, source, nelems * sizeof(T), pe);
 }
 
   template <typename T>
-__device__ void IPCContext::put_wave(T *dest, const T *source, size_t nelems,
-                                     int pe) {
+__device__ void IPCContext::put_wave(T *dest, const T *source, size_t nelems, int pe) {
   putmem_wave(dest, source, nelems * sizeof(T), pe);
 }
 
 template <typename T>
-__device__ void IPCContext::put_nbi_wave(T *dest, const T *source,
-                                         size_t nelems, int pe) {
+__device__ void IPCContext::put_nbi_wave(T *dest, const T *source, size_t nelems, int pe) {
   putmem_nbi_wave(dest, source, nelems * sizeof(T), pe);
 }
 
 template <typename T>
-__device__ void IPCContext::get_wg(T *dest, const T *source, size_t nelems,
-                                   int pe) {
+__device__ void IPCContext::get_wg(T *dest, const T *source, size_t nelems, int pe) {
   getmem_wg(dest, source, nelems * sizeof(T), pe);
 }
 
 template <typename T>
-__device__ void IPCContext::get_nbi_wg(T *dest, const T *source,
-                                       size_t nelems, int pe) {
+__device__ void IPCContext::get_nbi_wg(T *dest, const T *source, size_t nelems, int pe) {
   getmem_nbi_wg(dest, source, nelems * sizeof(T), pe);
 }
 
 template <typename T>
-__device__ void IPCContext::get_wave(T *dest, const T *source, size_t nelems,
-                                     int pe) {
+__device__ void IPCContext::get_wave(T *dest, const T *source, size_t nelems, int pe) {
   getmem_wave(dest, source, nelems * sizeof(T), pe);
 }
 
 template <typename T>
-__device__ void IPCContext::get_nbi_wave(T *dest, const T *source,
-                                         size_t nelems, int pe) {
+__device__ void IPCContext::get_nbi_wave(T *dest, const T *source, size_t nelems, int pe) {
   getmem_nbi_wave(dest, source, nelems * sizeof(T), pe);
 }
 
diff --git a/src/ipc/context_ipc_tmpl_host.hpp b/src/ipc/context_ipc_tmpl_host.hpp
index aad55260c4..f8fc4aa8cb 100644
--- a/src/ipc/context_ipc_tmpl_host.hpp
+++ b/src/ipc/context_ipc_tmpl_host.hpp
@@ -26,7 +26,7 @@
 #define LIBRARY_SRC_IPC_CONTEXT_TMPL_HOST_HPP_
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../host/host_templates.hpp"
+#include "host/host_templates.hpp"
 
 namespace rocshmem {
 
@@ -41,26 +41,22 @@ __host__ T IPCHostContext::g(const T *source, int pe) {
 }
 
 template <typename T>
-__host__ void IPCHostContext::put(T *dest, const T *source, size_t nelems,
-                                    int pe) {
+__host__ void IPCHostContext::put(T *dest, const T *source, size_t nelems, int pe) {
   host_interface->put<T>(dest, source, nelems, pe, context_window_info);
 }
 
 template <typename T>
-__host__ void IPCHostContext::get(T *dest, const T *source, size_t nelems,
-                                    int pe) {
+__host__ void IPCHostContext::get(T *dest, const T *source, size_t nelems, int pe) {
   host_interface->get<T>(dest, source, nelems, pe, context_window_info);
 }
 
 template <typename T>
-__host__ void IPCHostContext::put_nbi(T *dest, const T *source, size_t nelems,
-                                        int pe) {
+__host__ void IPCHostContext::put_nbi(T *dest, const T *source, size_t nelems, int pe) {
   host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
 }
 
 template <typename T>
-__host__ void IPCHostContext::get_nbi(T *dest, const T *source, size_t nelems,
-                                        int pe) {
+__host__ void IPCHostContext::get_nbi(T *dest, const T *source, size_t nelems, int pe) {
   host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
 }
 
@@ -81,8 +77,7 @@ __host__ T IPCHostContext::amo_fetch_add(void *dst, T value, int pe) {
 
 template <typename T>
 __host__ T IPCHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
-  return host_interface->amo_fetch_cas(dst, value, cond, pe,
-                                       context_window_info);
+  return host_interface->amo_fetch_cas(dst, value, cond, pe, context_window_info);
 }
 
 template <typename T>
@@ -96,23 +91,23 @@ __host__ void IPCHostContext::broadcast(
 
 template <typename T>
 __host__ void IPCHostContext::broadcast(rocshmem_team_t team, T *dest,
-                                          const T *source, int nelems,
-                                          int pe_root) {
+                                        const T *source, int nelems,
+                                        int pe_root) {
   host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
 }
 
 template <typename T, ROCSHMEM_OP Op>
 __host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce,
-                                       int pe_start, int log_pe_stride,
-                                       int pe_size, T *p_wrk,
-                                       long *p_sync) {  // NOLINT(runtime/int)
+                                     int pe_start, int log_pe_stride,
+                                     int pe_size, T *p_wrk,
+                                     long *p_sync) {  // NOLINT(runtime/int)
   host_interface->to_all<T, Op>(dest, source, nreduce, pe_start, log_pe_stride,
                                 pe_size, p_wrk, p_sync);
 }
 
 template <typename T, ROCSHMEM_OP Op>
 __host__ int IPCHostContext::reduce(rocshmem_team_t team, T *dest,
-                                       const T *source, int nreduce) {
+                                    const T *source, int nreduce) {
   return host_interface->reduce<T, Op>(team, dest, source, nreduce);
 }
 
@@ -123,8 +118,8 @@ __host__ void IPCHostContext::wait_until(T *ivars, int cmp, T val) {
 
 template <typename T>
 __host__ void IPCHostContext::wait_until_all(T *ivars, size_t nelems,
-                                               const int* status,
-                                               int cmp, T val) {
+                                             const int* status,
+                                             int cmp, T val) {
   host_interface->wait_until_all<T>(ivars, nelems, status, cmp, val, context_window_info);
 }
 
@@ -137,31 +132,31 @@ __host__ size_t IPCHostContext::wait_until_any(T *ivars, size_t nelems,
 
 template <typename T>
 __host__ size_t IPCHostContext::wait_until_some(T *ivars, size_t nelems,
-                                                 size_t* indices,
-                                                 const int* status,
-                                                 int cmp, T val) {
+                                                size_t* indices,
+                                                const int* status,
+                                                int cmp, T val) {
   return host_interface->wait_until_some<T>(ivars, nelems, indices, status, cmp, val, context_window_info);
 }
 
 template <typename T>
 __host__ void IPCHostContext::wait_until_all_vector(T *ivars, size_t nelems,
-                                                      const int* status,
-                                                      int cmp, T* vals) {
+                                                    const int* status,
+                                                    int cmp, T* vals) {
   host_interface->wait_until_all_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
 }
 
 template <typename T>
 __host__ size_t IPCHostContext::wait_until_any_vector(T *ivars, size_t nelems,
-                                                        const int* status,
-                                                        int cmp, T* vals) {
+                                                      const int* status,
+                                                      int cmp, T* vals) {
   return host_interface->wait_until_any_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
 }
 
 template <typename T>
 __host__ size_t IPCHostContext::wait_until_some_vector(T *ivars, size_t nelems,
-                                                         size_t* indices,
-                                                         const int* status,
-                                                         int cmp, T* vals) {
+                                                       size_t* indices,
+                                                       const int* status,
+                                                       int cmp, T* vals) {
   return host_interface->wait_until_some_vector<T>(ivars, nelems, indices, status, cmp, vals, context_window_info);
 }
 
diff --git a/src/ipc/ipc_context_proxy.hpp b/src/ipc/ipc_context_proxy.hpp
index 6b9d22204f..68fa15637f 100644
--- a/src/ipc/ipc_context_proxy.hpp
+++ b/src/ipc/ipc_context_proxy.hpp
@@ -26,7 +26,7 @@
 #define LIBRARY_SRC_IPC_CONTEXT_PROXY_HPP_
 
 
-#include "../device_proxy.hpp"
+#include "device_proxy.hpp"
 #include "backend_ipc.hpp"
 
 namespace rocshmem {
diff --git a/src/ipc/ipc_team.cpp b/src/ipc/ipc_team.cpp
index bdb8d75209..757bbe059c 100644
--- a/src/ipc/ipc_team.cpp
+++ b/src/ipc/ipc_team.cpp
@@ -24,14 +24,15 @@
 
 #include "ipc_team.hpp"
 
-#include "../backend_type.hpp"
+#include "constants.hpp"
+#include "backend_type.hpp"
 #include "backend_ipc.hpp"
 
 namespace rocshmem {
 
 IPCTeam::IPCTeam(Backend *backend, TeamInfo *team_info_parent,
-                     TeamInfo *team_info_world, int num_pes, int my_pe,
-                     MPI_Comm mpi_comm, int pool_index)
+                 TeamInfo *team_info_world, int num_pes, int my_pe,
+                 MPI_Comm mpi_comm, int pool_index)
     : Team(backend, team_info_parent, team_info_world, num_pes, my_pe,
            mpi_comm) {
   type = BackendType::IPC_BACKEND;
@@ -39,18 +40,13 @@ IPCTeam::IPCTeam(Backend *backend, TeamInfo *team_info_parent,
 
   pool_index_ = pool_index;
 
-  barrier_pSync =
-      &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
-  reduce_pSync =
-      &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
+  barrier_pSync = &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
+  reduce_pSync = &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
   bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
-  alltoall_pSync =
-      &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
+  alltoall_pSync = &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
 
-  pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
-         ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
-  pAta = reinterpret_cast<char *>(b->pAta_pool) +
-         ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
+  pWrk = reinterpret_cast<char *>(b->pWrk_pool) + ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
+  pAta = reinterpret_cast<char *>(b->pAta_pool) + ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
 }
 
 IPCTeam::~IPCTeam() {}
diff --git a/src/ipc/ipc_team.hpp b/src/ipc/ipc_team.hpp
index 74039692dd..59a4200748 100644
--- a/src/ipc/ipc_team.hpp
+++ b/src/ipc/ipc_team.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_IPC_TEAM_HPP_
 #define LIBRARY_SRC_IPC_TEAM_HPP_
 
-#include "../team.hpp"
+#include "team.hpp"
 
 namespace rocshmem {
 
diff --git a/src/ipc_policy.hpp b/src/ipc_policy.hpp
index 7f1e17c925..bae17faf08 100644
--- a/src/ipc_policy.hpp
+++ b/src/ipc_policy.hpp
@@ -186,7 +186,7 @@ class IpcOffImpl {
 /*
  * Select which one of our IPC policies to use at compile time.
  */
-#ifdef USE_IPC
+#if defined(USE_IPC)
 typedef IpcOnImpl IpcImpl;
 #else
 typedef IpcOffImpl IpcImpl;
diff --git a/src/memory/binner.hpp b/src/memory/binner.hpp
index d62be5bd6b..a66d5682fb 100644
--- a/src/memory/binner.hpp
+++ b/src/memory/binner.hpp
@@ -30,7 +30,7 @@
 #include <iostream>
 #include <vector>
 
-#include "../constants.hpp"
+#include "constants.hpp"
 #include "bin.hpp"
 
 /**
diff --git a/src/memory/dlmalloc.hpp b/src/memory/dlmalloc.hpp
index f1d2d3b462..e427fbeae3 100644
--- a/src/memory/dlmalloc.hpp
+++ b/src/memory/dlmalloc.hpp
@@ -28,7 +28,7 @@
 #include <cassert>
 #include <map>
 
-#include "../constants.hpp"
+#include "constants.hpp"
 #include "shmem_allocator_strategy.hpp"
 
 /**
diff --git a/src/memory/memory_allocator.cpp b/src/memory/memory_allocator.cpp
index b9eaa96fe4..0c0177f9f8 100644
--- a/src/memory/memory_allocator.cpp
+++ b/src/memory/memory_allocator.cpp
@@ -26,7 +26,7 @@
 
 #include <cassert>
 
-#include "../util.hpp"
+#include "util.hpp"
 
 namespace rocshmem {
 
diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp
index 6805b86cbb..97a4e43ff4 100644
--- a/src/memory/notifier.hpp
+++ b/src/memory/notifier.hpp
@@ -25,9 +25,9 @@
 #ifndef LIBRARY_SRC_MEMORY_NOTIFIER_HPP_
 #define LIBRARY_SRC_MEMORY_NOTIFIER_HPP_
 
-#include "../device_proxy.hpp"
-#include "../util.hpp"
-#include "../atomic.hpp"
+#include "device_proxy.hpp"
+#include "util.hpp"
+#include "atomic.hpp"
 
 namespace rocshmem {
 
diff --git a/src/memory/pow2_bins.hpp b/src/memory/pow2_bins.hpp
index 3514c9d927..904f2dd380 100644
--- a/src/memory/pow2_bins.hpp
+++ b/src/memory/pow2_bins.hpp
@@ -28,7 +28,7 @@
 #include <cassert>
 #include <map>
 
-#include "../constants.hpp"
+#include "constants.hpp"
 #include "bin.hpp"
 #include "binner.hpp"
 #include "shmem_allocator_strategy.hpp"
diff --git a/src/memory/remote_heap_info.hpp b/src/memory/remote_heap_info.hpp
index 9918540668..29286d6dac 100644
--- a/src/memory/remote_heap_info.hpp
+++ b/src/memory/remote_heap_info.hpp
@@ -32,7 +32,7 @@
 
 #include "hip_allocator.hpp"
 #include "window_info.hpp"
-#include "../bootstrap/bootstrap.hpp"
+#include "bootstrap/bootstrap.hpp"
 
 /**
  * @file remote_heap_info.hpp
diff --git a/src/memory/symmetric_heap.hpp b/src/memory/symmetric_heap.hpp
index f7d3cf4871..c823918c3b 100644
--- a/src/memory/symmetric_heap.hpp
+++ b/src/memory/symmetric_heap.hpp
@@ -45,7 +45,7 @@
 
 #include "remote_heap_info.hpp"
 #include "single_heap.hpp"
-#include "../bootstrap/bootstrap.hpp"
+#include "bootstrap/bootstrap.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/backend_proxy.hpp b/src/reverse_offload/backend_proxy.hpp
index 7efad56b77..0d16178eed 100644
--- a/src/reverse_offload/backend_proxy.hpp
+++ b/src/reverse_offload/backend_proxy.hpp
@@ -27,8 +27,8 @@
 
 #include <atomic>
 
-#include "../device_proxy.hpp"
-#include "../stats.hpp"
+#include "device_proxy.hpp"
+#include "stats.hpp"
 #include "queue.hpp"
 
 namespace rocshmem {
diff --git a/src/reverse_offload/backend_ro.cpp b/src/reverse_offload/backend_ro.cpp
index c696fc5004..d7188709cc 100644
--- a/src/reverse_offload/backend_ro.cpp
+++ b/src/reverse_offload/backend_ro.cpp
@@ -35,12 +35,12 @@
 #include <thread>  // NOLINT
 
 #include "rocshmem/rocshmem.hpp"
-#include "../atomic_return.hpp"
-#include "../backend_type.hpp"
-#include "../context_incl.hpp"
+#include "atomic_return.hpp"
+#include "backend_type.hpp"
+#include "context_incl.hpp"
 #include "mpi_transport.hpp"
 #include "ro_net_team.hpp"
-#include "../util.hpp"
+#include "util.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/backend_ro.hpp b/src/reverse_offload/backend_ro.hpp
index 80b5993740..b2247367a6 100644
--- a/src/reverse_offload/backend_ro.hpp
+++ b/src/reverse_offload/backend_ro.hpp
@@ -28,10 +28,10 @@
 #include <memory>
 #include <vector>
 
-#include "../backend_bc.hpp"
-#include "../containers/free_list_impl.hpp"
-#include "../hdp_proxy.hpp"
-#include "../memory/hip_allocator.hpp"
+#include "backend_bc.hpp"
+#include "containers/free_list_impl.hpp"
+#include "hdp_proxy.hpp"
+#include "memory/hip_allocator.hpp"
 #include "backend_proxy.hpp"
 #include "block_handle.hpp"
 #include "context_proxy.hpp"
diff --git a/src/reverse_offload/block_handle.hpp b/src/reverse_offload/block_handle.hpp
index 06ba09b8ed..0c80567514 100644
--- a/src/reverse_offload/block_handle.hpp
+++ b/src/reverse_offload/block_handle.hpp
@@ -25,9 +25,9 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_BLOCK_HANDLE_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_BLOCK_HANDLE_HPP_
 
-#include "../containers/atomic_wf_queue_impl.hpp"
-#include "../hdp_policy.hpp"
-#include "../ipc_policy.hpp"
+#include "containers/atomic_wf_queue_impl.hpp"
+#include "hdp_policy.hpp"
+#include "ipc_policy.hpp"
 #include "profiler.hpp"
 #include "queue.hpp"
 
diff --git a/src/reverse_offload/context_proxy.hpp b/src/reverse_offload/context_proxy.hpp
index 91bbbdd3bf..3394c39847 100644
--- a/src/reverse_offload/context_proxy.hpp
+++ b/src/reverse_offload/context_proxy.hpp
@@ -26,8 +26,8 @@
 #define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_
 
 #include "rocshmem/rocshmem.hpp"
-#include "../device_proxy.hpp"
-#include "../memory/hip_allocator.hpp"
+#include "device_proxy.hpp"
+#include "memory/hip_allocator.hpp"
 #include "context_ro_device.hpp"
 
 namespace rocshmem {
diff --git a/src/reverse_offload/context_ro_device.cpp b/src/reverse_offload/context_ro_device.cpp
index 4e1b789a13..cd589b2492 100644
--- a/src/reverse_offload/context_ro_device.cpp
+++ b/src/reverse_offload/context_ro_device.cpp
@@ -34,12 +34,12 @@
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "rocshmem/rocshmem.hpp"
-#include "../backend_type.hpp"
-#include "../hdp_policy.hpp"
+#include "backend_type.hpp"
+#include "hdp_policy.hpp"
 #include "backend_proxy.hpp"
 #include "backend_ro.hpp"
 #include "ro_net_team.hpp"
-#include "../sync/abql_block_mutex.hpp"
+#include "sync/abql_block_mutex.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/context_ro_device.hpp b/src/reverse_offload/context_ro_device.hpp
index 1b06f8c0c7..3fc7b31ae9 100644
--- a/src/reverse_offload/context_ro_device.hpp
+++ b/src/reverse_offload/context_ro_device.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_DEVICE_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_DEVICE_HPP_
 
-#include "../context.hpp"
+#include "context.hpp"
 #include "block_handle.hpp"
 #include "commands_types.hpp"
 #include "queue.hpp"
diff --git a/src/reverse_offload/context_ro_host.cpp b/src/reverse_offload/context_ro_host.cpp
index 2d24c73085..8692b8dfe5 100644
--- a/src/reverse_offload/context_ro_host.cpp
+++ b/src/reverse_offload/context_ro_host.cpp
@@ -27,9 +27,9 @@
 #include <mpi.h>
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../backend_type.hpp"
-#include "../context_incl.hpp"
-#include "../host/host.hpp"
+#include "backend_type.hpp"
+#include "context_incl.hpp"
+#include "host/host.hpp"
 #include "backend_ro.hpp"
 
 namespace rocshmem {
diff --git a/src/reverse_offload/context_ro_host.hpp b/src/reverse_offload/context_ro_host.hpp
index 13d06f94a5..4e0719a84b 100644
--- a/src/reverse_offload/context_ro_host.hpp
+++ b/src/reverse_offload/context_ro_host.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_HOST_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_RO_HOST_HPP_
 
-#include "../context.hpp"
+#include "context.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/context_ro_tmpl_host.hpp b/src/reverse_offload/context_ro_tmpl_host.hpp
index 02dc788263..eb2b0efd51 100644
--- a/src/reverse_offload/context_ro_tmpl_host.hpp
+++ b/src/reverse_offload/context_ro_tmpl_host.hpp
@@ -26,7 +26,7 @@
 #define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_HOST_TEMPLATES_HPP_
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../host/host_templates.hpp"
+#include "host/host_templates.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/mpi_transport.cpp b/src/reverse_offload/mpi_transport.cpp
index 93f722fe3c..27ce188e3c 100644
--- a/src/reverse_offload/mpi_transport.cpp
+++ b/src/reverse_offload/mpi_transport.cpp
@@ -30,10 +30,10 @@
 #include <unistd.h>
 #include <cassert>
 
-#include "../host/host.hpp"
+#include "host/host.hpp"
 #include "backend_ro.hpp"
 #include "ro_net_team.hpp"
-#include "../util.hpp"
+#include "util.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/profiler.hpp b/src/reverse_offload/profiler.hpp
index cd043e1eba..c6b0e3ad9d 100644
--- a/src/reverse_offload/profiler.hpp
+++ b/src/reverse_offload/profiler.hpp
@@ -29,9 +29,9 @@
 #include <cassert>
 
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../device_proxy.hpp"
-#include "../memory/../memory/hip_allocator.hpp"
-#include "../stats.hpp"
+#include "device_proxy.hpp"
+#include "memory/hip_allocator.hpp"
+#include "stats.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/queue.hpp b/src/reverse_offload/queue.hpp
index 89807cc7d5..16d8eb1b76 100644
--- a/src/reverse_offload/queue.hpp
+++ b/src/reverse_offload/queue.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_HPP_
 
-#include "../hdp_proxy.hpp"
+#include "hdp_proxy.hpp"
 #include "queue_proxy.hpp"
 #include "queue_desc_proxy.hpp"
 
diff --git a/src/reverse_offload/queue_desc_proxy.hpp b/src/reverse_offload/queue_desc_proxy.hpp
index a5ca2a207a..c9d90aa721 100644
--- a/src/reverse_offload/queue_desc_proxy.hpp
+++ b/src/reverse_offload/queue_desc_proxy.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_DESC_PROXY_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_DESC_PROXY_HPP_
 
-#include "../device_proxy.hpp"
+#include "device_proxy.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/queue_proxy.hpp b/src/reverse_offload/queue_proxy.hpp
index bfa60e690b..d6a2b9be9e 100644
--- a/src/reverse_offload/queue_proxy.hpp
+++ b/src/reverse_offload/queue_proxy.hpp
@@ -27,13 +27,13 @@
 
 #include <mpi.h>
 
-#include "../atomic_return.hpp"
-#include "../device_proxy.hpp"
-#include "../hdp_policy.hpp"
-#include "../ipc_policy.hpp"
+#include "atomic_return.hpp"
+#include "device_proxy.hpp"
+#include "hdp_policy.hpp"
+#include "ipc_policy.hpp"
 #include "commands_types.hpp"
 #include "profiler.hpp"
-#include "../sync/abql_block_mutex.hpp"
+#include "sync/abql_block_mutex.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/ro_net_team.cpp b/src/reverse_offload/ro_net_team.cpp
index cae0f5d92d..f04872826a 100644
--- a/src/reverse_offload/ro_net_team.cpp
+++ b/src/reverse_offload/ro_net_team.cpp
@@ -24,7 +24,7 @@
 
 #include "ro_net_team.hpp"
 
-#include "../backend_type.hpp"
+#include "backend_type.hpp"
 #include "backend_ro.hpp"
 
 namespace rocshmem {
diff --git a/src/reverse_offload/ro_net_team.hpp b/src/reverse_offload/ro_net_team.hpp
index 5ffcf8e5f3..8af113b701 100644
--- a/src/reverse_offload/ro_net_team.hpp
+++ b/src/reverse_offload/ro_net_team.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_TEAM_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_TEAM_HPP_
 
-#include "../team.hpp"
+#include "team.hpp"
 
 #define MAX_ATA_BUFF_SIZE (1024 * 1024 * 128)
 
diff --git a/src/reverse_offload/ro_team_proxy.hpp b/src/reverse_offload/ro_team_proxy.hpp
index 06bb43d663..28e620f3e5 100644
--- a/src/reverse_offload/ro_team_proxy.hpp
+++ b/src/reverse_offload/ro_team_proxy.hpp
@@ -27,7 +27,7 @@
 
 #include <mpi.h>
 
-#include "../device_proxy.hpp"
+#include "device_proxy.hpp"
 #include "ro_net_team.hpp"
 #include "team_info_proxy.hpp"
 
diff --git a/src/reverse_offload/team_info_proxy.hpp b/src/reverse_offload/team_info_proxy.hpp
index 1ad25c0679..e73ca98d1b 100644
--- a/src/reverse_offload/team_info_proxy.hpp
+++ b/src/reverse_offload/team_info_proxy.hpp
@@ -25,8 +25,8 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_TEAM_INFO_PROXY_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_TEAM_INFO_PROXY_HPP_
 
-#include "../device_proxy.hpp"
-#include "../team.hpp"
+#include "device_proxy.hpp"
+#include "team.hpp"
 
 namespace rocshmem {
 
diff --git a/src/reverse_offload/window_proxy.hpp b/src/reverse_offload/window_proxy.hpp
index 3883628ebb..1492e6d3ab 100644
--- a/src/reverse_offload/window_proxy.hpp
+++ b/src/reverse_offload/window_proxy.hpp
@@ -25,8 +25,8 @@
 #ifndef LIBRARY_SRC_REVERSE_OFFLOAD_WINDOW_PROXY_HPP_
 #define LIBRARY_SRC_REVERSE_OFFLOAD_WINDOW_PROXY_HPP_
 
-#include "../device_proxy.hpp"
-#include "../memory/window_info.hpp"
+#include "device_proxy.hpp"
+#include "memory/window_info.hpp"
 #include "mpi_transport.hpp"
 
 namespace rocshmem {
diff --git a/src/rocshmem.cpp b/src/rocshmem.cpp
index a5d69cf179..d1a8e38b2d 100644
--- a/src/rocshmem.cpp
+++ b/src/rocshmem.cpp
@@ -35,12 +35,17 @@
 
 #include "backend_bc.hpp"
 #include "context_incl.hpp"
-#ifdef USE_RO
+#if defined(USE_RO)
 #include "reverse_offload/backend_ro.hpp"
 #include "reverse_offload/context_ro_tmpl_host.hpp"
-#else
+#elif defined(USE_IPC)
 #include "ipc/backend_ipc.hpp"
 #include "ipc/context_ipc_tmpl_host.hpp"
+#elif defined(USE_GDA)
+#include "gda/backend_gda.hpp"
+#include "gda/context_gda_tmpl_host.hpp"
+#else
+#error "Select one backend among USE_RO, USE_IPC, USE_GDA"
 #endif
 #include "mpi_instance.hpp"
 #include "team.hpp"
@@ -89,12 +94,15 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
 
   mpi_instance = new MPIInstance(comm);
 
-#ifdef USE_RO
+#if defined(USE_RO)
   CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
   backend = new (backend) ROBackend(comm);
-#else
+#elif defined(USE_IPC)
   CHECK_HIP(hipHostMalloc(&backend, sizeof(IPCBackend)));
   backend = new (backend) IPCBackend(comm);
+#elif defined(USE_GDA)
+  CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend)));
+  backend = new (backend) GDABackend(comm);
 #endif
 
   if (!backend) {
@@ -166,12 +174,15 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
 
   rocm_init();
 
-#ifdef USE_RO
+#if defined(USE_RO)
   printf("RO Backend requires MPI library to be initialized, even when using uniqueId initializations!\n");
   abort();
-#else
+#elif defined(USE_IPC)
   CHECK_HIP(hipHostMalloc(&backend, sizeof(IPCBackend)));
   backend = new (backend) IPCBackend(bootstrap);
+#elif defined(USE_GDA)
+  CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend)));
+  backend = new (backend) GDABackend(bootstrap);
 #endif
 
   if (!backend) {
diff --git a/src/rocshmem_gpu.cpp b/src/rocshmem_gpu.cpp
index 2e3fc0821d..8abe25e7ec 100644
--- a/src/rocshmem_gpu.cpp
+++ b/src/rocshmem_gpu.cpp
@@ -51,13 +51,17 @@
 #include "templates.hpp"
 #include "util.hpp"
 
-#ifdef USE_RO
+#if defined(USE_RO)
 #include "reverse_offload/context_ro_tmpl_device.hpp"
-#else
-#ifdef ENABLE_IPC_BITCODE
-  #include "ipc/backend_ipc.hpp"
-#endif
+#elif defined(USE_IPC)
+# if defined(ENABLE_IPC_BITCODE)
+#  include "ipc/backend_ipc.hpp"
+# endif
 #include "ipc/context_ipc_tmpl_device.hpp"
+#elif defined(USE_GDA)
+#include "gda/context_gda_tmpl_device.hpp"
+#else
+#error "Select one backend among USE_RO, USE_IPC, USE_GDA"
 #endif
 
 /******************************************************************************
@@ -70,7 +74,7 @@ __device__  rocshmem_ctx_t __attribute__((visibility("default"))) ROCSHMEM_CTX_D
 
 __constant__ Backend *device_backend_proxy;
 
-#ifdef ENABLE_IPC_BITCODE
+#if defined(ENABLE_IPC_BITCODE)
   typedef IPCContext ContextTy;
 #else
   typedef Context ContextTy;
diff --git a/src/sync/abql_block_mutex.cpp b/src/sync/abql_block_mutex.cpp
index cb23925d44..9a34cbb1bf 100644
--- a/src/sync/abql_block_mutex.cpp
+++ b/src/sync/abql_block_mutex.cpp
@@ -22,9 +22,9 @@
  * IN THE SOFTWARE.
  *****************************************************************************/
 
-#include "../sync/abql_block_mutex.hpp"
+#include "sync/abql_block_mutex.hpp"
 
-#include "../util.hpp"
+#include "util.hpp"
 
 namespace rocshmem {
 
diff --git a/src/sync/abql_block_mutex.hpp b/src/sync/abql_block_mutex.hpp
index 7bf95edc4a..bcd51aba01 100644
--- a/src/sync/abql_block_mutex.hpp
+++ b/src/sync/abql_block_mutex.hpp
@@ -25,7 +25,7 @@
 #ifndef LIBRARY_SRC_SYNC_ABQL_BLOCK_MUTEX_HPP_
 #define LIBRARY_SRC_SYNC_ABQL_BLOCK_MUTEX_HPP_
 
-#include "../device_proxy.hpp"
+#include "device_proxy.hpp"
 
 #include <hip/hip_runtime.h>
 
diff --git a/src/team.cpp b/src/team.cpp
index ab06c22d74..e26420e408 100644
--- a/src/team.cpp
+++ b/src/team.cpp
@@ -46,6 +46,10 @@ IPCTeam* get_internal_ipc_team(rocshmem_team_t team) {
   return reinterpret_cast<IPCTeam*>(team);
 }
 
+GDATeam* get_internal_gda_team(rocshmem_team_t team) {
+  return reinterpret_cast<GDATeam*>(team);
+}
+
 __host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe,
                                           rocshmem_team_t dst_team) {
   if (src_team == ROCSHMEM_TEAM_INVALID ||
diff --git a/src/team.hpp b/src/team.hpp
index bf9bc764ac..343424da39 100644
--- a/src/team.hpp
+++ b/src/team.hpp
@@ -36,6 +36,7 @@ class Backend;
 class Team;
 class ROTeam;
 class IPCTeam;
+class GDATeam;
 
 class TeamInfo {
  public:
@@ -164,6 +165,8 @@ ROTeam* get_internal_ro_team(rocshmem_team_t team);
 
 IPCTeam* get_internal_ipc_team(rocshmem_team_t team);
 
+GDATeam* get_internal_gda_team(rocshmem_team_t team);
+
 __host__ __device__ int team_translate_pe(rocshmem_team_t src_team, int src_pe,
                                           rocshmem_team_t dst_team);
 
diff --git a/src/tools/rocshmem_info.cpp b/src/tools/rocshmem_info.cpp
index 738b5b4f98..73040b6451 100644
--- a/src/tools/rocshmem_info.cpp
+++ b/src/tools/rocshmem_info.cpp
@@ -1,5 +1,5 @@
 
-#include "../util.hpp"
+#include "util.hpp"
 
 #include <stdio.h>
 #include <fstream>
diff --git a/src/util.hpp b/src/util.hpp
index c3d059edf9..cdd70932fa 100644
--- a/src/util.hpp
+++ b/src/util.hpp
@@ -31,24 +31,67 @@
 
 #include <cstdio>
 
-#include "assembly.hpp"
 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "constants.hpp"
+#include "assembly.hpp"
 
 namespace rocshmem {
 
-#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
-#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+#define LIKELY(X)   __builtin_expect(!!(X), 1)  // branch hint: X is expected to be true; !! normalizes any scalar to 0/1
+#define UNLIKELY(X) __builtin_expect(!!(X), 0)  // branch hint: X is expected to be false; !! normalizes any scalar to 0/1
 
-#define CHECK_HIP(cmd)                                                        \
-  {                                                                           \
-    hipError_t error = cmd;                                                   \
-    if (error != hipSuccess) {                                                \
-      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \
-              error, __FILE__, __LINE__);                                     \
-      exit(EXIT_FAILURE);                                                     \
-    }                                                                         \
-  }
+/**
+ * @name CHECK_NNULL
+ * @brief Checks if value is NOT null. If it is null, prints strerror(errno) and aborts the program.
+ *
+ * @param[in] value    Value to check
+ * @param[in] fn_str   String describing checked function
+ *
+ */
+#define CHECK_NNULL(value, fn_str) do {                \
+  if (UNLIKELY(nullptr == (value))) {                  \
+    fprintf(stderr,                                    \
+      "Error: %s: %s (%d) at RocSHMEM::%s:%d\n",       \
+      fn_str, strerror(errno), errno,                  \
+      __FILE__, __LINE__);                             \
+    abort();                                           \
+  }                                                    \
+} while(0)
+
+/**
+ * @name CHECK_ZERO
+ * @brief Checks if value is zero. If it is nonzero, prints strerror(errno) and aborts the program.
+ *
+ * @param[in] value    Value to check
+ * @param[in] fn_str   String describing checked function
+ *
+ */
+#define CHECK_ZERO(value, fn_str) do {                 \
+  if (UNLIKELY(0 != (value))) {                        \
+    fprintf(stderr,                                    \
+      "Error: %s: %s (%d) at RocSHMEM::%s:%d\n",       \
+      fn_str, strerror(errno), errno,             \
+      __FILE__, __LINE__);                             \
+    abort();                                           \
+  }                                                    \
+} while(0)
+
+/**
+ * @name CHECK_HIP
+ * @brief Runs a HIP call; on any result other than hipSuccess, prints the error and aborts.
+ *
+ * @param[in] instr    HIP expression to evaluate (once) and check
+ * @note The stringified call is passed as a "%s" argument so a '%' in it is never parsed as a format.
+ */
+#define CHECK_HIP(instr) do {                               \
+  hipError_t error = (instr);                               \
+  if (error != hipSuccess) {                                \
+    fprintf(stderr,                                         \
+      "Error: %s: %s (%d) at RocSHMEM::%s:%d\n", #instr,    \
+      hipGetErrorString(error), error, __FILE__, __LINE__); \
+    abort();                                                \
+  }                                                         \
+} while(0)
 
 #ifdef DEBUG
 #define DPRINTF(...)     \
@@ -132,7 +175,7 @@ __device__ __forceinline__ int get_flat_grid_id() {
  * Returns the flattened thread index of the calling thread within the grid.
  */
 __device__ __forceinline__ int get_flat_id() {
-    return get_flat_grid_id() * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + get_flat_block_id();
+  return get_flat_grid_id() * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + get_flat_block_id();
 }
 
 /*
@@ -142,6 +185,51 @@ __device__ __forceinline__ bool is_thread_zero_in_wave() {
   return (get_flat_block_id() % WF_SIZE) == 0;
 }
 
+__device__ __forceinline__ uint64_t get_active_lane_mask() {
+  return __ballot(true);
+}
+
+__device__ __forceinline__ unsigned int get_active_lane_count(uint64_t active_lane_mask) {
+  return __popcll(active_lane_mask);
+}
+
+__device__ __forceinline__ unsigned int get_active_lane_count() {
+  return get_active_lane_count(get_active_lane_mask());
+}
+
+__device__ __forceinline__ unsigned int get_active_lane_num(uint64_t active_lane_mask) {
+  return __popcll(active_lane_mask & __lanemask_lt());
+}
+
+__device__ __forceinline__ unsigned int get_active_lane_num() {
+  return get_active_lane_num(get_active_lane_mask());
+}
+
+__device__ __forceinline__ int get_first_active_lane_id(uint64_t active_lane_mask) {
+  return __ffsll((unsigned long long int)active_lane_mask) - 1;
+}
+
+__device__ __forceinline__ int get_first_active_lane_id() {
+  return get_first_active_lane_id(get_active_lane_mask());
+}
+
+__device__ __forceinline__ bool is_first_active_lane(uint64_t active_lane_mask) {
+  return get_active_lane_num(active_lane_mask) == 0;
+}
+
+__device__ __forceinline__ bool is_first_active_lane() {
+  return is_first_active_lane(get_active_lane_mask());
+}
+
+__device__ __forceinline__ bool is_last_active_lane(uint64_t active_lane_mask) {
+  return get_active_lane_num(active_lane_mask) == get_active_lane_count(active_lane_mask) - 1;
+}
+
+__device__ __forceinline__ bool is_last_active_lane() {
+  return is_last_active_lane(get_active_lane_mask());
+}
+
+
 extern __constant__ int* print_lock;
 
 template <typename... Args>
@@ -168,6 +256,9 @@ __device__ void gpu_dprintf(const char* fmt, const Args&... args) {
   }
 }
 
+#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+
 __device__ __forceinline__ void memcpy(void* dst, void* src, size_t size) {
   uint8_t* dst_bytes{static_cast<uint8_t*>(dst)};
   uint8_t* src_bytes{static_cast<uint8_t*>(src)};
@@ -264,8 +355,7 @@ __device__ __forceinline__ void memcpy_wave(void* dst, void* src, size_t size) {
 
 int rocm_init();
 
-void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr,
-                                    int gpu_id);
+void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr, int gpu_id);
 
 class rocshmem_env_config {
 public:
diff --git a/tests/functional_tests/amo_standard_tester.cpp b/tests/functional_tests/amo_standard_tester.cpp
index 028cb4447e..c05f1d780e 100644
--- a/tests/functional_tests/amo_standard_tester.cpp
+++ b/tests/functional_tests/amo_standard_tester.cpp
@@ -23,6 +23,7 @@
  *****************************************************************************/
 
 #include "amo_standard_tester.hpp"
+#include "tester.hpp"
 
 #include <iostream>
 #include <rocshmem/rocshmem.hpp>
@@ -101,10 +102,7 @@ void AMOStandardTester<T>::verifyResults(size_t size) {
         break;
     }
 
-    int fetch_op = (_type == AMO_FAddTestType || _type == AMO_FIncTestType ||
-                    _type == AMO_FCswapTestType)
-                       ? 1
-                       : 0;
+    int fetch_op = (_type == AMO_FAddTestType || _type == AMO_FIncTestType || _type == AMO_FCswapTestType) ? 1: 0;
 
     if (fetch_op == 1) {
       ret = *std::max_element(_ret_val, _ret_val + args.num_wgs);
diff --git a/tests/functional_tests/tester.hpp b/tests/functional_tests/tester.hpp
index ddb65c6508..ea2bb1a728 100644
--- a/tests/functional_tests/tester.hpp
+++ b/tests/functional_tests/tester.hpp
@@ -187,14 +187,15 @@ class Tester {
   hipEvent_t stop_event;
 };
 
-#define CHECK_HIP(cmd)                                                        \
-  {                                                                           \
-    hipError_t error = cmd;                                                   \
-    if (error != hipSuccess) {                                                \
-      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \
-              error, __FILE__, __LINE__);                                     \
-      exit(EXIT_FAILURE);                                                     \
-    }                                                                         \
-  }
+// TODO: remove altogether? There is a small difference in print format. Call text is passed via "%s", not pasted into the format.
+#undef CHECK_HIP
+#define CHECK_HIP(instr) do {                                               \
+  hipError_t error = (instr);                                               \
+  if (error != hipSuccess) {                                                \
+    fprintf(stderr, "error: %s: %s (%d) at %s:%d\n", #instr,                \
+      hipGetErrorString(error), error, __FILE__, __LINE__);                 \
+    abort();                                                                \
+  }                                                                         \
+} while(0)
 
 #endif /* _TESTER_HPP */