From 658bf2a3b5e0f1b7ec7fa837529a947502b69bd3 Mon Sep 17 00:00:00 2001
From: Yiltan <ytemucin@amd.com>
Date: Mon, 24 Mar 2025 09:04:52 -0400
Subject: [PATCH] Removed GPU_IB (#59)

---
 CMakeLists.txt                                |   20 -
 README.md                                     |   10 +-
 cmake/Modules/FindIbverbs.cmake               |   62 -
 cmake/rocshmem_config.h.in                    |    2 -
 scripts/build_configs/dc_multi                |   27 -
 scripts/build_configs/dc_multi_debug          |   27 -
 scripts/build_configs/dc_multi_ipc            |   27 -
 scripts/build_configs/dc_multi_profile        |   27 -
 scripts/build_configs/dc_single               |   27 -
 scripts/build_configs/ipc_single              |    2 -
 scripts/build_configs/ipc_tests_only          |    2 -
 scripts/build_configs/rc_multi                |   27 -
 scripts/build_configs/rc_multi_debug          |   27 -
 scripts/build_configs/rc_multi_wf_coal        |   27 -
 scripts/build_configs/rc_single               |   28 -
 scripts/build_configs/rc_single_debug         |   27 -
 scripts/build_configs/rc_single_managed       |   28 -
 scripts/build_configs/rc_single_managed_debug |   28 -
 scripts/build_configs/rc_single_profile       |   27 -
 scripts/build_configs/rc_single_single_node   |   30 -
 .../build_configs/rc_single_single_node_debug |   30 -
 scripts/build_configs/ro_ipc                  |    2 -
 scripts/build_configs/ro_net                  |    2 -
 scripts/build_configs/ro_net_debug            |    2 -
 scripts/functional_tests/driver.py            |    5 +-
 src/CMakeLists.txt                            |    4 +-
 src/backend_bc.cpp                            |   12 +-
 src/backend_bc.hpp                            |    2 +-
 src/backend_type.hpp                          |   30 +-
 src/context_incl.hpp                          |    5 +-
 src/context_tmpl_device.hpp                   |    4 +-
 src/context_tmpl_host.hpp                     |    4 +-
 src/gpu_ib/CMakeLists.txt                     |   45 -
 src/gpu_ib/backend_ib.cpp                     |  493 --------
 src/gpu_ib/backend_ib.hpp                     |  351 ------
 src/gpu_ib/connect_tests.cpp                  |   54 -
 src/gpu_ib/connection.cpp                     |  431 -------
 src/gpu_ib/connection.hpp                     |  259 ----
 src/gpu_ib/connection_policy.cpp              |   81 --
 src/gpu_ib/connection_policy.hpp              |  130 --
 src/gpu_ib/context_ib_device.cpp              |  371 ------
 src/gpu_ib/context_ib_device.hpp              |  304 -----
 src/gpu_ib/context_ib_device_coll.cpp         |  122 --
 src/gpu_ib/context_ib_host.cpp                |   85 --
 src/gpu_ib/context_ib_host.hpp                |  149 ---
 src/gpu_ib/context_ib_tmpl_device.hpp         | 1097 -----------------
 src/gpu_ib/context_ib_tmpl_host.hpp           |  173 ---
 src/gpu_ib/debug.cpp                          |   39 -
 src/gpu_ib/dynamic_connection.cpp             |  381 ------
 src/gpu_ib/dynamic_connection.hpp             |  122 --
 src/gpu_ib/endian.cpp                         |   79 --
 src/gpu_ib/endian.hpp                         |   53 -
 src/gpu_ib/gpu_ib_team.cpp                    |   56 -
 src/gpu_ib/gpu_ib_team.hpp                    |   50 -
 src/gpu_ib/infiniband_structs.hpp             |   49 -
 src/gpu_ib/memory_builder_policy.hpp          |   78 --
 src/gpu_ib/network_policy.cpp                 |  500 --------
 src/gpu_ib/network_policy.hpp                 |  357 ------
 src/gpu_ib/qe_dumper.cpp                      |   79 --
 src/gpu_ib/qe_dumper.hpp                      |   66 -
 src/gpu_ib/queue_pair.cpp                     |  437 -------
 src/gpu_ib/queue_pair.hpp                     |  431 -------
 src/gpu_ib/reliable_connection.cpp            |  201 ---
 src/gpu_ib/reliable_connection.hpp            |   84 --
 src/gpu_ib/segment_builder.cpp                |  138 ---
 src/gpu_ib/segment_builder.hpp                |   64 -
 src/gpu_ib/thread_policy.cpp                  |  358 ------
 src/gpu_ib/thread_policy.hpp                  |  176 ---
 src/ipc/backend_ipc.cpp                       |    4 +-
 src/ipc/context_ipc_device.hpp                |    2 +-
 src/memory/symmetric_heap.hpp                 |    2 +-
 src/reverse_offload/context_ro_tmpl_host.hpp  |   14 +-
 src/rocshmem.cpp                              |   10 +-
 src/rocshmem_gpu.cpp                          |    4 +-
 src/team.cpp                                  |    4 -
 src/team.hpp                                  |    5 +-
 76 files changed, 33 insertions(+), 8539 deletions(-)
 delete mode 100644 cmake/Modules/FindIbverbs.cmake
 delete mode 100755 scripts/build_configs/dc_multi
 delete mode 100755 scripts/build_configs/dc_multi_debug
 delete mode 100755 scripts/build_configs/dc_multi_ipc
 delete mode 100755 scripts/build_configs/dc_multi_profile
 delete mode 100755 scripts/build_configs/dc_single
 delete mode 100755 scripts/build_configs/rc_multi
 delete mode 100755 scripts/build_configs/rc_multi_debug
 delete mode 100755 scripts/build_configs/rc_multi_wf_coal
 delete mode 100755 scripts/build_configs/rc_single
 delete mode 100755 scripts/build_configs/rc_single_debug
 delete mode 100755 scripts/build_configs/rc_single_managed
 delete mode 100755 scripts/build_configs/rc_single_managed_debug
 delete mode 100755 scripts/build_configs/rc_single_profile
 delete mode 100755 scripts/build_configs/rc_single_single_node
 delete mode 100755 scripts/build_configs/rc_single_single_node_debug
 delete mode 100644 src/gpu_ib/CMakeLists.txt
 delete mode 100644 src/gpu_ib/backend_ib.cpp
 delete mode 100644 src/gpu_ib/backend_ib.hpp
 delete mode 100644 src/gpu_ib/connect_tests.cpp
 delete mode 100644 src/gpu_ib/connection.cpp
 delete mode 100644 src/gpu_ib/connection.hpp
 delete mode 100644 src/gpu_ib/connection_policy.cpp
 delete mode 100644 src/gpu_ib/connection_policy.hpp
 delete mode 100644 src/gpu_ib/context_ib_device.cpp
 delete mode 100644 src/gpu_ib/context_ib_device.hpp
 delete mode 100644 src/gpu_ib/context_ib_device_coll.cpp
 delete mode 100644 src/gpu_ib/context_ib_host.cpp
 delete mode 100644 src/gpu_ib/context_ib_host.hpp
 delete mode 100644 src/gpu_ib/context_ib_tmpl_device.hpp
 delete mode 100644 src/gpu_ib/context_ib_tmpl_host.hpp
 delete mode 100644 src/gpu_ib/debug.cpp
 delete mode 100644 src/gpu_ib/dynamic_connection.cpp
 delete mode 100644 src/gpu_ib/dynamic_connection.hpp
 delete mode 100644 src/gpu_ib/endian.cpp
 delete mode 100644 src/gpu_ib/endian.hpp
 delete mode 100644 src/gpu_ib/gpu_ib_team.cpp
 delete mode 100644 src/gpu_ib/gpu_ib_team.hpp
 delete mode 100644 src/gpu_ib/infiniband_structs.hpp
 delete mode 100644 src/gpu_ib/memory_builder_policy.hpp
 delete mode 100644 src/gpu_ib/network_policy.cpp
 delete mode 100644 src/gpu_ib/network_policy.hpp
 delete mode 100644 src/gpu_ib/qe_dumper.cpp
 delete mode 100644 src/gpu_ib/qe_dumper.hpp
 delete mode 100644 src/gpu_ib/queue_pair.cpp
 delete mode 100644 src/gpu_ib/queue_pair.hpp
 delete mode 100644 src/gpu_ib/reliable_connection.cpp
 delete mode 100644 src/gpu_ib/reliable_connection.hpp
 delete mode 100644 src/gpu_ib/segment_builder.cpp
 delete mode 100644 src/gpu_ib/segment_builder.hpp
 delete mode 100644 src/gpu_ib/thread_policy.cpp
 delete mode 100644 src/gpu_ib/thread_policy.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 599aeada61..9d29220c5f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,9 +39,7 @@ endif()
 ###############################################################################
 option(DEBUG "Enable debug trace" OFF)
 option(PROFILE "Enable statistics and timing support" OFF)
-option(USE_GPU_IB "Enable GPU_IB conduit." ON)
 option(USE_RO "Enable RO conduit." ON)
-option(USE_DC "Enable IB dynamically connected transport (DC)" OFF)
 option(USE_IPC "Enable IPC support (using HIP)" OFF)
 option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
 option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
@@ -158,10 +156,6 @@ if (NOT BUILD_TESTS_ONLY)
   find_package(hip REQUIRED)
   find_package(hsa-runtime64 REQUIRED)
 
-  if (USE_GPU_IB)
-    find_package(Ibverbs REQUIRED)
-  endif()
-
   set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
   set(THREADS_PREFER_PTHREAD_FLAG TRUE)
   find_package(Threads REQUIRED)
@@ -188,20 +182,6 @@ if (NOT BUILD_TESTS_ONLY)
       hip::host
       hsa-runtime64::hsa-runtime64
   )
-
-  if (USE_GPU_IB)
-    target_include_directories(
-      ${PROJECT_NAME}
-      PUBLIC
-        ${IBVERBS_INCLUDE_DIRS}
-    )
-
-    target_link_libraries(
-      ${PROJECT_NAME}
-      PUBLIC
-        ${IBVERBS_LIBRARIES}
-    )
-  endif()
 endif()
 
 ###############################################################################
diff --git a/README.md b/README.md
index ed8bff6821..cc21118ede 100644
--- a/README.md
+++ b/README.md
@@ -7,11 +7,11 @@ code complexity and enables more fine-grained communication/computation
 overlap than traditional host-driven networking.
 rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories.
 
-There are currently three backends for rocSHMEM;
-IPC, Reverse Offload (RO), and GPU-IB.
+There are currently two backends for rocSHMEM;
+IPC and Reverse Offload (RO).
 The backends primarily differ in their implementations of intra-kernel networking.
 Currently, only the IPC backend is supported.
-The RO and GPU-IB backends are provided as-is with
+The RO backend is provided as-is with
 no guarantees of support from AMD or AMD Research.
 
 The IPC backend implements communication primitives using load/store operations issued from the GPU.
@@ -21,10 +21,6 @@ to the host-side runtime, which calls into a traditional MPI or OpenSHMEM
 implementation. This forwarding of requests is transparent to the
 programmer, who only sees the GPU-side interface.
 
-The GPU InfiniBand (GPU-IB) backend implements a lightweight InfiniBand verbs interface
-on the GPU. The GPU itself is responsible for building commands and ringing
-the doorbell on the NIC to send network commands.
-
 ## Requirements
 
 rocSHMEM base requirements:
diff --git a/cmake/Modules/FindIbverbs.cmake b/cmake/Modules/FindIbverbs.cmake
deleted file mode 100644
index 46c17f31a5..0000000000
--- a/cmake/Modules/FindIbverbs.cmake
+++ /dev/null
@@ -1,62 +0,0 @@
-###############################################################################
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-###############################################################################
-
-find_package(PkgConfig REQUIRED QUIET)
-pkg_check_modules(PC_IBVERBS QUIET libibverbs)
-
-find_path(
-    IBVERBS_INCLUDE_DIR infiniband/verbs.h
-    HINTS ${PC_IBVERBS_INCLUDEDIR} ${PC_IBVERBS_INCLUDE_DIRS}
-    PATH_SUFFIXES include
-)
-
-find_library(
-    IBVERBS_LIBRARY
-    NAMES ibverbs libibverbs
-    HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS}
-    PATH_SUFFIXES lib lib64
-)
-
-find_library(
-    MLX5_LIBRARY
-    NAMES mlx5 libmlx5
-    HINTS ${PC_IBVERBS_LIBDIR} ${PC_IBVERBS_LIBRARY_DIRS}
-    PATH_SUFFIXES lib lib64
-)
-
-set(
-    IBVERBS_LIBRARIES
-    ${IBVERBS_LIBRARY} ${MLX5_LIBRARY}
-    CACHE INTERNAL ""
-)
-
-set(
-    IBVERBS_INCLUDE_DIRS
-    ${IBVERBS_INCLUDE_DIR}
-    CACHE INTERNAL ""
-)
-
-find_package_handle_standard_args(
-    Ibverbs DEFAULT_MSG IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR
-)
-
-mark_as_advanced(IBVERBS_LIBRARY IBVERBS_INCLUDE_DIR)
diff --git a/cmake/rocshmem_config.h.in b/cmake/rocshmem_config.h.in
index 8067651a01..d3964ef722 100644
--- a/cmake/rocshmem_config.h.in
+++ b/cmake/rocshmem_config.h.in
@@ -1,8 +1,6 @@
 #cmakedefine DEBUG
 #cmakedefine PROFILE
-#cmakedefine USE_GPU_IB
 #cmakedefine USE_RO
-#cmakedefine USE_DC
 #cmakedefine USE_IPC
 #cmakedefine USE_THREADS
 #cmakedefine USE_SHARED_CTX
diff --git a/scripts/build_configs/dc_multi b/scripts/build_configs/dc_multi
deleted file mode 100755
index 4e806e7f85..0000000000
--- a/scripts/build_configs/dc_multi
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=ON \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=ON \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/dc_multi_debug b/scripts/build_configs/dc_multi_debug
deleted file mode 100755
index e95ed54a6a..0000000000
--- a/scripts/build_configs/dc_multi_debug
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=ON \
-    -DDEBUG=ON \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=ON \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=ON \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/dc_multi_ipc b/scripts/build_configs/dc_multi_ipc
deleted file mode 100755
index a154f34667..0000000000
--- a/scripts/build_configs/dc_multi_ipc
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=ON \
-    -DUSE_IPC=ON \
-    -DUSE_THREADS=ON \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/dc_multi_profile b/scripts/build_configs/dc_multi_profile
deleted file mode 100755
index 37954c8342..0000000000
--- a/scripts/build_configs/dc_multi_profile
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=ON \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=ON \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=ON \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/dc_single b/scripts/build_configs/dc_single
deleted file mode 100755
index 1cc14f71ef..0000000000
--- a/scripts/build_configs/dc_single
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=ON \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/ipc_single b/scripts/build_configs/ipc_single
index dc6b0c522b..a0ff287adc 100755
--- a/scripts/build_configs/ipc_single
+++ b/scripts/build_configs/ipc_single
@@ -17,9 +17,7 @@ cmake \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
-    -DUSE_GPU_IB=OFF \
     -DUSE_RO=OFF \
-    -DUSE_DC=OFF \
     -DUSE_IPC=ON \
     -DUSE_COHERENT_HEAP=ON \
     -DUSE_THREADS=OFF \
diff --git a/scripts/build_configs/ipc_tests_only b/scripts/build_configs/ipc_tests_only
index d725da97ef..f111cf1349 100755
--- a/scripts/build_configs/ipc_tests_only
+++ b/scripts/build_configs/ipc_tests_only
@@ -22,9 +22,7 @@ cmake \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
-    -DUSE_GPU_IB=OFF \
     -DUSE_RO=OFF \
-    -DUSE_DC=OFF \
     -DUSE_IPC=ON \
     -DUSE_COHERENT_HEAP=ON \
     -DUSE_THREADS=OFF \
diff --git a/scripts/build_configs/rc_multi b/scripts/build_configs/rc_multi
deleted file mode 100755
index f38514dea7..0000000000
--- a/scripts/build_configs/rc_multi
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=ON \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_multi_debug b/scripts/build_configs/rc_multi_debug
deleted file mode 100755
index 8937d426ea..0000000000
--- a/scripts/build_configs/rc_multi_debug
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=ON \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=ON \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_multi_wf_coal b/scripts/build_configs/rc_multi_wf_coal
deleted file mode 100755
index cfa1acaf90..0000000000
--- a/scripts/build_configs/rc_multi_wf_coal
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=ON \
-    -DUSE_WF_COAL=ON \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_single b/scripts/build_configs/rc_single
deleted file mode 100755
index ad378880e1..0000000000
--- a/scripts/build_configs/rc_single
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_COHERENT_HEAP=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_single_debug b/scripts/build_configs/rc_single_debug
deleted file mode 100755
index cf75f64819..0000000000
--- a/scripts/build_configs/rc_single_debug
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=ON \
-    -DDEBUG=ON \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_single_managed b/scripts/build_configs/rc_single_managed
deleted file mode 100755
index 963fe7260b..0000000000
--- a/scripts/build_configs/rc_single_managed
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    -DUSE_MANAGED_HEAP=ON \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_single_managed_debug b/scripts/build_configs/rc_single_managed_debug
deleted file mode 100755
index fd74cf904e..0000000000
--- a/scripts/build_configs/rc_single_managed_debug
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=ON \
-    -DDEBUG=ON \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    -DUSE_MANAGED_HEAP=ON \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_single_profile b/scripts/build_configs/rc_single_profile
deleted file mode 100755
index ff6c0194f6..0000000000
--- a/scripts/build_configs/rc_single_profile
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=ON \
-    -DDEBUG=OFF \
-    -DPROFILE=ON \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_single_single_node b/scripts/build_configs/rc_single_single_node
deleted file mode 100755
index 4f1c56f79a..0000000000
--- a/scripts/build_configs/rc_single_single_node
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=ON \
-    -DUSE_COHERENT_HEAP=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    -DUSE_SINGLE_NODE=ON \
-    -DUSE_HOST_SIDE_HDP_FLUSH=ON\
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/rc_single_single_node_debug b/scripts/build_configs/rc_single_single_node_debug
deleted file mode 100755
index 60292ca594..0000000000
--- a/scripts/build_configs/rc_single_single_node_debug
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-set -e
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/../../
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_INSTALL_PREFIX=$install_path \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DDEBUG=OFF \
-    -DPROFILE=OFF \
-    -DUSE_GPU_IB=ON \
-    -DUSE_DC=OFF \
-    -DUSE_IPC=ON \
-    -DUSE_COHERENT_HEAP=OFF \
-    -DUSE_THREADS=OFF \
-    -DUSE_WF_COAL=OFF \
-    -DUSE_SINGLE_NODE=ON \
-    -DUSE_HOST_SIDE_HDP_FLUSH=ON\
-    $src_path
-cmake --build . --parallel 8
-cmake --install .
diff --git a/scripts/build_configs/ro_ipc b/scripts/build_configs/ro_ipc
index cfe359eb57..cb68279158 100755
--- a/scripts/build_configs/ro_ipc
+++ b/scripts/build_configs/ro_ipc
@@ -17,8 +17,6 @@ cmake \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
-    -DUSE_GPU_IB=OFF \
-    -DUSE_DC=OFF \
     -DUSE_IPC=ON \
     -DUSE_COHERENT_HEAP=ON \
     -DUSE_THREADS=OFF \
diff --git a/scripts/build_configs/ro_net b/scripts/build_configs/ro_net
index 240e7735b5..a8de47c368 100755
--- a/scripts/build_configs/ro_net
+++ b/scripts/build_configs/ro_net
@@ -17,8 +17,6 @@ cmake \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
-    -DUSE_GPU_IB=OFF \
-    -DUSE_DC=OFF \
     -DUSE_IPC=OFF \
     -DUSE_COHERENT_HEAP=ON \
     -DUSE_THREADS=OFF \
diff --git a/scripts/build_configs/ro_net_debug b/scripts/build_configs/ro_net_debug
index 54e0309730..b9f1766629 100755
--- a/scripts/build_configs/ro_net_debug
+++ b/scripts/build_configs/ro_net_debug
@@ -17,8 +17,6 @@ cmake \
     -DCMAKE_VERBOSE_MAKEFILE=OFF \
     -DDEBUG=OFF \
     -DPROFILE=OFF \
-    -DUSE_GPU_IB=OFF \
-    -DUSE_DC=OFF \
     -DUSE_IPC=OFF \
     -DUSE_COHERENT_HEAP=ON \
     -DUSE_THREADS=OFF \
diff --git a/scripts/functional_tests/driver.py b/scripts/functional_tests/driver.py
index 8e2894a126..b6de00ff2a 100755
--- a/scripts/functional_tests/driver.py
+++ b/scripts/functional_tests/driver.py
@@ -145,12 +145,9 @@ def determine_algos_from_library_config_type(config):
     if config['algorithms']:
         return config
 
-    gpu_ib = re.match('^[rd]c_', config['library_build_config_type'])
     thread_single = re.match('.*single.*', config['library_build_config_type'])
 
-    if not gpu_ib:
-        config['algorithms'] = reverse_offload_algorithms
-    elif thread_single:
+    if thread_single:
         config['algorithms'] = single_thread_algorithms
     else:
         config['algorithms'] = multi_thread_algorithms
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3a02f783b5..8b928eef68 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -58,9 +58,7 @@ target_compile_options(
 ###############################################################################
 # ROCSHMEM TARGET FOR BACKENDS
 ###############################################################################
-IF (USE_GPU_IB)
-add_subdirectory(gpu_ib)
-ELSEIF(USE_RO)
+IF (USE_RO)
 add_subdirectory(reverse_offload)
 ELSE()
 add_subdirectory(ipc)
diff --git a/src/backend_bc.cpp b/src/backend_bc.cpp
index 23d085070a..d8c3ecaac1 100644
--- a/src/backend_bc.cpp
+++ b/src/backend_bc.cpp
@@ -25,9 +25,7 @@
 #include "backend_type.hpp"
 #include "context_incl.hpp"
 
-#ifdef USE_GPU_IB
-#include "gpu_ib/backend_ib.hpp"
-#elif defined(USE_RO)
+#ifdef USE_RO
 #include "reverse_offload/backend_ro.hpp"
 #else
 #include "ipc/backend_ipc.hpp"
@@ -203,9 +201,7 @@ void Backend::reset_stats() {
 }
 
 __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
-#ifdef USE_GPU_IB
-  return static_cast<GPUIBBackend*>(this)->create_ctx(option, ctx);
-#elif defined(USE_RO)
+#ifdef USE_RO
   return static_cast<ROBackend*>(this)->create_ctx(option, ctx);
 #else
   return static_cast<IPCBackend*>(this)->create_ctx(option, ctx);
@@ -213,9 +209,7 @@ __device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
 }
 
 __device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) {
-#ifdef USE_GPU_IB
-  static_cast<GPUIBBackend*>(this)->destroy_ctx(ctx);
-#elif defined(USE_RO)
+#ifdef USE_RO
   static_cast<ROBackend*>(this)->destroy_ctx(ctx);
 #else
   static_cast<IPCBackend*>(this)->destroy_ctx(ctx);
diff --git a/src/backend_bc.hpp b/src/backend_bc.hpp
index ccfd81c6f3..6b71293ce2 100644
--- a/src/backend_bc.hpp
+++ b/src/backend_bc.hpp
@@ -278,7 +278,7 @@ class Backend {
    * rely on the normal inheritance mechanism to tailor behavior for
    * derived backend types.
    */
-  BackendType type{BackendType::GPU_IB_BACKEND};
+  BackendType type{BackendType::RO_BACKEND};
 
   /**
    * @brief Dumps derived class statistics.
diff --git a/src/backend_type.hpp b/src/backend_type.hpp
index 020b956f57..6b75d60cb7 100644
--- a/src/backend_type.hpp
+++ b/src/backend_type.hpp
@@ -44,7 +44,7 @@ namespace rocshmem {
  * @note Derived classes which use Backend as a base class must add
  * themselves to this enum class to support static polymorphism.
  */
-enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
+enum class BackendType { RO_BACKEND, IPC_BACKEND };
 
 /**
  * @brief Helper macro for some dispatch calls
@@ -54,10 +54,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
 /**
  * @brief Device static dispatch method call.
  */
-#ifdef USE_GPU_IB
-#define DISPATCH(Func)                     \
-  static_cast<GPUIBContext *>(this)->Func;
-#elif defined(USE_RO)
+#ifdef USE_RO
 #define DISPATCH(Func)                     \
   static_cast<ROContext *>(this)->Func;
 #else
@@ -68,11 +65,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
 /**
  * @brief Device static dispatch method call with a return value.
  */
-#ifdef USE_GPU_IB
-#define DISPATCH_RET(Func)                           \
-  auto ret_val = static_cast<GPUIBContext *>(this)->Func; \
-  return ret_val;
-#elif defined(USE_RO)
+#ifdef USE_RO
 #define DISPATCH_RET(Func)                        \
   auto ret_val = static_cast<ROContext *>(this)->Func; \
   return ret_val;
@@ -85,12 +78,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
 /**
  * @brief Device static dispatch method call with a return type of pointer.
  */
-#ifdef USE_GPU_IB
-#define DISPATCH_RET_PTR(Func)                       \
-  void *ret_val{nullptr};                            \
-  ret_val = static_cast<GPUIBContext *>(this)->Func; \
-  return ret_val;
-#elif defined(USE_RO)
+#ifdef USE_RO
 #define DISPATCH_RET_PTR(Func)                    \
   void *ret_val{nullptr};                         \
   ret_val = static_cast<ROContext *>(this)->Func; \
@@ -109,9 +97,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
  * MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and
  * threading semantics of collectives in OpenSHMEM match those of MPI.
  */
-#ifdef USE_GPU_IB
-#define HOST_DISPATCH(Func) static_cast<GPUIBHostContext *>(this)->Func;
-#elif defined(USE_RO)
+#ifdef USE_RO
 #define HOST_DISPATCH(Func) static_cast<ROHostContext *>(this)->Func;
 #else
 #define HOST_DISPATCH(Func) static_cast<IPCHostContext *>(this)->Func;
@@ -124,11 +110,7 @@ enum class BackendType { RO_BACKEND, GPU_IB_BACKEND, IPC_BACKEND };
  * threading semantics of collectives in OpenSHMEM match those of MPI.
  */
 
-#ifdef USE_GPU_IB
-#define HOST_DISPATCH_RET(Func)                          \
-  auto ret_val = static_cast<GPUIBHostContext *>(this)->Func; \
-  return ret_val;
-#elif defined(USE_RO)
+#ifdef USE_RO
 #define HOST_DISPATCH_RET(Func)                       \
   auto ret_val = static_cast<ROHostContext *>(this)->Func; \
   return ret_val;
diff --git a/src/context_incl.hpp b/src/context_incl.hpp
index f86ca3dfe8..0794397670 100644
--- a/src/context_incl.hpp
+++ b/src/context_incl.hpp
@@ -26,10 +26,7 @@
 #include "context.hpp"
 #include "context_tmpl_device.hpp"
 #include "context_tmpl_host.hpp"
-#ifdef USE_GPU_IB
-#include "gpu_ib/context_ib_device.hpp"
-#include "gpu_ib/context_ib_host.hpp"
-#elif defined (USE_RO)
+#ifdef USE_RO
 #include "reverse_offload/context_ro_device.hpp"
 #include "reverse_offload/context_ro_host.hpp"
 #else
diff --git a/src/context_tmpl_device.hpp b/src/context_tmpl_device.hpp
index a9c1dda624..e57884d602 100644
--- a/src/context_tmpl_device.hpp
+++ b/src/context_tmpl_device.hpp
@@ -25,9 +25,7 @@
 
 #include "rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "backend_type.hpp"
-#ifdef USE_GPU_IB
-#include "gpu_ib/context_ib_device.hpp"
-#elif defined(USE_RO)
+#ifdef USE_RO
 #include "reverse_offload/context_ro_device.hpp"
 #else
 #include "ipc/context_ipc_device.hpp"
diff --git a/src/context_tmpl_host.hpp b/src/context_tmpl_host.hpp
index 8bc913f2fc..e24f34c19d 100644
--- a/src/context_tmpl_host.hpp
+++ b/src/context_tmpl_host.hpp
@@ -25,9 +25,7 @@
 
 #include "rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "backend_type.hpp"
-#ifdef USE_GPU_IB
-#include "gpu_ib/context_ib_host.hpp"
-#elif defined(USE_RO)
+#ifdef USE_RO
 #include "reverse_offload/context_ro_host.hpp"
 #else
 #include "ipc/context_ipc_host.hpp"
diff --git a/src/gpu_ib/CMakeLists.txt b/src/gpu_ib/CMakeLists.txt
deleted file mode 100644
index 34848b1ca8..0000000000
--- a/src/gpu_ib/CMakeLists.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-###############################################################################
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-###############################################################################
-
-###############################################################################
-# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY
-###############################################################################
-target_sources(
-  ${PROJECT_NAME}
-  PRIVATE
-    backend_ib.cpp
-    connection.cpp
-    connection_policy.cpp
-    context_ib_device.cpp
-    context_ib_device_coll.cpp
-    context_ib_host.cpp
-    debug.cpp
-    dynamic_connection.cpp
-    endian.cpp
-    gpu_ib_team.cpp
-    network_policy.cpp
-    qe_dumper.cpp
-    queue_pair.cpp
-    reliable_connection.cpp
-    segment_builder.cpp
-    thread_policy.cpp
-)
diff --git a/src/gpu_ib/backend_ib.cpp b/src/gpu_ib/backend_ib.cpp
deleted file mode 100644
index 7a757b67da..0000000000
--- a/src/gpu_ib/backend_ib.cpp
+++ /dev/null
@@ -1,493 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "backend_ib.hpp"
-
-#include <endian.h>
-#include <mpi.h>
-#include <unistd.h>
-
-#include <cstdio>
-#include <cstdlib>
-#include <mutex>  // NOLINT(build/c++11)
-
-#include "rocshmem/rocshmem.hpp"
-#include "../backend_type.hpp"
-#include "../context_incl.hpp"
-#include "gpu_ib_team.hpp"
-#include "queue_pair.hpp"
-#include "../host/host.hpp"
-
-namespace rocshmem {
-
-#define NET_CHECK(cmd)                                       \
-  {                                                          \
-    if (cmd != MPI_SUCCESS) {                                \
-      fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \
-      abort();                                               \
-    }                                                        \
-  }
-
-extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
-
-rocshmem_team_t get_external_team(GPUIBTeam *team) {
-  return reinterpret_cast<rocshmem_team_t>(team);
-}
-
-int get_ls_non_zero_bit(char *bitmask, int mask_length) {
-  int position = -1;
-
-  for (int bit_i = 0; bit_i < mask_length; bit_i++) {
-    int byte_i = bit_i / CHAR_BIT;
-    if (bitmask[byte_i] & (1 << (bit_i % CHAR_BIT))) {
-      position = bit_i;
-      break;
-    }
-  }
-
-  return position;
-}
-
-GPUIBBackend::GPUIBBackend(MPI_Comm comm) : Backend() {
-  if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) {
-    std::stringstream sstream(maximum_num_contexts_str);
-    sstream >> maximum_num_contexts_;
-  }
-  num_blocks_ = maximum_num_contexts_;
-
-  init_mpi_once(comm);
-
-  type = BackendType::GPU_IB_BACKEND;
-
-  NET_CHECK(MPI_Comm_dup(backend_comm, &gpu_ib_comm_world));
-  NET_CHECK(MPI_Comm_size(gpu_ib_comm_world, &num_pes));
-  NET_CHECK(MPI_Comm_rank(gpu_ib_comm_world, &my_pe));
-
-  /* Initialize the host interface */
-  host_interface =
-      new HostInterface(hdp_proxy_.get(), gpu_ib_comm_world, &heap);
-
-  /*
-   * Construct default host context independently of the
-   * default device context (done in the async thread)
-   * so that host operations can execute regardless of
-   * device operations.
-   */
-  setup_default_host_ctx();
-
-  setup_team_world();
-
-  rocshmem_collective_init();
-
-  teams_init();
-
-  // MPI_Comm_dup(gpu_ib_comm_world, &thread_comm);
-  thread_comm = gpu_ib_comm_world;
-
-  NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
-
-  worker_thread_exit = false;
-
-#ifdef USE_HOST_SIDE_HDP_FLUSH
-  hdp_gpu_cpu_flush_flag_ =
-      static_cast<unsigned int *>(rocshmem_malloc(sizeof(unsigned int)));
-  hdp_policy->set_flush_polling_ptr(hdp_gpu_cpu_flush_flag_);
-  hdp_flush_worker_thread = std::thread(&GPUIBBackend::hdp_flush_poll, this);
-
-  // We can now initialize and set the HDP window in the host interface
-  host_interface->create_hdp_window();
-
-#endif
-
-  // commenting out the async  thread as there is some issues with ROCm
-  // this makes the CPU init blocking
-  // async_thread_ = thread_spawn(this);
-  thread_func_internal(this);
-}
-
-__device__ bool GPUIBBackend::create_ctx(int64_t options,
-                                         rocshmem_ctx_t *ctx) {
-  GPUIBContext *ctx_;
-
-  auto pop_result = ctx_free_list.get()->pop_front();
-  if (!pop_result.success) {
-    return false;
-  }
-  ctx_ = pop_result.value;
-  ctx->ctx_opaque = ctx_;
-  return true;
-}
-
-void GPUIBBackend::ctx_create(int64_t options, void **ctx) {
-  GPUIBHostContext *new_ctx = nullptr;
-
-  new_ctx = new GPUIBHostContext(this, options);
-
-  *ctx = new_ctx;
-}
-
-GPUIBHostContext *get_internal_gpu_ib_ctx(Context *ctx) {
-  return reinterpret_cast<GPUIBHostContext *>(ctx);
-}
-
-void GPUIBBackend::ctx_destroy(Context *ctx) {
-  GPUIBHostContext *gpu_ib_host_ctx = get_internal_gpu_ib_ctx(ctx);
-  delete gpu_ib_host_ctx;
-}
-
-__device__ void GPUIBBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
-  ctx_free_list.get()->push_back(static_cast<GPUIBContext *>(ctx->ctx_opaque));
-}
-
-GPUIBBackend::~GPUIBBackend() {
-  // need to get this back once ROCm is fixed
-  // async_thread_.join();
-
-  worker_thread_exit = true;
-
-#ifdef USE_HOST_SIDE_HDP_FLUSH
-  hdp_flush_worker_thread.join();
-  hdp_policy->set_flush_polling_ptr(nullptr);
-  rocshmem_free(hdp_gpu_cpu_flush_flag_);
-#endif
-
-  /**
-   * Destroy teams infrastructure
-   * and team world
-   */
-  teams_destroy();
-  auto *team_world{team_tracker.get_team_world()};
-  team_world->~Team();
-  CHECK_HIP(hipFree(team_world));
-
-  delete default_host_ctx_;
-
-  NET_CHECK(MPI_Comm_free(&gpu_ib_comm_world));
-
-  CHECK_HIP(hipFree(default_ctx_->device_qp_proxy));
-  CHECK_HIP(hipFree(default_ctx_));
-  default_ctx_ = nullptr;
-
-  delete host_interface;
-  host_interface = nullptr;
-
-  networkImpl.networkHostFinalize();
-
-  CHECK_HIP(hipFree(ctx_array));
-}
-
-__host__ void GPUIBBackend::global_exit(int status) {
-  MPI_Abort(gpu_ib_comm_world, status);
-}
-
-void GPUIBBackend::create_new_team([[maybe_unused]] Team *parent_team,
-                                   TeamInfo *team_info_wrt_parent,
-                                   TeamInfo *team_info_wrt_world, int num_pes,
-                                   int my_pe_in_new_team, MPI_Comm team_comm,
-                                   rocshmem_team_t *new_team) {
-  /**
-   * Read the bit mask and find out a common index into
-   * the pool of available work arrays.
-   */
-  NET_CHECK(MPI_Allreduce(pool_bitmask_, reduced_bitmask_, bitmask_size_,
-                          MPI_CHAR, MPI_BAND, team_comm));
-
-  /* Pick the least significant non-zero bit (logical layout) in the reduced
-   * bitmask */
-  auto max_num_teams{team_tracker.get_max_num_teams()};
-  int common_index = get_ls_non_zero_bit(reduced_bitmask_, max_num_teams);
-  if (common_index < 0) {
-    /* No team available */
-    abort();
-  }
-
-  /* Mark the team as taken (by unsetting the bit in the pool bitmask) */
-  int byte = common_index / CHAR_BIT;
-  pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT));
-
-  /**
-   * Allocate device-side memory for team_world and
-   * construct a GPU_IB team in it
-   */
-  GPUIBTeam *new_team_obj;
-  CHECK_HIP(hipMalloc(&new_team_obj, sizeof(GPUIBTeam)));
-  new (new_team_obj)
-      GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes,
-                my_pe_in_new_team, team_comm, common_index);
-
-  *new_team = get_external_team(new_team_obj);
-}
-
-void GPUIBBackend::team_destroy(rocshmem_team_t team) {
-  GPUIBTeam *team_obj = get_internal_gpu_ib_team(team);
-
-  /* Mark the pool as available */
-  int bit = team_obj->pool_index_;
-  int byte_i = bit / CHAR_BIT;
-  pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT);
-
-  team_obj->~GPUIBTeam();
-  CHECK_HIP(hipFree(team_obj));
-}
-
-void GPUIBBackend::dump_backend_stats() {
-  networkImpl.dump_backend_stats(&globalStats);
-}
-
-void GPUIBBackend::reset_backend_stats() { networkImpl.reset_backend_stats(); }
-
-void GPUIBBackend::initialize_ipc() {
-  ipcImpl.ipcHostInit(my_pe, heap.get_heap_bases(), thread_comm);
-}
-
-void GPUIBBackend::initialize_network() { networkImpl.networkHostSetup(this); }
-
-void GPUIBBackend::setup_default_host_ctx() {
-  default_host_ctx_ = new GPUIBHostContext(this, 0);
-  ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx_;
-}
-
-void GPUIBBackend::setup_ctxs() {
-  /*
-   * Allocate device-side memory for all context and construct an
-   * InfiniBand context in it.
-   */
-  CHECK_HIP(
-      hipMalloc(&ctx_array, sizeof(GPUIBContext) * maximum_num_contexts_));
-  for (int i = 0; i < maximum_num_contexts_; i++) {
-    new (&ctx_array[i]) GPUIBContext(this, false, i);
-    ctx_free_list.get()->push_back(ctx_array + i);
-  }
-}
-
-void GPUIBBackend::setup_default_ctx() {
-  /*
-   * Allocate device-side memory for default context and construct an
-   * InfiniBand context in it.
-   */
-  CHECK_HIP(hipMalloc(&default_ctx_, sizeof(GPUIBContext)));
-  new (default_ctx_) GPUIBContext(this, true, 0);
-
-  /*
-   * Set the ROCSHMEM_CTX_DEFAULT in constant memory.
-   */
-  int *symbol_address;
-  CHECK_HIP(hipGetSymbolAddress(reinterpret_cast<void **>(&symbol_address),
-                                HIP_SYMBOL(ROCSHMEM_CTX_DEFAULT)));
-
-  TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;
-  rocshmem_ctx_t ctx_default_host{default_ctx_, tinfo};
-
-  hipStream_t stream;
-  CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-  CHECK_HIP(hipMemcpyAsync(symbol_address, &ctx_default_host,
-                           sizeof(rocshmem_ctx_t), hipMemcpyDefault, stream));
-  CHECK_HIP(hipStreamSynchronize(stream));
-  CHECK_HIP(hipStreamDestroy(stream));
-}
-
-void GPUIBBackend::setup_team_world() {
-  TeamInfo *team_info_wrt_parent, *team_info_wrt_world;
-
-  /**
-   * Allocate device-side memory for team_world and construct a
-   * GPU_IB team in it.
-   */
-  CHECK_HIP(hipMalloc(&team_info_wrt_parent, sizeof(TeamInfo)));
-  CHECK_HIP(hipMalloc(&team_info_wrt_world, sizeof(TeamInfo)));
-
-  new (team_info_wrt_parent) TeamInfo(nullptr, 0, 1, num_pes);
-  new (team_info_wrt_world) TeamInfo(nullptr, 0, 1, num_pes);
-
-  MPI_Comm team_world_comm;
-  NET_CHECK(MPI_Comm_dup(gpu_ib_comm_world, &team_world_comm));
-
-  GPUIBTeam *team_world{nullptr};
-  CHECK_HIP(hipMalloc(&team_world, sizeof(GPUIBTeam)));
-  new (team_world) GPUIBTeam(this, team_info_wrt_parent, team_info_wrt_world,
-                             num_pes, my_pe, team_world_comm, 0);
-  team_tracker.set_team_world(team_world);
-
-  /**
-   * Copy the address to ROCSHMEM_TEAM_WORLD.
-   */
-  ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
-}
-
-void GPUIBBackend::init_mpi_once(MPI_Comm comm) {
-  static std::mutex init_mutex;
-  const std::lock_guard<std::mutex> lock(init_mutex);
-
-  int init_done = 0;
-  NET_CHECK(MPI_Initialized(&init_done));
-  if (init_done == 0) {
-    int provided;
-    NET_CHECK(
-        MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided));
-  }
-  if (comm == MPI_COMM_NULL) {
-    NET_CHECK(MPI_Comm_dup(MPI_COMM_WORLD, &backend_comm));
-  } else {
-    NET_CHECK(MPI_Comm_dup(comm, &backend_comm));
-  }
-}
-
-std::thread GPUIBBackend::thread_spawn(GPUIBBackend *b) {
-  return std::thread(&GPUIBBackend::thread_func_internal, this, b);
-}
-
-void GPUIBBackend::thread_func_internal(GPUIBBackend *b) {
-  CHECK_HIP(hipSetDevice(hip_dev_id));
-
-  b->initialize_ipc();
-  b->initialize_network();
-  b->setup_ctxs();
-  b->setup_default_ctx();
-  *(b->done_init) = 1;
-}
-
-#ifdef USE_HOST_SIDE_HDP_FLUSH
-void GPUIBBackend::hdp_flush_poll() {
-  while (!worker_thread_exit) {
-    if (hdp_policy->has_active_flush_request()) {
-      hdp_policy->hdp_flush();
-      hdp_policy->clear_active_flush_flag();
-    }
-  }
-}
-#endif
-
-void GPUIBBackend::teams_init() {
-  /**
-   * Allocate pools for the teams sync and work arrary from the SHEAP.
-   */
-  auto max_num_teams{team_tracker.get_max_num_teams()};
-  barrier_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
-      sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE * max_num_teams));
-  reduce_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
-      sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE * max_num_teams));
-  bcast_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
-      sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE * max_num_teams));
-  alltoall_pSync_pool = reinterpret_cast<long *>(rocshmem_malloc(
-      sizeof(long) * ROCSHMEM_ALLTOALL_SYNC_SIZE * max_num_teams));
-
-  /* Accommodating for largest possible data type for pWrk */
-  pWrk_pool = rocshmem_malloc(
-      sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * max_num_teams);
-  pAta_pool = rocshmem_malloc(sizeof(double) * ROCSHMEM_ATA_MAX_WRKDATA_SIZE *
-                               max_num_teams);
-
-  /**
-   * Initialize the sync arrays in the pool with default values.
-   */
-  long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
-  for (int team_i = 0; team_i < max_num_teams; team_i++) {
-    barrier_pSync = reinterpret_cast<long *>(
-        &barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
-    reduce_pSync = reinterpret_cast<long *>(
-        &reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
-    bcast_pSync = reinterpret_cast<long *>(
-        &bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
-    alltoall_pSync = reinterpret_cast<long *>(
-        &alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
-
-    for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
-      barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
-    }
-    for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
-      reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
-    }
-    for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
-      bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
-    }
-    for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
-      alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
-    }
-  }
-
-  /**
-   * Initialize bit mask
-   *
-   * Logical:
-   * MSB..........................................................................LSB
-   * Physical: MSB...1st least significant 8 bits...LSB  MSB...2nd least
-   * signifant 8 bits...LSB
-   *
-   * Description shows only a 2-byte long mask but idea extends to any
-   * arbitrary size.
-   */
-  bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1)
-                                             : (max_num_teams / CHAR_BIT);
-  pool_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
-  reduced_bitmask_ = reinterpret_cast<char *>(malloc(bitmask_size_));
-
-  memset(pool_bitmask_, 0, bitmask_size_);
-  memset(reduced_bitmask_, 0, bitmask_size_);
-  /* Set all to available except the 0th one (reserved for TEAM_WORLD) */
-  for (int bit_i = 1; bit_i < max_num_teams; bit_i++) {
-    int byte_i = bit_i / CHAR_BIT;
-
-    pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT);
-  }
-
-  /**
-   * Make sure that all processing elements have done this before
-   * continuing.
-   */
-  NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
-}
-
-void GPUIBBackend::teams_destroy() {
-  rocshmem_free(barrier_pSync_pool);
-  rocshmem_free(reduce_pSync_pool);
-  rocshmem_free(bcast_pSync_pool);
-  rocshmem_free(alltoall_pSync_pool);
-  rocshmem_free(pWrk_pool);
-  rocshmem_free(pAta_pool);
-
-  free(pool_bitmask_);
-  free(reduced_bitmask_);
-}
-
-void GPUIBBackend::rocshmem_collective_init() {
-  /*
-   * Allocate heap space for barrier_sync
-   */
-  size_t one_sync_size_bytes{sizeof(*barrier_sync)};
-  size_t sync_size_bytes{one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
-  heap.malloc(reinterpret_cast<void **>(&barrier_sync), sync_size_bytes);
-
-  /*
-   * Initialize the barrier synchronization array with default values.
-   */
-  for (int i = 0; i < num_pes; i++) {
-    barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
-  }
-
-  /*
-   * Make sure that all processing elements have done this before
-   * continuing.
-   */
-  NET_CHECK(MPI_Barrier(gpu_ib_comm_world));
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/backend_ib.hpp b/src/gpu_ib/backend_ib.hpp
deleted file mode 100644
index 87b5dc85f1..0000000000
--- a/src/gpu_ib/backend_ib.hpp
+++ /dev/null
@@ -1,351 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
-#define LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
-
-#include "../backend_bc.hpp"
-#include "../containers/free_list_impl.hpp"
-#include "network_policy.hpp"
-#include "../hdp_policy.hpp"
-#include "../hdp_proxy.hpp"
-#include "../memory/hip_allocator.hpp"
-
-namespace rocshmem {
-
-class HostInterface;
-
-/**
- * @class GPUIBBackend backend.hpp
- * @brief InfiniBand specific backend.
- *
- * The InfiniBand (GPUIB) backend enables the device to enqueue network
- * requests to InfiniBand queues (with minimal host intervention). The setup
- * requires some effort from the host, but the device is able to craft
- * InfiniBand requests and send them on its own.
- */
-class GPUIBBackend : public Backend {
- public:
-  /**
-   * @copydoc Backend::Backend(unsigned)
-   */
-  explicit GPUIBBackend(MPI_Comm comm);
-
-  /**
-   * @copydoc Backend::~Backend()
-   */
-  virtual ~GPUIBBackend();
-
-  /**
-   * @brief Abort the application.
-   *
-   * @param[in] status Exit code.
-   *
-   * @return void.
-   *
-   * @note This routine terminates the entire application.
-   */
-  void global_exit(int status) override;
-
-  /**
-   * @copydoc Backend::create_new_team
-   */
-  void create_new_team(Team *parent_team, TeamInfo *team_info_wrt_parent,
-                       TeamInfo *team_info_wrt_world, int num_pes,
-                       int my_pe_in_new_team, MPI_Comm team_comm,
-                       rocshmem_team_t *new_team) override;
-
-  /**
-   * @copydoc Backend::team_destroy(rocshmem_team_t)
-   */
-  void team_destroy(rocshmem_team_t team) override;
-
-  /**
-   * @copydoc Backend::ctx_create
-   */
-  void ctx_create(int64_t options, void **ctx) override;
-
-  __device__ bool create_ctx(int64_t options, rocshmem_ctx_t *ctx);
-
-  /**
-   * @copydoc Backend::ctx_destroy
-   */
-  void ctx_destroy(Context *ctx) override;
-
-  /**
-   * @copydoc Backend::ctx_destroy
-   */
-  __device__ void destroy_ctx(rocshmem_ctx_t *ctx);
-
- protected:
-  /**
-   * @copydoc Backend::dump_backend_stats()
-   */
-  void dump_backend_stats() override;
-
-  /**
-   * @copydoc Backend::reset_backend_stats()
-   */
-  void reset_backend_stats() override;
-
-  /**
-   * @brief spawn a new thread to perform the rest of initialization
-   */
-  std::thread thread_spawn(GPUIBBackend *b);
-
-  /**
-   * @brief overheads for helper thread to run
-   *
-   * @param[in] the thread needs access to the class
-   *
-   * @return void
-   */
-  void thread_func_internal(GPUIBBackend *b);
-
-  /**
-   * @brief initialize MPI.
-   *
-   * GPUIB relies on MPI just to exchange the connection information.
-   *
-   * todo: remove the dependency on MPI and make it generic to PMI-X or just
-   * to OpenSHMEM to have support for both CPU and GPU
-   */
-  void init_mpi_once(MPI_Comm comm);
-
-  /**
-   * @brief init the network support
-   */
-  void initialize_network();
-
-  /**
-   * @brief Invokes the IPC policy class initialization method.
-   *
-   * This method delegates Inter Process Communication (IPC)
-   * initialization to the appropriate policy class. The initialization
-   * needs to be exposed to the Backed due to initialization ordering
-   * constraints. (The symmetric heaps needs to be allocated and
-   * initialized before this method can be called.)
-   *
-   * The policy class encapsulates what the initialization process so
-   * refer to that class for more details.
-   */
-  void initialize_ipc();
-
-  /**
-   * @brief Allocate and initialize the ROCSHMEM_CTX_DEFAULT variable.
-   *
-   * @todo The default_ctx member looks unused after it is copied into
-   * the ROCSHMEM_CTX_DEFAULT variable.
-   */
-  void setup_default_ctx();
-  void setup_ctxs();
-
-  /**
-   * @brief Allocate and initialize the default context for host
-   * operations.
-   */
-  void setup_default_host_ctx();
-
-  /**
-   * @brief Allocate and initialize team world.
-   */
-  void setup_team_world();
-
-  /**
-   * @brief Initialize the resources required to support teams
-   */
-  void teams_init();
-
-  /**
-   * @brief Destruct the resources required to support teams
-   */
-  void teams_destroy();
-
-  /**
-   * @brief Allocate and initialize barrier operation addresses on
-   * symmetric heap.
-   *
-   * When this method completes, the barrier_sync member will be available
-   * for use.
-   */
-  void rocshmem_collective_init();
-
-#ifdef USE_HOST_SIDE_HDP_FLUSH
-  /**
-   * @brief A service thread routine that flushes the hdp cache on behalf of the
-   * GPU.
-   */
-  void hdp_flush_poll();
-
-  /**
-   * @brief Workers used to poll on the device hdp flush request.
-   */
-  std::thread hdp_flush_worker_thread{};
-#endif
-
-  /**
-   * @brief Signals to the worker threads to exist
-   */
-  std::atomic<bool> worker_thread_exit{false};
-
- public:
-  /**
-   * @brief The host-facing interface that will be used
-   * by all contexts of the GPUIBBackend
-   */
-  HostInterface *host_interface{nullptr};
-
-  /**
-   * @brief Handle for raw memory for barrier sync
-   */
-  long *barrier_pSync_pool{nullptr};
-
-  /**
-   * @brief Handle for raw memory for reduce sync
-   */
-  long *reduce_pSync_pool{nullptr};
-
-  /**
-   * @brief Handle for raw memory for broadcast sync
-   */
-  long *bcast_pSync_pool{nullptr};
-
-  /**
-   * @brief Handle for raw memory for alltoall sync
-   */
-  long *alltoall_pSync_pool{nullptr};
-
-  /**
-   * @brief Handle for raw memory for work
-   */
-  void *pWrk_pool{nullptr};
-
-  /**
-   * @brief Handle for raw memory for alltoall
-   */
-  void *pAta_pool{nullptr};
-
-  /**
-   * @brief rocSHMEM's copy of MPI_COMM_WORLD (for interoperability
-   * with orthogonal MPI usage in an MPI+rocSHMEM program).
-   */
-  MPI_Comm gpu_ib_comm_world{};
-  MPI_Comm backend_comm{};
-
-  /**
-   * @brief Holds number of blocks used in library
-   */
-  size_t num_blocks_{1};
-
- private:
-  /**
-   * @brief Allocates cacheable, device memory for the hdp policy.
-   *
-   * @note Internal data ownership is managed by the proxy
-   */
-  HdpProxy<HIPAllocator> hdp_proxy_{};
-
- public:
-  /**
-   * @brief Policy choice for two HDP implementations.
-   *
-   * @todo Combine HDP related stuff together into a class with a
-   * reasonable interface. The functionality does not need to exist in
-   * multiple pieces in the Backend and QueuePair classes. The hdp_rkey,
-   * hdp_addresses, and hdp_policy fields should all live in the class.
-   */
-  HdpPolicy *hdp_policy{hdp_proxy_.get()};
-
-  /**
-   * @brief Scratchpad for the internal barrier algorithms.
-   */
-  int64_t *barrier_sync{nullptr};
-
-  /**
-   * @brief Compile-time configuration policy for network (IB)
-   *
-   *
-   * The configuration option "USE_SINGLE_NODE" can be enabled to not build
-   * with network support.
-   */
-  NetworkImpl networkImpl{};
-
- private:
-  /**
-   * @brief An array of @ref ROContexts that backs the context FreeList.
-   */
-  GPUIBContext *ctx_array{nullptr};
-
-  /**
-   * @brief A free-list containing contexts.
-   */
-  FreeListProxy<HIPAllocator, GPUIBContext *> ctx_free_list{};
-
-  /**
-   * @brief Holds maximum number of contexts used in library
-   */
-  size_t maximum_num_contexts_{1024};
-
-  /**
-   * @brief The bitmask representing the availability of teams in the pool
-   */
-  char *pool_bitmask_{nullptr};
-
-  /**
-   * @brief Bitmask to store the reduced result of bitmasks on pariticipating
-   * PEs
-   *
-   * With no thread-safety for this bitmask, multithreaded creation of teams is
-   * not supported.
-   */
-  char *reduced_bitmask_{nullptr};
-
-  /**
-   * @brief Size of the bitmask
-   */
-  int bitmask_size_{-1};
-
-  /**
-   * @brief a helper thread to perform the initialization (non-blocking init)
-   */
-  std::thread async_thread_{};
-
-  /**
-   * @brief Holds a copy of the default context (see OpenSHMEM
-   * specification).
-   *
-   * @todo Remove this member from the backend class. There is another
-   * copy stored in ROCSHMEM_CTX_DEFAULT.
-   */
-  GPUIBContext *default_ctx_{nullptr};
-
-  /**
-   * @brief Holds a copy of the default context for host functions
-   */
-  GPUIBHostContext *default_host_ctx_{nullptr};
-
-  unsigned int* hdp_gpu_cpu_flush_flag_;
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_BACKEND_IB_HPP_
diff --git a/src/gpu_ib/connect_tests.cpp b/src/gpu_ib/connect_tests.cpp
deleted file mode 100644
index 0f2cc67686..0000000000
--- a/src/gpu_ib/connect_tests.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include <gtest/gtest.h>
-#include <mpi.h>
-
-#include "dynamic_connection.hpp"
-#include "reliable_connection.hpp"
-
-namespace rocshmem {
-
-// test with different use_ib_hca
-// test with different heap size
-// test with different sleep
-// test with different sq_size
-
-TEST(DynamicConnect, ToNothing) {
-  DynamicConnection connect;
-  connect.construct_init(1);
-}
-
-// test with different num_dcis
-// test with different num_dcts
-
-TEST(ReliableConnect, ToNothing) {
-  ReliableConnection connect;
-  connect.construct_init(1);
-}
-
-int main(int argc, char **argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/connection.cpp b/src/gpu_ib/connection.cpp
deleted file mode 100644
index 31f8eda4d3..0000000000
--- a/src/gpu_ib/connection.cpp
+++ /dev/null
@@ -1,431 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "connection.hpp"
-
-#include <mpi.h>
-
-#include <mutex>  // NOLINT(build/c++11)
-#include <vector>
-
-#include "backend_ib.hpp"
-#include "queue_pair.hpp"
-#include "../util.hpp"
-
-namespace rocshmem {
-
-int Connection::use_gpu_mem = 0;
-int Connection::coherent_cq = 0;
-
-Connection::Connection(GPUIBBackend* b, int k) : backend(b), key_offset(k) {
-  char* value = nullptr;
-
-  if ((value = getenv("ROCSHMEM_USE_IB_HCA"))) {
-    requested_dev = value;
-  }
-
-  if ((value = getenv("ROCSHMEM_SQ_SIZE"))) {
-    sq_size = atoi(value);
-  }
-
-  if ((value = getenv("ROCSHMEM_USE_CQ_GPU_MEM")) != nullptr) {
-    cq_use_gpu_mem = atoi(value);
-  }
-
-  if ((value = getenv("ROCSHMEM_USE_SQ_GPU_MEM")) != nullptr) {
-    sq_use_gpu_mem = atoi(value);
-  }
-}
-
-Connection::~Connection() { delete ib_state; }
-
-void Connection::reg_mr(void* ptr, size_t size, ibv_mr** mr, bool managed) {
-  int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
-               IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
-  if (managed) {
-    access |= IBV_ACCESS_ON_DEMAND;
-  }
-
-  *mr = ibv_reg_mr(ib_state->pd, ptr, size, access);
-
-  if (*mr == nullptr) {
-    abort();
-  }
-}
-
-unsigned Connection::total_number_connections() {
-  int connections;
-  get_remote_conn(&connections);
-  return backend->num_blocks_ * connections;
-}
-
-void Connection::initialize(int num_block) {
-  allocate_dynamic_members(num_block);
-
-  int ib_devices{0};
-  dev_list = ibv_get_device_list(&ib_devices);
-  if (dev_list == nullptr) {
-    abort();
-  }
-
-  struct ibv_device* ib_dev = dev_list[0];
-  if (requested_dev != nullptr) {
-    for (int i = 0; i < ib_devices; i++) {
-      const char* select_dev = ibv_get_device_name(dev_list[i]);
-      if (strstr(select_dev, requested_dev) != nullptr) {
-        ib_dev = dev_list[i];
-        break;
-      }
-    }
-  }
-
-  uint8_t port = 1;
-  ib_init(ib_dev, port);
-
-  int hip_dev_id = 0;
-  CHECK_HIP(hipGetDevice(&hip_dev_id));
-
-  int ib_fork_err = ibv_fork_init();
-  if (ib_fork_err != 0) printf("error: ibv_fork_init failed \n");
-
-  sq_post_dv = static_cast<sq_post_dv_t*>(
-      malloc(sizeof(sq_post_dv_t) * total_number_connections()));
-
-  if (sq_post_dv == nullptr) {
-    abort();
-  }
-
-  create_qps(port, backend->my_pe, &ib_state->portinfo);
-  initialize_1(port, num_block);
-
-  MPI_Barrier(backend->thread_comm);
-  free_dynamic_members();
-}
-
-void Connection::finalize() {
-  ibv_free_device_list(dev_list);
-
-  int ret = ibv_dereg_mr(backend->networkImpl.heap_mr);
-  if (ret) {
-    abort();
-  }
-  // comment until rocm 4.5
-  // ibv_dereg_mr(backend->networkImpl.hdp_mr);
-  ibv_dereg_mr(backend->networkImpl.mr);
-}
-
-void Connection::ib_init(struct ibv_device* ib_dev, uint8_t port) {
-  ib_state = new ib_state_t;
-  if (!ib_state) {
-    abort();
-  }
-
-  ib_state->context = ibv_open_device(ib_dev);
-  if (!ib_state->context) {
-    delete ib_state;
-    abort();
-  }
-
-  ib_state->pd = ibv_alloc_pd(ib_state->context);
-  if (!ib_state->pd) {
-    delete ib_state;
-    abort();
-  }
-
-  ibv_parent_domain_init_attr pattr;
-  init_parent_domain_attr(&pattr);
-  ib_state->pd = ibv_alloc_parent_domain(ib_state->context, &pattr);
-
-  ibv_query_port(ib_state->context, port, &ib_state->portinfo);
-}
-
-template <typename StateType>
-void Connection::try_to_modify_qp(ibv_qp* qp, StateType state) {
-  ibv_modify_qp(qp, &state.exp_qp_attr, state.exp_attr_mask);
-}
-
-void Connection::init_qp_status(ibv_qp* qp, uint8_t port) {
-  try_to_modify_qp<InitQPState>(qp, initqp(port));
-}
-
-/**
- * rtr stands for 'ready to receive'
- */
-void Connection::change_status_rtr(ibv_qp* qp, dest_info_t* dest,
-                                     uint8_t port) {
-  try_to_modify_qp<RtrState>(qp, rtr(dest, port));
-}
-
-/**
- * rts stands for 'ready to send'
- */
-void Connection::change_status_rts(ibv_qp* qp, dest_info_t* dest) {
-  try_to_modify_qp<RtsState>(qp, rts(dest));
-}
-
-void Connection::create_qps(uint8_t port, int my_rank,
-                              ibv_port_attr* ib_port_att) {
-  create_qps_1();
-
-  ibv_qp_cap cap{};
-  cap.max_send_wr = sq_size;
-  cap.max_send_sge = 1;
-  cap.max_inline_data = 4;
-
-  QPInitAttr qp_init_attr = qpattr(cap);
-
-  size_t qp_size = total_number_connections();
-  cqs.resize(qp_size);
-  qps.resize(qp_size);
-
-  int cqe = qp_init_attr.attr.cap.max_send_wr;
-  for (auto& entry : cqs) {
-    entry = create_cq(ib_state->context, ib_state->pd, cqe);
-    if (!entry) {
-      abort();
-    }
-  }
-
-  create_qps_2(port, my_rank, ib_port_att);
-
-  for (int i = 0; i < qps.size(); i++) {
-    qps[i] =
-        create_qp(ib_state->pd, ib_state->context, &qp_init_attr.attr, cqs[i]);
-    if (!qps[i]) {
-      abort();
-    }
-
-    create_qps_3(port, qps[i], i, ib_port_att);
-  }
-}
-
-void Connection::initialize_gpu_policy(ConnectionImpl** conn,
-                                         uint32_t* heap_rkey) {
-  CHECK_HIP(hipMalloc(reinterpret_cast<void**>(conn), sizeof(ConnectionImpl)));
-  new (*conn) ConnectionImpl(this, heap_rkey);
-}
-
-/*
- * Create and write the rdma segment to the SQ
- */
-void Connection::set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address,
-                              uint32_t rkey) {
-  rdma->raddr = htobe64(address);
-  rdma->rkey = htobe32(rkey);
-}
-
-/*
- * Retrieve the address of a SQ.
- * We used this address to write the WQE directly to the SQ.
- */
-uint64_t* Connection::get_address_sq(int i) {
-  mlx5dv_obj mlx_obj;
-  mlx5dv_qp qp_out;
-
-  mlx_obj.qp.in = qps[i];
-  mlx_obj.qp.out = &qp_out;
-
-  mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP);
-
-  return reinterpret_cast<uint64_t*>(qp_out.sq.buf);
-}
-
-void* Connection::buf_alloc([[maybe_unused]] struct ibv_pd* pd,
-                            [[maybe_unused]] void* pd_context, size_t size,
-                            [[maybe_unused]] size_t alignment,
-                            [[maybe_unused]] uint64_t resource_type) {
-  if (use_gpu_mem) {
-    void* dev_ptr;
-    if (coherent_cq == 1) {
-#if defined USE_COHERENT_HEAP
-      CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&dev_ptr), size));
-#else
-  #ifdef HIP_SUPPORTS_MALLOC_UNCACHED
-        CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
-                                        hipDeviceMallocUncached));
-  #else
-        CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
-                                        hipDeviceMallocFinegrained));
-  #endif
-#endif
-    } else {
-#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
-      CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
-                                      hipDeviceMallocUncached));
-#else
-      CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size,
-                                      hipDeviceMallocFinegrained));
-#endif
-
-    }
-    memset(dev_ptr, 0, size);
-    return dev_ptr;
-  }
-  return IBV_ALLOCATOR_USE_DEFAULT;
-}
-
-void Connection::buf_release([[maybe_unused]] struct ibv_pd* pd,
-                             [[maybe_unused]] void* pd_context, void* ptr,
-                             [[maybe_unused]] uint64_t resource_type) {
-  if (use_gpu_mem) {
-    CHECK_HIP(hipFree(ptr));
-  } else {
-    free(ptr);
-  }
-}
-
-void Connection::init_parent_domain_attr(ibv_parent_domain_init_attr* attr1) {
-  attr1->pd = ib_state->pd;
-  attr1->td = nullptr;
-  attr1->comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS;
-  attr1->alloc = Connection::buf_alloc;
-  attr1->free = Connection::buf_release;
-  attr1->pd_context = nullptr;
-}
-
-ibv_cq* Connection::create_cq(ibv_context* context, ibv_pd* pd, int cqe) {
-  use_gpu_mem = cq_use_gpu_mem;
-
-  ibv_cq_init_attr_ex cq_attr;
-  memset(&cq_attr, 0, sizeof(ibv_cq_init_attr_ex));
-  cq_attr.cqe = cqe;
-  cq_attr.cq_context = nullptr;
-  cq_attr.channel = nullptr;
-  cq_attr.comp_vector = 0;
-  cq_attr.flags = 0;  // see ibv_exp_cq_create_flags
-  cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_PD;
-  cq_attr.parent_domain = pd;
-
-  coherent_cq = 1;
-  ibv_cq_ex* cq = ibv_create_cq_ex(context, &cq_attr);
-  coherent_cq = 0;
-  if (!cq) {
-    printf("error in ibv_create_cq_ex: %d %s\n", errno, strerror(errno));
-    return nullptr;
-  }
-  return ibv_cq_ex_to_cq(cq);
-}
-
-void Connection::init_gpu_qp_from_connection(QueuePair* gpu_qp,
-                                               int conn_num) {
-  int hip_dev_id = 0;
-  CHECK_HIP(hipGetDevice(&hip_dev_id));
-  use_gpu_mem = cq_use_gpu_mem;
-
-  mlx5dv_cq cq_out;
-  mlx5dv_obj mlx_obj;
-  mlx_obj.cq.in = cqs[conn_num];
-  mlx_obj.cq.out = &cq_out;
-
-  mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
-  gpu_qp->cq_log_size = log2(cq_out.cqe_cnt);
-  gpu_qp->cq_size = cq_out.cqe_cnt;
-
-  void* gpu_ptr = nullptr;
-  if (use_gpu_mem) {
-    gpu_qp->current_cq_q = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
-  } else {
-    rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(cq_out.buf),
-                                   cq_out.cqe_cnt * 64, &gpu_ptr, hip_dev_id);
-    gpu_qp->current_cq_q = reinterpret_cast<mlx5_cqe64*>(gpu_ptr);
-  }
-  gpu_qp->current_cq_q_H = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
-
-  rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(cq_out.dbrec), 64,
-                                 &gpu_ptr, hip_dev_id);
-
-  gpu_qp->dbrec_cq = reinterpret_cast<volatile uint32_t*>(gpu_ptr);
-
-  use_gpu_mem = sq_use_gpu_mem;
-
-  mlx5dv_qp qp_out;
-  mlx_obj.qp.in = qps[conn_num];
-  mlx_obj.qp.out = &qp_out;
-
-  mlx5dv_init_obj(&mlx_obj, MLX5DV_OBJ_QP);
-
-  gpu_qp->max_nwqe = (qp_out.sq.wqe_cnt);
-
-  volatile uint32_t* dbrec_send = qp_out.dbrec + 1;
-
-  if (use_gpu_mem) {
-    gpu_qp->current_sq = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
-    gpu_qp->dbrec_send = reinterpret_cast<volatile uint32_t*>(dbrec_send);
-  } else {
-    gpu_ptr = nullptr;
-    rocm_memory_lock_to_fine_grain(reinterpret_cast<void*>(qp_out.sq.buf),
-                                   qp_out.sq.wqe_cnt * 64, &gpu_ptr,
-                                   hip_dev_id);
-
-    gpu_qp->current_sq = reinterpret_cast<uint64_t*>(gpu_ptr);
-
-    rocm_memory_lock_to_fine_grain(
-        reinterpret_cast<void*>(const_cast<uint32_t*>(dbrec_send)), 32,
-        &gpu_ptr, hip_dev_id);
-
-    gpu_qp->dbrec_send = reinterpret_cast<volatile uint32_t*>(gpu_ptr);
-  }
-
-  gpu_qp->current_sq_H = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
-
-  gpu_qp->setDBval(*(reinterpret_cast<uint64_t*>(qp_out.sq.buf)));
-
-  rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr,
-                                 hip_dev_id);
-
-  gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
-
-  uint32_t* sq = reinterpret_cast<uint32_t*>(qp_out.sq.buf);
-  uint32_t ctrl_qp_sq = (reinterpret_cast<uint32_t*>(sq))[1];
-  gpu_qp->ctrl_qp_sq = ctrl_qp_sq & 0xFFFFFF;
-  gpu_qp->ctrl_sig = (reinterpret_cast<uint64_t*>(sq))[1];
-  gpu_qp->rkey = (reinterpret_cast<uint32_t*>(sq))[6 + key_offset];
-  gpu_qp->lkey = (reinterpret_cast<uint32_t*>(sq))[9 + key_offset];
-}
-
-ibv_qp* Connection::create_qp(ibv_pd* pd, ibv_context* context,
-                              ibv_qp_init_attr_ex* qp_attr, ibv_cq* cq) {
-  use_gpu_mem = sq_use_gpu_mem;
-
-  ibv_qp* qp = nullptr;
-
-  assert(pd);
-  assert(context);
-  assert(qp_attr);
-
-  qp_attr->send_cq = cq;
-  qp_attr->recv_cq = cq;
-  qp_attr->pd = pd;
-
-  qp_attr->comp_mask = IBV_QP_INIT_ATTR_PD;
-
-  qp = create_qp_0(context, qp_attr);
-
-  if (!qp) {
-    printf("***** error ibv_create_qp failed %d m %m \n", errno, errno);
-    ibv_destroy_cq(cq);
-  }
-
-  return qp;
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/connection.hpp b/src/gpu_ib/connection.hpp
deleted file mode 100644
index bdcf9eaeba..0000000000
--- a/src/gpu_ib/connection.hpp
+++ /dev/null
@@ -1,259 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
-#define LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
-
-#include <infiniband/verbs.h>
-
-extern "C" {
-#include <infiniband/mlx5dv.h>
-}
-
-#include <vector>
-
-#include "rocshmem/rocshmem.hpp"
-#include "connection_policy.hpp"
-
-namespace rocshmem {
-
-class GPUIBBackend;
-class QueuePair;
-
-class Connection {
- protected:
-  typedef struct ib_state {
-    struct ibv_context* context;
-    struct ibv_pd* pd;
-    struct ibv_mr* mr;
-    struct ibv_port_attr portinfo;
-  } ib_state_t;
-
-  typedef struct dest_info {
-    int lid;
-    int qpn;
-    int psn;
-    union ibv_gid gid;
-  } dest_info_t;
-
-  typedef struct heap_info {
-    void* base_heap;
-    uint32_t rkey;
-  } heap_info_t;
-
-  struct sq_post_dv_t {
-    uint64_t segments[16];
-    uint32_t current_sq;
-    uint16_t wqe_idx;
-  };
-
-  class State {
-   public:
-    ibv_qp_attr exp_qp_attr{};
-    uint64_t exp_attr_mask{};
-  };
-
-  class InitQPState : public State {
-   public:
-    InitQPState() {
-      exp_qp_attr.qp_state = IBV_QPS_INIT;
-      exp_qp_attr.qp_access_flags =
-          IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
-          IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
-
-      exp_attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
-    }
-  };
-
-  class RtrState : public State {
-   public:
-    RtrState() {
-      exp_qp_attr.qp_state = IBV_QPS_RTR;
-      exp_qp_attr.path_mtu = IBV_MTU_4096;
-      exp_qp_attr.ah_attr.sl = 1;
-      exp_qp_attr.max_dest_rd_atomic = 1;
-      exp_qp_attr.min_rnr_timer = 12;
-
-      exp_attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU;
-    }
-  };
-
-  class RtsState : public State {
-   public:
-    RtsState() {
-      exp_qp_attr.qp_state = IBV_QPS_RTS;
-      exp_qp_attr.timeout = 14;
-      exp_qp_attr.retry_cnt = 7;
-      exp_qp_attr.rnr_retry = 7;
-      exp_qp_attr.max_rd_atomic = 1;
-
-      exp_attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
-                      IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC;
-    }
-  };
-
-  class QPInitAttr {
-   public:
-    explicit QPInitAttr(ibv_qp_cap cap) {
-      attr.cap = cap;
-      attr.sq_sig_all = 0;
-    }
-    ibv_qp_init_attr_ex attr{};
-  };
-
- public:
-  Connection(GPUIBBackend* backend, int key_offset);
-
-  virtual ~Connection();
-
-  void initialize(int num_block);
-
-  void finalize();
-
-  virtual void post_wqes() = 0;
-
-  void reg_mr(void* ptr, size_t size, ibv_mr** mr, bool is_managed);
-
-  virtual void get_remote_conn(int* remote_conn) = 0;
-
-  unsigned total_number_connections();
-
-  virtual void initialize_rkey_handle(uint32_t** heap_rkey_handle,
-                                        ibv_mr* mr) = 0;
-
-  virtual void free_rkey_handle(uint32_t* heap_rkey_handle) = 0;
-
-  void initialize_gpu_policy(ConnectionImpl** conn, uint32_t* heap_rkey);
-
-  /*
-   * Populate a QueuePair for use on the GPU from the internal IB state.
-   */
-  void init_gpu_qp_from_connection(QueuePair* qp, int conn_num);
-
- protected:
-  Connection() = default;
-
-  virtual InitQPState initqp(uint8_t port) = 0;
-
-  virtual RtrState rtr(dest_info_t* dest, uint8_t port) = 0;
-
-  virtual RtsState rts(dest_info_t* dest) = 0;
-
-  virtual QPInitAttr qpattr(ibv_qp_cap cap) = 0;
-
-  void init_qp_status(ibv_qp* qp, uint8_t port);
-
-  void change_status_rtr(ibv_qp* qp, dest_info_t* dest, uint8_t port);
-
-  void change_status_rts(ibv_qp* qp, dest_info_t* dest);
-
-  void create_qps(uint8_t port, int my_rank, ibv_port_attr* ib_port_att);
-
-  template <typename T>
-  void try_to_modify_qp(ibv_qp* qp, T state);
-
-  virtual void create_qps_1() = 0;
-
-  virtual void create_qps_2(int port, int my_rank,
-                              ibv_port_attr* ib_port_att) = 0;
-
-  virtual void create_qps_3(int port, ibv_qp* qp, int offset,
-                              ibv_port_attr* ib_port_att) = 0;
-
-  virtual ibv_qp* create_qp_0(ibv_context* context,
-                              ibv_qp_init_attr_ex* qp_attr) = 0;
-
-  virtual void allocate_dynamic_members(int num_block) = 0;
-
-  virtual void free_dynamic_members() = 0;
-
-  virtual void initialize_1(int port, int num_block) = 0;
-
-  virtual void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah,
-                                    int dc_key) = 0;
-
-  virtual int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) = 0;
-
-  void set_sq_dv(int num_block, int wg_idx, int pe_idx);
-
-  /*
-   * ibv interface functions must be static.
-   */
-  static void* buf_alloc(ibv_pd* pd, void* pd_context, size_t size,
-                         size_t alignment, uint64_t resource_type);
-
-  static void buf_release(ibv_pd* pd, void* pd_context, void* ptr,
-                          uint64_t resource_type);
-
-  void init_parent_domain_attr(ibv_parent_domain_init_attr* attr);
-
-  void set_rdma_seg(mlx5_wqe_raddr_seg* rdma, uint64_t address, uint32_t rkey);
-
-  uint64_t* get_address_sq(int i);
-
-  ibv_cq* create_cq(ibv_context* context, ibv_pd* pd, int cqe);
-
-  ibv_qp* create_qp(ibv_pd* pd, ibv_context* context,
-                    ibv_qp_init_attr_ex* qp_attr, ibv_cq* rcq);
-
-  /*
-   * TODO: Remove this eventually. Goal is to have backend delegate
-   * connection stuff to this class, while this class knows nothing about
-   * GPUs or backends.
-   */
-  GPUIBBackend* backend{nullptr};
-
-  uint32_t sq_size{1024};
-
-  ib_state_t* ib_state{nullptr};
-
-  const int key_offset{0};
-
-  sq_post_dv_t* sq_post_dv{nullptr};
-
-  std::vector<ibv_cq*> cqs;
-
-  std::vector<ibv_qp*> qps;
-
-  uint64_t counter_wqe{0};
-
-  static int use_gpu_mem;
-
-  static int coherent_cq;
-
-  int cq_use_gpu_mem{1};
-
-  int sq_use_gpu_mem{1};
-
- private:
-  void init_shmem_handle();
-
-  void ib_init(ibv_device* ib_dev, uint8_t port);
-
-  char* requested_dev{nullptr};
-
-  ibv_device** dev_list{nullptr};
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_CONNECTION_HPP_
diff --git a/src/gpu_ib/connection_policy.cpp b/src/gpu_ib/connection_policy.cpp
deleted file mode 100644
index 5ccf91a0ea..0000000000
--- a/src/gpu_ib/connection_policy.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "connection_policy.hpp"
-
-#include <infiniband/mlx5dv.h>
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "dynamic_connection.hpp"
-#include "queue_pair.hpp"
-
-#ifdef DEBUG
-#define HIP_ENABLE_PRINTF 1
-#endif
-
-namespace rocshmem {
-
-RCConnectionImpl::RCConnectionImpl([[maybe_unused]] Connection* conn,
-                                   [[maybe_unused]] uint32_t* _vec_rkey) {}
-
-DCConnectionImpl::DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey)
-    : vec_dct_num(static_cast<DynamicConnection*>(conn)->get_vec_dct_num()),
-      vec_rkey(_vec_rkey),
-      vec_lids(static_cast<DynamicConnection*>(conn)->get_vec_lids()) {}
-
-__device__ uint32_t RCConnectionImpl::getNumWqesImpl([
-    [maybe_unused]] uint8_t opcode) {
-  return 1;
-}
-
-__device__ uint32_t DCConnectionImpl::getNumWqesImpl(uint8_t opcode) {
-  // FIXME: We assume all threads in wave are performing ATOMIC ops.
-  // While this might be common, we do not have such restriction
-  // so need to be fixed.
-  // Since OFED 5.2, a DC segments uses 48bytes - so with or without
-  // atomic we need 2 wqes.
-  // return 2;
-  return (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS)
-             ? 2
-             : 1;
-}
-
-__device__ bool RCConnectionImpl::updateConnectionSegmentImpl(
-    [[maybe_unused]] ib_mlx5_base_av_t* wqe, [[maybe_unused]] int pe) {
-  return false;
-}
-
-__device__ bool DCConnectionImpl::updateConnectionSegmentImpl(
-    ib_mlx5_base_av_t* wqe, int pe) {
-  wqe->dqp_dct = vec_dct_num[pe];
-  wqe->rlid = vec_lids[pe];
-  return true;
-}
-
-__device__ void RCConnectionImpl::setRkeyImpl([[maybe_unused]] uint32_t* rkey,
-                                              [[maybe_unused]] int pe) {}
-
-__device__ void DCConnectionImpl::setRkeyImpl(uint32_t* rkey, int pe) {
-  *rkey = vec_rkey[pe];
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/connection_policy.hpp b/src/gpu_ib/connection_policy.hpp
deleted file mode 100644
index 76bb5db1cf..0000000000
--- a/src/gpu_ib/connection_policy.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
-#define LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "infiniband_structs.hpp"
-
-namespace rocshmem {
-
-/*
- * CRTP base class for connection type
- */
-template <typename Derived>
-class ConnectionBase {
- public:
-  /*
-   * Control segment WQE offset imposed by this connection type.
-   */
-  __device__ int wqeCntrlOffset() {
-    return static_cast<Derived*>(this)->wqeCntrlOffsetImpl();
-  }
-
-  /*
-   * Whether or not we need to force PE-level divergence when posting for
-   * this connection type.
-   */
-  __device__ bool forcePostDivergence() {
-    return static_cast<Derived*>(this)->forcePostDivergenceImpl();
-  }
-
-  /*
-   * Number of WQEs produced by this connection type for the given opcode.
-   */
-  __device__ uint32_t getNumWqes(uint8_t opcode) {
-    return static_cast<Derived*>(this)->getNumWqesImpl(opcode);
-  }
-
-  /*
-   * Updates the connection-specific segment in the SQ.
-   */
-  __device__ bool updateConnectionSegment(ib_mlx5_base_av_t* wqe, int pe) {
-    return static_cast<Derived*>(this)->updateConnectionSegmentImpl(wqe, pe);
-  }
-
-  /*
-   * Set the rkey based on this connection type.
-   */
-  __device__ void setRkey(uint32_t* rkey, int pe) {
-    static_cast<Derived*>(this)->setRkeyImpl(rkey, pe);
-  }
-};
-
-class Connection;
-
-/*
- * Connection policy corresponding to an RC connection type.
- */
-class RCConnectionImpl : public ConnectionBase<RCConnectionImpl> {
- public:
-  RCConnectionImpl(Connection* conn, uint32_t* _vec_rkey);
-
-  __device__ int wqeCntrlOffsetImpl() { return 0; }
-
-  __device__ bool forcePostDivergenceImpl() { return true; }
-
-  __device__ uint32_t getNumWqesImpl(uint8_t opcode);
-
-  __device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe);
-
-  __device__ void setRkeyImpl(uint32_t* rkey, int pe);
-};
-
-/*
- * Connection policy corresponding to a DC connection type.
- */
-class DCConnectionImpl : public ConnectionBase<DCConnectionImpl> {
- public:
-  DCConnectionImpl(Connection* conn, uint32_t* _vec_rkey);
-
-  __device__ int wqeCntrlOffsetImpl() { return 1; }
-
-  __device__ bool forcePostDivergenceImpl() { return false; }
-
-  __device__ uint32_t getNumWqesImpl(uint8_t opcode);
-
-  __device__ bool updateConnectionSegmentImpl(ib_mlx5_base_av_t* wqe, int pe);
-
-  __device__ void setRkeyImpl(uint32_t* rkey, int pe);
-
- private:
-  uint32_t* vec_dct_num{nullptr};
-
-  uint32_t* vec_rkey{nullptr};
-
-  uint16_t* vec_lids{nullptr};
-};
-
-/*
- * Select which one of our connection policies to use at compile time.
- */
-#ifdef USE_DC
-typedef DCConnectionImpl ConnectionImpl;
-#else
-typedef RCConnectionImpl ConnectionImpl;
-#endif
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_CONNECTION_POLICY_HPP_
diff --git a/src/gpu_ib/context_ib_device.cpp b/src/gpu_ib/context_ib_device.cpp
deleted file mode 100644
index c3e8146736..0000000000
--- a/src/gpu_ib/context_ib_device.cpp
+++ /dev/null
@@ -1,371 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "context_ib_device.hpp"
-
-#include <hip/hip_runtime.h>
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "rocshmem/rocshmem.hpp"
-#include "../backend_type.hpp"
-#include "../context_incl.hpp"
-#include "backend_ib.hpp"
-#include "queue_pair.hpp"
-
-namespace rocshmem {
-
-__host__ GPUIBContext::GPUIBContext(Backend *backend, bool option, int idx)
-    : Context(backend, option) {
-  GPUIBBackend *b{static_cast<GPUIBBackend *>(backend)};
-  ctx_idx = idx;
-  networkImpl = b->networkImpl;
-  base_heap = b->heap.get_heap_bases().data();
-  networkImpl.networkHostInit(this, idx);
-
-  barrier_sync = b->barrier_sync;
-  ipcImpl_.ipc_bases = b->ipcImpl.ipc_bases;
-  ipcImpl_.shm_size = b->ipcImpl.shm_size;
-}
-
-__device__ void GPUIBContext::ctx_create() {
-  /* Nothing to do in the GPU_IB backend */
-  return;
-}
-
-/*
- * TODO(bpotter): these will go in a policy class based on DC/RC.
- * I am not completely sure at this point what else is needed in said class,
- * so just leave them up here for now.
- */
-__device__ __host__ QueuePair *GPUIBContext::getQueuePair(int pe) {
-  return networkImpl.getQueuePair(device_qp_proxy, pe);
-}
-
-__device__ __host__ int GPUIBContext::getNumQueuePairs() {
-  return networkImpl.getNumQueuePairs();
-}
-
-__device__ __host__ int GPUIBContext::getNumDest() {
-  return networkImpl.getNumDest();
-}
-
-__device__ void GPUIBContext::fence() {
-#ifdef USE_SINGLE_NODE
-  threadfence_system();
-#else
-
-  for (int k = 0; k < getNumDest(); k++) {
-    getQueuePair(k)->fence(k);
-  }
-
-  fence_.flush();
-#endif
-}
-
-__device__ void GPUIBContext::fence(int pe) {
-#ifdef USE_SINGLE_NODE
-  threadfence_system();
-#else
-  getQueuePair(pe)->fence(pe);
-  fence_.flush();
-#endif
-}
-
-__device__ void GPUIBContext::putmem_nbi(void *dest, const void *source,
-                                         size_t nelems, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
-
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset,
-                     const_cast<void *>(source), nelems);
-  } else {
-    bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
-    if (!must_send_message) {
-      return;
-    }
-
-    auto *qp = getQueuePair(pe);
-    qp->put_nbi<THREAD>(base_heap[pe] + L_offset, source, nelems, pe, true);
-  }
-}
-
-__device__ void GPUIBContext::getmem_nbi(void *dest, const void *source,
-                                         size_t nelems, int pe) {
-  const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
-  } else {
-    bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
-    if (!must_send_message) {
-      return;
-    }
-
-    auto *qp = getQueuePair(pe);
-    qp->get_nbi<THREAD>(base_heap[pe] + L_offset, dest, nelems, pe, true);
-  }
-}
-
-__device__ void GPUIBContext::quiet() {
-#ifdef USE_SINGLE_NODE
-  threadfence_system();
-  for (int pe = 0; pe < ipcImpl_.shm_size; pe++) {
-    if (pe != my_pe) {
-      ipcImpl_.zero_byte_read(pe);
-    }
-  }
-#else
-
-  for (int k = 0; k < getNumDest(); k++) {
-    getQueuePair(k)->quiet_single_heavy<THREAD>(k);
-  }
-  fence_.flush();
-
-#endif
-}
-
-__device__ void *GPUIBContext::shmem_ptr(const void *dest, int pe) {
-  void *ret = nullptr;
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    void *dst = const_cast<void *>(dest);
-    uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
-    int local_pe = pe % ipcImpl_.shm_size;
-    ret = ipcImpl_.ipc_bases[local_pe] + L_offset;
-  }
-  return ret;
-}
-
-__device__ void GPUIBContext::threadfence_system() {
-  int thread_id = get_flat_block_id();
-
-  if (thread_id % WF_SIZE == lowerID()) {
-#ifdef USE_SINGLE_NODE
-    // Flush current PE HDP
-    HdpPolicy::hdp_flush(
-        reinterpret_cast<unsigned int *>(networkImpl.hdp_address));
-
-    // Flush the rest of the HDPs
-    for (int pe = 0; pe < ipcImpl_.shm_size; pe++) {
-      auto target_address = networkImpl.hdp_address;
-      const int value = HdpPolicy::HDP_FLUSH_VAL;
-      if (pe != my_pe) {
-        const int value = HdpPolicy::HDP_FLUSH_VAL;
-        auto mapped_address =
-            shmem_ptr(reinterpret_cast<void *>(target_address), pe);
-        __hip_atomic_store(static_cast<int *>(mapped_address), value,
-                           __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-      }
-    }
-#else
-    getQueuePair(my_pe)->hdp_policy->flushCoherency();
-#endif
-  }
-
-  __threadfence_system();
-}
-
-__device__ void GPUIBContext::getmem(void *dest, const void *source,
-                                     size_t nelems, int pe) {
-  const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
-  } else {
-    bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
-    if (!must_send_message) {
-      return;
-    }
-    auto *qp = getQueuePair(pe);
-    qp->get_nbi_cqe<THREAD>(base_heap[pe] + L_offset, dest, nelems, pe, true);
-    qp->quiet_single<THREAD>();
-  }
-  fence_.flush();
-}
-
-__device__ void GPUIBContext::putmem(void *dest, const void *source,
-                                     size_t nelems, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy(ipcImpl_.ipc_bases[local_pe] + L_offset,
-                     const_cast<void *>(source), nelems);
-
-    threadfence_system();
-    ipcImpl_.zero_byte_read(pe);
-  } else {
-    bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
-    if (!must_send_message) {
-      return;
-    }
-    auto *qp = getQueuePair(pe);
-    qp->put_nbi_cqe<THREAD>(base_heap[pe] + L_offset, source, nelems, pe, true);
-    qp->quiet_single<THREAD>();
-  }
-  fence_.flush();
-}
-
-/******************************************************************************
- ************************ WORKGROUP/WAVE-LEVEL RMA API ************************
- *****************************************************************************/
-__device__ void GPUIBContext::putmem_nbi_wg(void *dest, const void *source,
-                                            size_t nelems, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset,
-                        const_cast<void *>(source), nelems);
-  } else {
-    if (is_thread_zero_in_block()) {
-      auto *qp = getQueuePair(pe);
-      qp->put_nbi<WG>(base_heap[pe] + L_offset, source, nelems, pe, true);
-    }
-  }
-  __syncthreads();
-}
-
-__device__ void GPUIBContext::putmem_nbi_wave(void *dest, const void *source,
-                                              size_t nelems, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset,
-                          const_cast<void *>(source), nelems);
-  } else {
-    if (is_thread_zero_in_wave()) {
-      auto *qp = getQueuePair(pe);
-      qp->put_nbi<WAVE>(base_heap[pe] + L_offset, source, nelems, pe, true);
-    }
-  }
-}
-
-__device__ void GPUIBContext::putmem_wg(void *dest, const void *source,
-                                        size_t nelems, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wg(ipcImpl_.ipc_bases[local_pe] + L_offset,
-                        const_cast<void *>(source), nelems);
-    __syncthreads();
-    threadfence_system();
-    ipcImpl_.zero_byte_read(pe);
-  } else {
-    auto *qp = getQueuePair(pe);
-    if (is_thread_zero_in_block()) {
-      qp->put_nbi_cqe<WG>(base_heap[pe] + L_offset, source, nelems, pe, true);
-    }
-    qp->quiet_single<WG>();
-  }
-  __syncthreads();
-  fence_.flush();
-}
-
-__device__ void GPUIBContext::putmem_wave(void *dest, const void *source,
-                                          size_t nelems, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe];
-  auto *qp = getQueuePair(pe);
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wave(ipcImpl_.ipc_bases[local_pe] + L_offset,
-                          const_cast<void *>(source), nelems);
-    threadfence_system();
-    ipcImpl_.zero_byte_read(pe);
-  } else {
-    if (is_thread_zero_in_wave()) {
-      qp->put_nbi_cqe<WAVE>(base_heap[pe] + L_offset, source, nelems, pe, true);
-    }
-    qp->quiet_single<WAVE>();
-  }
-  fence_.flush();
-}
-
-__device__ void GPUIBContext::getmem_wg(void *dest, const void *source,
-                                        size_t nelems, int pe) {
-  const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
-  auto *qp = getQueuePair(pe);
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
-  } else {
-    if (is_thread_zero_in_block()) {
-      qp->get_nbi_cqe<WG>(base_heap[pe] + L_offset, dest, nelems, pe, true);
-    }
-    qp->quiet_single<WG>();
-  }
-  __syncthreads();
-  fence_.flush();
-}
-
-__device__ void GPUIBContext::getmem_wave(void *dest, const void *source,
-                                          size_t nelems, int pe) {
-  const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
-  auto *qp = getQueuePair(pe);
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset,
-                          nelems);
-  } else {
-    if (is_thread_zero_in_wave()) {
-      qp->get_nbi_cqe<WAVE>(base_heap[pe] + L_offset, dest, nelems, pe, true);
-    }
-    qp->quiet_single<WAVE>();
-  }
-  fence_.flush();
-}
-
-__device__ void GPUIBContext::getmem_nbi_wg(void *dest, const void *source,
-                                            size_t nelems, int pe) {
-  const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wg(dest, ipcImpl_.ipc_bases[local_pe] + L_offset, nelems);
-  } else {
-    if (is_thread_zero_in_block()) {
-      auto *qp = getQueuePair(pe);
-      qp->get_nbi<WG>(base_heap[pe] + L_offset, dest, nelems, pe, true);
-    }
-  }
-  __syncthreads();
-}
-
-__device__ void GPUIBContext::getmem_nbi_wave(void *dest, const void *source,
-                                              size_t nelems, int pe) {
-  const char *src_typed = reinterpret_cast<const char *>(source);
-  uint64_t L_offset = const_cast<char *>(src_typed) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    int local_pe = pe % ipcImpl_.shm_size;
-    ipcImpl_.ipcCopy_wave(dest, ipcImpl_.ipc_bases[local_pe] + L_offset,
-                          nelems);
-  } else {
-    if (is_thread_zero_in_wave()) {
-      auto *qp = getQueuePair(pe);
-      qp->get_nbi<WAVE>(base_heap[pe] + L_offset, dest, nelems, pe, true);
-    }
-  }
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/context_ib_device.hpp b/src/gpu_ib/context_ib_device.hpp
deleted file mode 100644
index 9748b1dac3..0000000000
--- a/src/gpu_ib/context_ib_device.hpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
-#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
-
-#include "../context.hpp"
-#include "memory_builder_policy.hpp"
-#include "network_policy.hpp"
-
-namespace rocshmem {
-
-class QueuePair;
-
-class GPUIBContext : public Context {
- public:
-  __host__ GPUIBContext(Backend *b, bool option, int idx);
-
-  __device__ __host__ QueuePair *getQueuePair(int pe);
-
-  __device__ __host__ int getNumQueuePairs();
-
-  __device__ __host__ int getNumDest();
-
-  __device__ __attribute__((noinline)) void threadfence_system();
-
-  __device__ void ctx_create();
-
-  __device__ void ctx_destroy();
-
-  __device__ void putmem(void *dest, const void *source, size_t nelems, int pe);
-
-  __device__ void getmem(void *dest, const void *source, size_t nelems, int pe);
-
-  __device__ void putmem_nbi(void *dest, const void *source, size_t nelems,
-                             int pe);
-
-  __device__ void getmem_nbi(void *dest, const void *source, size_t size,
-                             int pe);
-
-  __device__ void fence();
-
-  __device__ void fence(int pe);
-
-  __device__ void quiet();
-
-  __device__ void *shmem_ptr(const void *dest, int pe);
-
-  __device__ void barrier_all();
-
-  __device__ void sync_all();
-
-  __device__ void sync(rocshmem_team_t team);
-
-  template <typename T>
-  __device__ void amo_add(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ void amo_set(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ T amo_swap(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ T amo_fetch_and(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ void amo_and(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ T amo_fetch_or(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ void amo_or(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ T amo_fetch_xor(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ void amo_xor(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ void amo_cas(void *dst, T value, T cond, int pe);
-
-  template <typename T>
-  __device__ T amo_fetch_add(void *dst, T value, int pe);
-
-  template <typename T>
-  __device__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
-
-  template <typename T>
-  __device__ void p(T *dest, T value, int pe);
-
-  template <typename T>
-  __device__ T g(const T *source, int pe);
-
-  template <typename T, ROCSHMEM_OP Op>
-  __device__ void to_all(T *dest, const T *source, int nreduce, int PE_start,
-                         int logPE_stride, int PE_size, T *pWrk,
-                         long *pSync);  // NOLINT(runtime/int)
-
-  template <typename T, ROCSHMEM_OP Op>
-  __device__ void to_all(rocshmem_team_t team, T *dest, const T *source,
-                         int nreduce);
-
-  template <typename T>
-  __device__ void put(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void get(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
-                            int nelems, int pe_root);
-
-  template <typename T>
-  __device__ void broadcast(T *dest, const T *source, int nelems, int pe_root,
-                            int pe_start, int log_pe_stride, int pe_size,
-                            long *p_sync);  // NOLINT(runtime/int)
-
-  template <typename T>
-  __device__ void alltoall(rocshmem_team_t team, T *dest, const T *source,
-                           int nelems);
-
-  template <typename T>
-  __device__ void alltoall_broadcast(rocshmem_team_t team, T *dest,
-                                     const T *source, int nelems);
-
-  template <typename T>
-  __device__ void alltoall_brucks(rocshmem_team_t team, T *dest,
-                                  const T *source, int nelems);
-
-  template <typename T>
-  __device__ void alltoall_gcen(rocshmem_team_t team, T *dest, const T *source,
-                                int nelems);
-
-  template <typename T>
-  __device__ void alltoall_gcen2(rocshmem_team_t team, T *dest,
-                                 const T *source, int nelems);
-
-  template <typename T>
-  __device__ void fcollect(rocshmem_team_t team, T *dest, const T *source,
-                           int nelems);
-
-  template <typename T>
-  __device__ void fcollect_broadcast(rocshmem_team_t team, T *dest,
-                                     const T *source, int nelems);
-
-  template <typename T>
-  __device__ void fcollect_brucks(rocshmem_team_t team, T *dest,
-                                  const T *source, int nelems);
-
-  template <typename T>
-  __device__ void fcollect_gcen(rocshmem_team_t team, T *dest, const T *source,
-                                int nelems);
-
-  template <typename T>
-  __device__ void fcollect_gcen2(rocshmem_team_t team, T *dest,
-                                 const T *source, int nelems);
-
-  __device__ void putmem_wg(void *dest, const void *source, size_t nelems,
-                            int pe);
-
-  __device__ void getmem_wg(void *dest, const void *source, size_t nelems,
-                            int pe);
-
-  __device__ void putmem_nbi_wg(void *dest, const void *source, size_t nelems,
-                                int pe);
-
-  __device__ void getmem_nbi_wg(void *dest, const void *source, size_t size,
-                                int pe);
-
-  __device__ void putmem_wave(void *dest, const void *source, size_t nelems,
-                              int pe);
-
-  __device__ void getmem_wave(void *dest, const void *source, size_t nelems,
-                              int pe);
-
-  __device__ void putmem_nbi_wave(void *dest, const void *source, size_t nelems,
-                                  int pe);
-
-  __device__ void getmem_nbi_wave(void *dest, const void *source, size_t size,
-                                  int pe);
-
-  template <typename T>
-  __device__ void put_wg(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void put_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void get_wg(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void get_nbi_wg(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void put_wave(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void put_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void get_wave(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe);
-
- private:
-  template <typename T, ROCSHMEM_OP Op>
-  __device__ void internal_direct_allreduce(
-      T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
-      int PE_size, T *pWrk,
-      long *pSync);  // NOLINT(runtime/int)
-
-  template <typename T, ROCSHMEM_OP Op>
-  __device__ void internal_ring_allreduce(T *dst, const T *src, int nelems,
-                                          int PE_start, int logPE_stride,
-                                          int PE_size, T *pWrk,
-                                          long *pSync,  // NOLINT(runtime/int)
-                                          int n_seg, int seg_size,
-                                          int chunk_size);
-
-  template <typename T>
-  __device__ void internal_put_broadcast(T *dst, const T *src, int nelems,
-                                         int pe_root, int PE_start,
-                                         int logPE_stride, int PE_size,
-                                         long *pSync);  // NOLINT(runtime/int)
-
-  template <typename T>
-  __device__ void internal_get_broadcast(T *dst, const T *src, int nelems,
-                                         int pe_root,
-                                         long *pSync);  // NOLINT(runtime/int)
-
-  __device__ void internal_direct_barrier(int pe, int PE_start, int stride,
-                                          int n_pes, int64_t *pSync);
-
-  __device__ void internal_atomic_barrier(int pe, int PE_start, int stride,
-                                          int n_pes, int64_t *pSync);
-
-  __device__ void internal_sync(int pe, int PE_start, int stride, int PE_size,
-                                int64_t *pSync);
-
-  __device__ void quiet_single(int cq_num);
-
- public:
-  /*
-   * Collection of queue pairs that are currently checked out by this
-   * context from GPUIBBackend.
-   */
-  // FIXME: keep it private and destroy in destructor for better
-  // encapsulation.
-  QueuePair *device_qp_proxy{nullptr};
-
-  /*
-   * Array of char * pointers corresponding to the heap base pointers VA for
-   * each PE that we can communicate with.
-   */
-  char *const *base_heap{nullptr};
-
-  /*
-   * Buffer used to store the results of a *_g operation. These ops do not
-   * provide a destination buffer, so the runtime must manage one.
-   */
-  char *g_ret{nullptr};
-
-  NetworkImpl networkImpl{};
-
-  /*
-   * Temporary scratchpad memory used by internal barrier algorithms.
-   */
-  int64_t *barrier_sync{nullptr};
-
-  int ctx_idx;
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
diff --git a/src/gpu_ib/context_ib_device_coll.cpp b/src/gpu_ib/context_ib_device_coll.cpp
deleted file mode 100644
index 61f1e97ca9..0000000000
--- a/src/gpu_ib/context_ib_device_coll.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "rocshmem/rocshmem.hpp"
-#include "../context_incl.hpp"
-#include "context_ib_tmpl_device.hpp"
-#include "../util.hpp"
-
-namespace rocshmem {
-
-__device__ void GPUIBContext::internal_direct_barrier(int pe, int PE_start,
-                                                      int stride, int n_pes,
-                                                      int64_t *pSync) {
-  int64_t flag_val = 1;
-  if (pe == PE_start) {
-    // Go through all PE offsets (except current offset = 0)
-    // and wait until they all reach
-    for (size_t i = 1; i < n_pes; i++) {
-      wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
-      pSync[i] = ROCSHMEM_SYNC_VALUE;
-    }
-    threadfence_system();
-    // Announce to other PEs that all have reached
-    for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
-      put_nbi(&pSync[0], &flag_val, 1, j);
-    }
-
-  } else {
-    // Mark current PE offset as reached
-    size_t pe_offset = (pe - PE_start) / stride;
-    put_nbi(&pSync[pe_offset], &flag_val, 1, PE_start);
-    wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
-    pSync[0] = ROCSHMEM_SYNC_VALUE;
-    threadfence_system();
-  }
-}
-
-__device__ void GPUIBContext::internal_atomic_barrier(int pe, int PE_start,
-                                                      int stride, int n_pes,
-                                                      int64_t *pSync) {
-  int64_t flag_val = 1;
-  if (pe == PE_start) {
-    wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(n_pes - 1));
-    pSync[0] = ROCSHMEM_SYNC_VALUE;
-    threadfence_system();
-    for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
-      put_nbi(&pSync[0], &flag_val, 1, j);
-    }
-  } else {
-    amo_add<int64_t>(&pSync[0], flag_val, PE_start);
-    wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
-    pSync[0] = ROCSHMEM_SYNC_VALUE;
-    threadfence_system();
-  }
-}
-
-// Uses PE values that are relative to world
-__device__ void GPUIBContext::internal_sync(int pe, int PE_start, int stride,
-                                            int PE_size, int64_t *pSync) {
-  __syncthreads();
-  if (is_thread_zero_in_block()) {
-    if (PE_size < 64) {
-      internal_direct_barrier(pe, PE_start, stride, PE_size, pSync);
-    } else {
-      internal_atomic_barrier(pe, PE_start, stride, PE_size, pSync);
-    }
-  }
-  __threadfence();
-  __syncthreads();
-}
-
-__device__ void GPUIBContext::sync(rocshmem_team_t team) {
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO: enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-
-  int pe = team_obj->my_pe_in_world;
-  int pe_start = team_obj->tinfo_wrt_world->pe_start;
-  int pe_stride = (1 << log_pe_stride);
-  int pe_size = team_obj->num_pes;
-  internal_sync(pe, pe_start, pe_stride, pe_size, barrier_sync);
-}
-
-__device__ void GPUIBContext::sync_all() {
-  internal_sync(my_pe, 0, 1, num_pes, barrier_sync);
-}
-
-__device__ void GPUIBContext::barrier_all() {
-  if (is_thread_zero_in_block()) {
-    quiet();
-  }
-  sync_all();
-  __syncthreads();
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/context_ib_host.cpp b/src/gpu_ib/context_ib_host.cpp
deleted file mode 100644
index e569d7e6ef..0000000000
--- a/src/gpu_ib/context_ib_host.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "context_ib_host.hpp"
-
-#include <mpi.h>
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../backend_type.hpp"
-#include "../context_incl.hpp"
-#include "backend_ib.hpp"
-#include "../host/host.hpp"
-
-namespace rocshmem {
-
-__host__ GPUIBHostContext::GPUIBHostContext(Backend *backend,
-                                            [[maybe_unused]] int64_t options)
-    : Context(backend, true) {
-  GPUIBBackend *b{static_cast<GPUIBBackend *>(backend)};
-
-  host_interface = b->host_interface;
-
-  context_window_info = host_interface->acquire_window_context();
-}
-
-__host__ GPUIBHostContext::~GPUIBHostContext() {
-  host_interface->release_window_context(context_window_info);
-}
-
-__host__ void GPUIBHostContext::putmem_nbi(void *dest, const void *source,
-                                           size_t nelems, int pe) {
-  host_interface->putmem_nbi(dest, source, nelems, pe, context_window_info);
-}
-
-__host__ void GPUIBHostContext::getmem_nbi(void *dest, const void *source,
-                                           size_t nelems, int pe) {
-  host_interface->getmem_nbi(dest, source, nelems, pe, context_window_info);
-}
-
-__host__ void GPUIBHostContext::putmem(void *dest, const void *source,
-                                       size_t nelems, int pe) {
-  host_interface->putmem(dest, source, nelems, pe, context_window_info);
-}
-
-__host__ void GPUIBHostContext::getmem(void *dest, const void *source,
-                                       size_t nelems, int pe) {
-  host_interface->getmem(dest, source, nelems, pe, context_window_info);
-}
-
-__host__ void GPUIBHostContext::fence() {
-  host_interface->fence(context_window_info);
-}
-
-__host__ void GPUIBHostContext::quiet() {
-  host_interface->quiet(context_window_info);
-}
-
-__host__ void GPUIBHostContext::sync_all() {
-  host_interface->sync_all(context_window_info);
-}
-
-__host__ void GPUIBHostContext::barrier_all() {
-  host_interface->barrier_all(context_window_info);
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/context_ib_host.hpp b/src/gpu_ib/context_ib_host.hpp
deleted file mode 100644
index a38f18ce01..0000000000
--- a/src/gpu_ib/context_ib_host.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
-#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
-
-#include "../context.hpp"
-
-namespace rocshmem {
-
-class GPUIBHostContext : public Context {
- public:
-  __host__ GPUIBHostContext(Backend *b, int64_t options);
-
-  __host__ ~GPUIBHostContext();
-
-  template <typename T>
-  __host__ void p(T *dest, T value, int pe);
-
-  template <typename T>
-  __host__ T g(const T *source, int pe);
-
-  template <typename T>
-  __host__ void put(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __host__ void get(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __host__ void put_nbi(T *dest, const T *source, size_t nelems, int pe);
-
-  template <typename T>
-  __host__ void get_nbi(T *dest, const T *source, size_t nelems, int pe);
-
-  __host__ void putmem(void *dest, const void *source, size_t nelems, int pe);
-
-  __host__ void getmem(void *dest, const void *source, size_t nelems, int pe);
-
-  __host__ void putmem_nbi(void *dest, const void *source, size_t nelems,
-                           int pe);
-
-  __host__ void getmem_nbi(void *dest, const void *source, size_t size, int pe);
-
-  template <typename T>
-  __host__ void amo_add(void *dst, T value, int pe);
-
-  template <typename T>
-  __host__ void amo_cas(void *dst, T value, T cond, int pe);
-
-  template <typename T>
-  __host__ T amo_fetch_add(void *dst, T value, int pe);
-
-  template <typename T>
-  __host__ T amo_fetch_cas(void *dst, T value, T cond, int pe);
-
-  __host__ void fence();
-
-  __host__ void quiet();
-
-  __host__ void barrier_all();
-
-  __host__ void sync_all();
-
-  template <typename T>
-  __host__ void broadcast(T *dest, const T *source, int nelems, int pe_root,
-                          int pe_start, int log_pe_stride, int pe_size,
-                          long *p_sync);  // NOLINT(runtime/int)
-
-  template <typename T>
-  __host__ void broadcast(rocshmem_team_t team, T *dest, const T *source,
-                          int nelems, int pe_root);
-
-  template <typename T, ROCSHMEM_OP Op>
-  __host__ void to_all(T *dest, const T *source, int nreduce, int pe_start,
-                       int log_pe_stride, int pe_size, T *p_wrk,
-                       long *p_sync);  // NOLINT(runtime/int)
-
-  template <typename T, ROCSHMEM_OP Op>
-  __host__ void to_all(rocshmem_team_t team, T *dest, const T *source,
-                       int nreduce);
-
-  template <typename T>
-  __host__ void wait_until(T *ivars, int cmp, T val);
-
-  template <typename T>
-  __host__ size_t wait_until_any(T *ivars, size_t nelems,
-                                 const int *status,
-                                 int cmp, T val);
-
-  template <typename T>
-  __host__ void wait_until_all(T *ivars, size_t nelems,
-                               const int *status,
-                               int cmp, T val);
-
-  template <typename T>
-  __host__ size_t wait_until_some(T *ivars, size_t nelems,
-                                size_t* indices,
-                                const int *status,
-                                int cmp, T val);
-
-  template <typename T>
-  __host__ void wait_until_all_vector(T *ivars, size_t nelems,
-                                      const int *status,
-                                      int cmp, T* vals);
-
-  template <typename T>
-  __host__ size_t wait_until_any_vector(T *ivars, size_t nelems,
-                                        const int *status,
-                                        int cmp, T* vals);
-
-  template <typename T>
-  __host__ size_t wait_until_some_vector(T *ivars, size_t nelems,
-                                         size_t* indices,
-                                         const int *status,
-                                         int cmp, T* vals);
-
-  template <typename T>
-  __host__ int test(T *ivars, int cmp, T val);
-
- public:
-  /* Pointer to the backend's host interface */
-  HostInterface *host_interface{nullptr};
-
-  /* An MPI Window implements a context */
-  WindowInfo *context_window_info{nullptr};
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_CONTEXT_IB_HOST_HPP_
diff --git a/src/gpu_ib/context_ib_tmpl_device.hpp b/src/gpu_ib/context_ib_tmpl_device.hpp
deleted file mode 100644
index b29523c804..0000000000
--- a/src/gpu_ib/context_ib_tmpl_device.hpp
+++ /dev/null
@@ -1,1097 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
-#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "rocshmem/rocshmem.hpp"
-#include "context_ib_device.hpp"
-#include "gpu_ib_team.hpp"
-#include "queue_pair.hpp"
-#include "../util.hpp"
-#include "../rocshmem_calc.hpp"
-
-namespace rocshmem {
-
-template <typename T, ROCSHMEM_OP Op>
-__device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
-                               int wg_size) {
-  for (size_t i = wg_id; i < size; i += wg_size) {
-    OpWrap<Op>::Calc(src, dst, i);
-  }
-  __syncthreads();
-}
-
-template <typename T>
-__device__ void GPUIBContext::p(T *dest, T value, int pe) {
-  putmem_nbi(dest, &value, sizeof(T), pe);
-}
-
-template <typename T, ROCSHMEM_OP Op>
-__device__ void GPUIBContext::internal_ring_allreduce(
-    T *dst, const T *src, int nelems, [[maybe_unused]] int PE_start,
-    [[maybe_unused]] int logPE_stride, [[maybe_unused]] int PE_size, T *pWrk,
-    long *pSync,  // NOLINT(runtime/int)
-    int n_seg, int seg_size, int chunk_size) {
-  int off_seg, off_send, off_recv;
-  int send_pe = (my_pe + 1) % num_pes;
-  long wait_val;  // NOLINT(runtime/int)
-
-  int wg_size = get_flat_block_size();
-  int wg_id = get_flat_block_id();
-
-  for (size_t i = wg_id; i < nelems; i += wg_size) {
-    dst[i] = src[i];
-  }
-  __syncthreads();
-
-  for (size_t seg = 0; seg < n_seg; seg++) {
-    off_seg = seg * seg_size;
-    for (int round = 0; round < num_pes - 1; round++) {
-      off_send = (((my_pe + 1 - round + 2 * num_pes) % num_pes) * chunk_size);
-      off_recv = (((my_pe - round + 2 * num_pes) % num_pes) * chunk_size);
-
-      putmem_nbi_wg(reinterpret_cast<void *>(&pWrk[off_send]),
-                    reinterpret_cast<void *>(&dst[off_send + off_seg]),
-                    chunk_size * sizeof(T), send_pe);
-
-      if (is_thread_zero_in_block()) {
-        fence();
-
-        wait_val = seg + 100;
-        p(&pSync[round], wait_val, send_pe);
-
-        wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val);
-        __threadfence();
-      }
-      __syncthreads();
-      compute_reduce<T, Op>(&pWrk[off_recv], &dst[off_seg + off_recv],
-                            chunk_size, wg_id, wg_size);
-    }
-    for (size_t round = num_pes - 1; round < 2 * num_pes - 2; round++) {
-      int off_send2 =
-          (((my_pe + 1 - round + 2 * num_pes) % num_pes) * chunk_size);
-      putmem_nbi_wg(reinterpret_cast<void *>(&dst[off_send2 + off_seg]),
-                    reinterpret_cast<void *>(&dst[off_send2 + off_seg]),
-                    chunk_size * sizeof(T), send_pe);
-
-      if (is_thread_zero_in_block()) {
-        fence();
-        wait_val = seg + 100;
-        p(&pSync[round], wait_val, send_pe);
-        wait_until(&pSync[round], ROCSHMEM_CMP_EQ, wait_val);
-      }
-      __syncthreads();
-    }
-  }
-  __syncthreads();
-  for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
-    pSync[i] = ROCSHMEM_SYNC_VALUE;
-  }
-  __syncthreads();
-}
-
-template <typename T, ROCSHMEM_OP Op>
-__device__ void GPUIBContext::internal_direct_allreduce(
-    T *dst, const T *src, int nelems, int PE_start, int logPE_stride,
-    int PE_size, T *pWrk,
-    long *pSync) {  // NOLINT(runtime/int)
-  int stride = 1 << logPE_stride;
-  int finish = PE_start + stride * PE_size;
-  int pe = my_pe;
-
-  int wg_id = get_flat_block_id();
-  int wg_size = get_flat_block_size();
-
-  for (int i = wg_id; i < nelems; i += wg_size) {
-    dst[i] = src[i];
-  }
-  __syncthreads();
-
-  for (int i = PE_start; i < finish; i += stride) {
-    if (i != pe) {
-      putmem_nbi_wg(&pWrk[pe * nelems], reinterpret_cast<const void *>(src),
-                    nelems * sizeof(T), i);
-
-      if (is_thread_zero_in_block()) {
-        fence();
-        p(&pSync[pe], 1L, i);
-      }
-      __syncthreads();
-    }
-  }
-
-  // Do the compute and pSync reset in parallel.
-
-  for (int i = PE_start; i < finish; i += stride) {
-    if (i != pe) {
-      // Wait for leader thread to see that the buffer is ready.
-      if (is_thread_zero_in_block()) {
-        wait_until(&pSync[i], ROCSHMEM_CMP_EQ, 1L);
-      }
-      __syncthreads();
-
-      T *ptr = &pWrk[i * nelems];
-      compute_reduce<T, Op>(ptr, dst, nelems, wg_id, wg_size);
-    }
-  }
-
-  __syncthreads();
-
-  for (int i = wg_id; i < num_pes; i += wg_size) {
-    pSync[i] = ROCSHMEM_SYNC_VALUE;
-  }
-
-  __syncthreads();
-}
-
-template <typename T, ROCSHMEM_OP Op>
-__device__ void GPUIBContext::to_all(rocshmem_team_t team, T *dest,
-                                     const T *source, int nreduce) {
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-power-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-
-  int pe_start = team_obj->tinfo_wrt_world->pe_start;
-  int pe_size = team_obj->tinfo_wrt_world->size;
-
-  long *p_sync = team_obj->reduce_pSync;
-  T *pWrk = reinterpret_cast<T *>(team_obj->pWrk);
-
-  to_all<T, Op>(dest, source, nreduce, pe_start, log_pe_stride, pe_size, pWrk,
-                p_sync);
-}
-
-template <typename T, ROCSHMEM_OP Op>
-__device__ void GPUIBContext::to_all(T *dest, const T *source, int nreduce,
-                                     int PE_start, int logPE_stride,
-                                     int PE_size, T *pWrk,
-                                     long *pSync) {  // NOLINT(runtime/int)
-  size_t direct_pWrk = num_pes * nreduce;
-  size_t direct_pSync = num_pes;
-
-  size_t ring_pSync = 2 * num_pes;
-
-  size_t provided_pWrk =
-      max(nreduce / 2 + 1, ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
-  size_t provided_pSync = ROCSHMEM_REDUCE_SYNC_SIZE;
-
-  // TODO(bpotter):
-  // We basically do a direct reduce if pWrk is big enough, else we
-  // give up. In the future we will want to design algorithms to work
-  // with nreduce/2 + 1 space, which would cover every case per the
-  // standard.
-  if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) {
-    internal_direct_allreduce<T, Op>(dest, source, nreduce, PE_start,
-                                     logPE_stride, PE_size, pWrk, pSync);
-  } else {
-    if (ring_pSync <= ROCSHMEM_REDUCE_SYNC_SIZE) {
-      int chunk_size = 1024;
-      size_t ring_pWrk = chunk_size * num_pes;
-      if (provided_pWrk < ring_pWrk) {
-        ring_pWrk = max(nreduce / 2,  // NOLINT
-                        ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE);
-        chunk_size = ring_pWrk / num_pes;
-      }
-      int seg_size = ring_pWrk;
-      int n_seg = nreduce / seg_size;
-      if (n_seg == 0) {
-        n_seg = 1;
-        seg_size = nreduce;
-        chunk_size = seg_size / num_pes;
-      }
-      internal_ring_allreduce<T, Op>(dest, source, nreduce, PE_start,
-                                     logPE_stride, PE_size, pWrk, pSync, n_seg,
-                                     seg_size, chunk_size);
-    } else {
-      GPU_DPRINTF("Unsupported reduction size for gpu_ib.\n");
-    }
-  }
-}
-
-template <typename T>
-__device__ void GPUIBContext::put(T *dest, const T *source, size_t nelems,
-                                  int pe) {
-  putmem(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ T GPUIBContext::g(const T *source, int pe) {
-  T ret;
-  auto *src_const_cast = reinterpret_cast<const char *>(source);
-  uint64_t L_offset = const_cast<char *>(src_const_cast) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    ipcImpl_.ipcCopy(&ret, ipcImpl_.ipc_bases[pe] + L_offset, sizeof(T));
-    return ret;
-  } else {
-    int thread_id = get_flat_block_id();
-    int block_size = get_flat_block_size();
-    int offset = ctx_idx * block_size + thread_id;
-
-    char *base_dest = g_ret;
-    char *dest = &base_dest[offset * sizeof(int64_t)];
-    size_t nelems = sizeof(T);
-
-    bool must_send_message = wf_coal_.coalesce(pe, source, dest, &nelems);
-    if (!must_send_message) {
-      return ret;
-    }
-    getQueuePair(pe)->get_nbi<THREAD>(base_heap[pe] + L_offset, dest, nelems,
-                                      pe, true);
-    getQueuePair(pe)->quiet_single<THREAD>();
-    getQueuePair(my_pe)->hdp_policy->hdp_flush();
-    __threadfence();
-    ret = *(reinterpret_cast<T *>(dest));
-    return ret;
-  }
-  return ret;
-}
-
-template <typename T>
-__device__ void GPUIBContext::put_nbi(T *dest, const T *source, size_t nelems,
-                                      int pe) {
-  putmem_nbi(dest, source, sizeof(T) * nelems, pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::get(T *dest, const T *source, size_t nelems,
-                                  int pe) {
-  getmem(dest, source, sizeof(T) * nelems, pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::get_nbi(T *dest, const T *source, size_t nelems,
-                                      int pe) {
-  getmem_nbi(dest, source, sizeof(T) * nelems, pe);
-}
-
-template <typename T>
-__device__ T GPUIBContext::amo_fetch_add(void *dst, T value, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    return ipcImpl_.ipcAMOFetchAdd(
-        reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
-  } else {
-    auto *qp = getQueuePair(pe);
-    return qp->atomic_fetch(base_heap[pe] + L_offset, value, 0, pe, true,
-                            MLX5_OPCODE_ATOMIC_FA);
-  }
-}
-
-template <typename T>
-__device__ T GPUIBContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    return ipcImpl_.ipcAMOFetchCas(
-        reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), cond, value);
-  } else {
-    auto *qp = getQueuePair(pe);
-    return qp->atomic_fetch(base_heap[pe] + L_offset, value, cond, pe, true,
-                            MLX5_OPCODE_ATOMIC_CS);
-  }
-}
-
-template <typename T>
-__device__ void GPUIBContext::amo_add(void *dst, T value, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    ipcImpl_.ipcAMOAdd(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset),
-                       value);
-  } else {
-    auto *qp = getQueuePair(pe);
-    qp->atomic_nofetch(base_heap[pe] + L_offset, value, 0, pe, true,
-                       MLX5_OPCODE_ATOMIC_FA);
-  }
-}
-
-template <typename T>
-__device__ void GPUIBContext::amo_set(void *dst, T value, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
-
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    ipcImpl_.ipcAMOSet(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset),
-                       value);
-  } else {
-    auto *qp = getQueuePair(pe);
-
-    // Guess that the remote memory is zero by setting condition to zero.
-    // The compare-and-swap loop will execute at least twice if wrong.
-    // It may run additional times if contention on memory location.
-    T ret_val;
-    T cond = 0;
-    while ((ret_val = qp->atomic_fetch(base_heap[pe] + L_offset, value, cond,
-                                       pe, true, MLX5_OPCODE_ATOMIC_CS))) {
-      if (ret_val == cond) {
-        break;
-      }
-      cond = ret_val;
-    }
-  }
-}
-
-template <typename T>
-__device__ T GPUIBContext::amo_swap(void *dst, T value, int pe) {
-  assert(false);
-  return 0;
-}
-
-template <typename T>
-__device__ T GPUIBContext::amo_fetch_and(void *dst, T value, int pe) {
-  assert(false);
-  return 0;
-}
-
-template <typename T>
-__device__ void GPUIBContext::amo_and(void *dst, T value, int pe) {
-  assert(false);
-}
-
-template <typename T>
-__device__ T GPUIBContext::amo_fetch_or(void *dst, T value, int pe) {
-  assert(false);
-  return 0;
-}
-
-template <typename T>
-__device__ void GPUIBContext::amo_or(void *dst, T value, int pe) {
-  assert(false);
-}
-
-template <typename T>
-__device__ T GPUIBContext::amo_fetch_xor(void *dst, T value, int pe) {
-  assert(false);
-  return 0;
-}
-
-template <typename T>
-__device__ void GPUIBContext::amo_xor(void *dst, T value, int pe) {
-  assert(false);
-}
-
-template <typename T>
-__device__ void GPUIBContext::amo_cas(void *dst, T value, T cond, int pe) {
-  uint64_t L_offset = reinterpret_cast<char *>(dst) - base_heap[my_pe];
-  if (ipcImpl_.isIpcAvailable(my_pe, pe)) {
-    ipcImpl_.ipcAMOCas(reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset),
-                       cond, value);
-  } else {
-    auto *qp = getQueuePair(pe);
-    qp->atomic_nofetch(base_heap[pe] + L_offset, value, cond, pe, true,
-                       MLX5_OPCODE_ATOMIC_CS);
-  }
-}
-
-template <typename T>
-__device__ void GPUIBContext::internal_put_broadcast(
-    T *dst, const T *src, int nelems, int pe_root, int pe_start,
-    int log_pe_stride, int pe_size,
-    [[maybe_unused]] long *p_sync) {  // NOLINT(runtime/int)
-  if (my_pe == pe_root) {
-    int stride = 1 << log_pe_stride;
-    int finish = pe_start + stride * pe_size;
-    for (int i = pe_start; i < finish; i += stride) {
-      if (i != my_pe) {
-        put_nbi_wg(dst, src, nelems, i);
-      }
-    }
-  }
-}
-
-template <typename T>
-__device__ void GPUIBContext::internal_get_broadcast(
-    T *dst, const T *src, int nelems, int pe_root,
-    [[maybe_unused]] long *pSync) {  // NOLINT(runtime/int)
-  if (my_pe != pe_root) {
-    get_wg(dst, src, nelems, pe_root);
-  }
-}
-
-template <typename T>
-__device__ void GPUIBContext::broadcast(rocshmem_team_t team, T *dst,
-                                        const T *src, int nelems, int pe_root) {
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-
-  int pe_start = team_obj->tinfo_wrt_world->pe_start;
-  int pe_size = team_obj->tinfo_wrt_world->size;
-
-  long *p_sync = team_obj->bcast_pSync;
-
-  // Passed pe_root is relative to team, convert to world root
-  int pe_root_world = team_obj->get_pe_in_world(pe_root);
-
-  broadcast<T>(dst, src, nelems, pe_root_world, pe_start, log_pe_stride,
-               pe_size, p_sync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::broadcast(T *dst, const T *src, int nelems,
-                                        int pe_root, int pe_start,
-                                        int log_pe_stride, int pe_size,
-                                        long *p_sync) {  // NOLINT(runtime/int)
-  if (num_pes < 4) {
-    internal_put_broadcast(dst, src, nelems, pe_root, pe_start, log_pe_stride,
-                           pe_size, p_sync);
-  } else {
-    internal_get_broadcast(dst, src, nelems, pe_root, p_sync);
-  }
-  // Synchronize on completion of broadcast
-  internal_sync(my_pe, pe_start, (1 << log_pe_stride), pe_size, p_sync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::alltoall(rocshmem_team_t team, T *dst,
-                                       const T *src, int nelems) {
-  // Currently broadcast implementation performs the best
-  alltoall_broadcast(team, dst, src, nelems);
-}
-
-template <typename T>
-__device__ void GPUIBContext::alltoall_broadcast(rocshmem_team_t team, T *dst,
-                                                 const T *src, int nelems) {
-  // Broadcast implementation of alltoall collective
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_start = team_obj->tinfo_wrt_world->pe_start;
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  int my_pe_in_team = team_obj->my_pe;
-  // Have each PE put their designated data to the other PEs
-  for (int j = 0; j < pe_size; j++) {
-    int dest_pe = team_obj->get_pe_in_world(j);
-    put_nbi_wg(&dst[my_pe_in_team * nelems], &src[j * nelems], nelems, dest_pe);
-  }
-  if (is_thread_zero_in_block()) {
-    quiet();
-  }
-  // wait until everyone has obtained their designated data
-  internal_sync(my_pe, pe_start, stride, pe_size, pSync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::alltoall_brucks(rocshmem_team_t team, T *dst,
-                                              const T *src, int nelems) {
-  // Brucks implementation of alltoall collective
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_start = team_obj->tinfo_wrt_world->pe_start;
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  int my_pe_in_team = team_obj->my_pe;
-  int tid = get_flat_block_id();
-  int blk_size = get_flat_block_size();
-
-  // Check if we have enough buffer space. If not, fail.
-  if (pe_size * nelems * 2 > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
-    GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
-    assert(false);
-  }
-
-  T *pAta1 = reinterpret_cast<T *>(team_obj->pAta);
-  T *pAta2 = &pAta1[pe_size * nelems];
-
-  // Phase 1: Shift all data by (pe_size * nelems) elements
-  for (size_t i = tid; i < pe_size * nelems; i += blk_size) {
-    size_t index = (i + my_pe_in_team * nelems) % (pe_size * nelems);
-    pAta1[i] = src[index];
-  }
-  __syncthreads();
-
-  // Phase 2: Perform packing and data transfers
-  for (int64_t shift = 0; ((int64_t)1 << shift) < pe_size; shift++) {
-    int64_t shift_decimal = ((int64_t)1 << shift);
-    // Step 1: Pack data to be sent
-    for (int64_t i = tid; i < pe_size * nelems; i += blk_size) {
-      int64_t pos = i / nelems;
-      int64_t offset = i % nelems;
-      // If bit is set in index, insert in data to be sent
-      if ((pos >> shift) & 1) {
-        int64_t index =
-            ((pos >> (shift + 1)) << shift) + (pos & (shift_decimal - 1));
-        pAta2[index * nelems + offset] = pAta1[i];
-      }
-    }
-    threadfence_system();
-    __syncthreads();
-
-    // Calculate how much data to be sent
-    int64_t region_size = shift_decimal * 2;
-    int64_t data_size = nelems * (pe_size / region_size * shift_decimal);
-    if (pe_size % region_size > shift_decimal)
-      data_size += pe_size % region_size - shift_decimal;
-
-    // Step 2: Send data
-    int dest_pe =
-        team_obj->get_pe_in_world((my_pe_in_team + shift_decimal) % pe_size);
-    put_wg(dst, pAta2, data_size, dest_pe);
-    if (is_thread_zero_in_block()) {
-      quiet();
-    }
-    threadfence_system();
-    // Need to synchronize with both receiver and sender. So just sync all.
-    internal_sync(my_pe, pe_start, stride, pe_size, pSync);
-    // Step 3: Unpack received data
-    for (int i = tid; i < pe_size * nelems; i += blk_size) {
-      int64_t pos = i / nelems;
-      int64_t offset = i % nelems;
-      // If bit is set in index, insert in data to be sent
-      if ((pos >> shift) & 1) {
-        int64_t index =
-            ((pos >> (shift + 1)) << shift) + (pos & (shift_decimal - 1));
-        pAta1[i] = dst[index * nelems + offset];
-      }
-    }
-    threadfence_system();
-    __syncthreads();
-  }
-
-  // Phase 3: Inverse rotation, shift data by (pe_size * nelems) elements
-  for (size_t i = tid; i < pe_size * nelems; i += blk_size) {
-    size_t offset = i % nelems;
-    size_t index = ((pe_size + my_pe_in_team - i / nelems) % pe_size) * nelems;
-    dst[index + offset] = pAta1[i];
-  }
-
-  // wait until everyone has sent the data
-  internal_sync(my_pe, pe_start, stride, pe_size, pSync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::alltoall_gcen(rocshmem_team_t team, T *dst,
-                                            const T *src, int nelems) {
-  // GPU-centric implementation of alltoall collective
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
-  int my_pe_in_team = team_obj->my_pe;
-
-  // Check if we have enough buffer space. If not, fail.
-  T *pAta = reinterpret_cast<T *>(team_obj->pAta);
-  if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
-    GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
-    assert(false);
-  }
-
-  // Works when number of PEs divisible by root(PE_size)
-  int num_clust = sqrt(pe_size);
-  int clust_size = (pe_size + num_clust - 1) / num_clust;
-  // TODO(bpotter): Allow any size of cluster
-  assert(num_clust * clust_size == pe_size);
-  int clust_id = my_pe_in_team / clust_size;
-
-  int64_t flag_val = 1;
-  // Step 1: Send data to PEs in cluster
-  for (int i = 0; i < pe_size; ++i) {
-    int src_pe =
-        team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size));
-    int src_loc = (i / clust_size) * clust_size + (my_pe_in_team % clust_size);
-    get_nbi_wg(&pAta[i * nelems], &src[src_loc * nelems], nelems, src_pe);
-  }
-  if (is_thread_zero_in_block()) {
-    quiet();
-  }
-  __syncthreads();
-  // Step 2: Send final data to PEs outside cluster
-  for (int i = 0; i < num_clust; i++) {
-    int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) +
-                                            i * clust_size);
-    int j = clust_id;
-    put_nbi_wg(&dst[j * nelems * clust_size], &pAta[i * nelems * clust_size],
-               nelems * clust_size, dest_pe);
-  }
-  if (is_thread_zero_in_block()) {
-    quiet();
-
-    // Now sync PEs in cluster and ring. Ideally, we overlap this.
-    int dest_pe = team_obj->get_pe_in_world(clust_id * clust_size);
-    if (dest_pe != my_pe) amo_add<int64_t>(pSync2, flag_val, dest_pe);
-
-    int dest_pe2 = team_obj->get_pe_in_world(my_pe_in_team % clust_size);
-    if (dest_pe2 != my_pe) amo_add<int64_t>(&pSync[0], flag_val, dest_pe2);
-
-    if (my_pe == dest_pe) {
-      wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
-      pSync2[0] = ROCSHMEM_SYNC_VALUE;
-      __threadfence_system();
-      for (int i = 1; i < clust_size; ++i)
-        put_nbi(&pSync2[0], &flag_val, 1,
-                team_obj->get_pe_in_world(my_pe_in_team + i));
-    } else {
-      wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
-      pSync2[0] = ROCSHMEM_SYNC_VALUE;
-      __threadfence_system();
-    }
-
-    if (my_pe == dest_pe2) {
-      wait_until(&pSync[0], ROCSHMEM_CMP_EQ, (int64_t)(num_clust - 1));
-      pSync[0] = ROCSHMEM_SYNC_VALUE;
-      threadfence_system();
-      for (size_t i = 1, j = dest_pe2 + clust_size * stride; i < num_clust;
-           ++i, j += clust_size * stride) {
-        put_nbi(&pSync[0], &flag_val, 1, j);
-      }
-    } else {
-      wait_until(&pSync[0], ROCSHMEM_CMP_EQ, flag_val);
-      pSync[0] = ROCSHMEM_SYNC_VALUE;
-      threadfence_system();
-    }
-  }
-  __syncthreads();
-}
-
-template <typename T>
-__device__ void GPUIBContext::alltoall_gcen2(rocshmem_team_t team, T *dst,
-                                             const T *src, int nelems) {
-  // GPU-centric implementation of alltoall collective
-  // Uses in-place blocking sync
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  int64_t *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
-  int my_pe_in_team = team_obj->my_pe;
-
-  // Check if we have enough buffer space. If not, fail.
-  T *pAta = reinterpret_cast<T *>(team_obj->pAta);
-  if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
-    GPU_DPRINTF("Unsupported alltoall size for gpu_ib.\n");
-    assert(false);
-  }
-
-  // Works when number of PEs divisible by root(PE_size)
-  int num_clust = sqrt(pe_size);
-  int clust_size = (pe_size + num_clust - 1) / num_clust;
-  // TODO(bpotter): Allow any size of cluster
-  assert(num_clust * clust_size == pe_size);
-  int clust_id = my_pe_in_team / clust_size;
-
-  int64_t flag_val = 1;
-  // Step 1: Send data to PEs in cluster
-  for (int i = 0; i < pe_size; ++i) {
-    int src_pe =
-        team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size));
-    int src_loc = (i / clust_size) * clust_size + (my_pe_in_team % clust_size);
-    get_nbi_wg(&pAta[i * nelems], &src[src_loc * nelems], nelems, src_pe);
-  }
-
-  if (is_thread_zero_in_block()) {
-    int dest_pe = team_obj->get_pe_in_world(clust_id * clust_size);
-    if (dest_pe != my_pe) amo_add<int64_t>(pSync2, flag_val, dest_pe);
-    quiet();
-  }
-  __syncthreads();
-
-  // Step 2: Send final data to PEs outside cluster
-  // Have each PE put their designated data to the other PEs
-  for (int i = 0; i < num_clust; i++) {
-    int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) +
-                                            i * clust_size);
-    int j = clust_id;
-    put_nbi_wg(&dst[j * nelems * clust_size], &pAta[i * nelems * clust_size],
-               nelems * clust_size, dest_pe);
-  }
-
-  if (is_thread_zero_in_block()) {
-    quiet();
-    if ((my_pe_in_team % clust_size) == 0) {
-      wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
-      pSync2[0] = ROCSHMEM_SYNC_VALUE;
-      __threadfence_system();
-      for (int i = 1; i < clust_size; ++i)
-        put_nbi(&pSync2[0], &flag_val, 1,
-                team_obj->get_pe_in_world(my_pe_in_team + i));
-    } else {
-      wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
-      pSync2[0] = ROCSHMEM_SYNC_VALUE;
-      __threadfence_system();
-    }
-  }
-
-  // wait until everyone in ring has sent the data
-  internal_sync(my_pe, team_obj->get_pe_in_world(my_pe_in_team % clust_size),
-                clust_size * stride, num_clust, pSync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::fcollect(rocshmem_team_t team, T *dst,
-                                       const T *src, int nelems) {
-  // Main function for fcollect
-  // Broadcast version performs moderately well
-  // But there still seems to be scope for optimisation
-  fcollect_broadcast(team, dst, src, nelems);
-}
-
-template <typename T>
-__device__ void GPUIBContext::fcollect_broadcast(rocshmem_team_t team, T *dst,
-                                                 const T *src, int nelems) {
-  // Broadcast implementation of fcollect collective
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_start = team_obj->tinfo_wrt_world->pe_start;
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  int my_pe_in_team = team_obj->my_pe;
-  // Have each PE put their designated data to the other PEs
-  for (int j = 0; j < pe_size; j++) {
-    int dest_pe = team_obj->get_pe_in_world(j);
-    put_nbi_wg(&dst[my_pe_in_team * nelems], src, nelems, dest_pe);
-  }
-
-  if (is_thread_zero_in_block()) {
-    quiet();
-  }
-  // wait until everyone has obtained their designated data
-  internal_sync(my_pe, pe_start, stride, pe_size, pSync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::fcollect_brucks(rocshmem_team_t team, T *dst,
-                                              const T *src, int nelems) {
-  // Brucks implementation of fcollect collective
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_start = team_obj->tinfo_wrt_world->pe_start;
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  int my_pe_in_team = team_obj->my_pe;
-  int tid = get_flat_block_id();
-  int blk_size = get_flat_block_size();
-
-  // Check if we have enough buffer space. If not, fail.
-  if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
-    GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
-    assert(false);
-  }
-
-  T *pAta = reinterpret_cast<T *>(team_obj->pAta);
-
-  // Initial src transfer
-  put_wg(pAta, src, nelems, team_obj->get_pe_in_world(my_pe_in_team));
-
-  // Phase 1: Perform data transfers
-  for (int64_t shift = 0; ((int64_t)1 << shift) < pe_size; shift++) {
-    int64_t shift_decimal = ((int64_t)1 << shift);
-
-    // Calculate how much data to be sent
-    int64_t data_size =
-        min(shift_decimal, pe_size - shift_decimal) * nelems;  // NOLINT
-
-    // Send data
-    int dest_pe =
-        team_obj->get_pe_in_world((my_pe_in_team + shift_decimal) % pe_size);
-    put_wg(&pAta[shift_decimal * nelems], pAta, data_size, dest_pe);
-
-    // Need to synchronize with both receiver and sender. So just sync all.
-    internal_sync(my_pe, pe_start, stride, pe_size, pSync);
-  }
-
-  // Phase 2: Inverse rotation, shift data by (pe_size * nelems) elements
-  for (size_t i = tid; i < pe_size * nelems; i += blk_size) {
-    size_t offset = i % nelems;
-    size_t index =
-        ((pe_size + my_pe_in_team - i / nelems) % (pe_size)) * nelems;
-    dst[index + offset] = pAta[i];
-  }
-
-  // wait until everyone has sent the data
-  internal_sync(my_pe, pe_start, stride, pe_size, pSync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::fcollect_gcen(rocshmem_team_t team, T *dst,
-                                            const T *src, int nelems) {
-  // GPU-centric implementation of fcollect collective
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  long *pSync2 = &team_obj->alltoall_pSync[ROCSHMEM_BARRIER_SYNC_SIZE];
-  int my_pe_in_team = team_obj->my_pe;
-
-  // Check if we have enough buffer space. If not, fail.
-  T *pAta = reinterpret_cast<T *>(team_obj->pAta);
-  if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
-    GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
-    assert(false);
-  }
-
-  // Works when number of PEs divisible by root(PE_size)
-  int num_clust = sqrt(pe_size);
-  int clust_size = (pe_size + num_clust - 1) / num_clust;
-  // TODO(bpotter): Allow any size of cluster
-  assert(num_clust * clust_size == pe_size);
-  int clust_id = my_pe_in_team / clust_size;
-
-  int64_t flag_val = 1;
-  // Step 1: Send data to PEs in cluster
-  for (int i = 0; i < clust_size; ++i) {
-    int src_pe =
-        team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size));
-    get_nbi_wg(&pAta[i * nelems], src, nelems, src_pe);
-  }
-
-  if (is_thread_zero_in_block()) {
-    int dest_pe = team_obj->get_pe_in_world(clust_id * clust_size);
-    if (dest_pe != my_pe) amo_add<int64_t>(pSync2, flag_val, dest_pe);
-    quiet();
-  }
-  __syncthreads();
-
-  // Step 2: Send final data to PEs outside cluster
-  // Have each PE put their designated data to the other PEs
-  for (int i = 0; i < num_clust; i++) {
-    int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) +
-                                            i * clust_size);
-    int j = clust_id;
-    put_nbi_wg(&dst[j * nelems * clust_size], pAta, nelems * clust_size,
-               dest_pe);
-  }
-
-  if (is_thread_zero_in_block()) {
-    quiet();
-    if ((my_pe_in_team % clust_size) == 0) {
-      wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val * (clust_size - 1));
-      pSync2[0] = ROCSHMEM_SYNC_VALUE;
-      threadfence_system();
-      for (int i = 1; i < clust_size; ++i)
-        put_nbi(&pSync2[0], &flag_val, 1,
-                team_obj->get_pe_in_world(my_pe_in_team + i));
-    } else {
-      wait_until(pSync2, ROCSHMEM_CMP_EQ, flag_val);
-      pSync2[0] = ROCSHMEM_SYNC_VALUE;
-      threadfence_system();
-    }
-  }
-
-  // wait until everyone in ring has sent the data
-  internal_sync(my_pe, team_obj->get_pe_in_world(my_pe_in_team % clust_size),
-                clust_size * stride, num_clust, pSync);
-}
-
-template <typename T>
-__device__ void GPUIBContext::fcollect_gcen2(rocshmem_team_t team, T *dst,
-                                             const T *src, int nelems) {
-  // GPU-centric implementation of fcollect collective
-  // Uses in-place blocking sync
-  GPUIBTeam *team_obj = reinterpret_cast<GPUIBTeam *>(team);
-
-  double dbl_log_pe_stride = team_obj->tinfo_wrt_world->log_stride;
-  int log_pe_stride = static_cast<int>(dbl_log_pe_stride);
-  /**
-   * Ensure that the stride is a multiple of 2 for GPU_IB.
-   * TODO(bpotter): enable GPU_IB to work with non-powers-of-2 strides
-   * and remove this assert.
-   */
-  assert((dbl_log_pe_stride - log_pe_stride) == 0);
-  int pe_size = team_obj->num_pes;
-  int stride = 1 << log_pe_stride;
-
-  long *pSync = team_obj->alltoall_pSync;
-  int my_pe_in_team = team_obj->my_pe;
-
-  // Check if we have enough buffer space. If not, fail.
-  T *pAta = reinterpret_cast<T *>(team_obj->pAta);
-  if (pe_size * nelems > ROCSHMEM_ATA_MAX_WRKDATA_SIZE) {
-    GPU_DPRINTF("Unsupported fcollect size for gpu_ib.\n");
-    assert(false);
-  }
-
-  // Works when number of PEs divisible by root(PE_size)
-  int num_clust = sqrt(pe_size);
-  int clust_size = (pe_size + num_clust - 1) / num_clust;
-  // TODO(bpotter): Allow any size of cluster
-  assert(num_clust * clust_size == pe_size);
-  int clust_id = my_pe_in_team / clust_size;
-
-  // Step 1: Send data to PEs in cluster
-  for (int i = 0; i < clust_size; ++i) {
-    int src_pe =
-        team_obj->get_pe_in_world(clust_id * clust_size + (i % clust_size));
-    get_nbi_wg(&pAta[i * nelems], src, nelems, src_pe);
-  }
-
-  if (is_thread_zero_in_block()) {
-    quiet();
-  }
-  internal_sync(my_pe, team_obj->get_pe_in_world(clust_id * clust_size), stride,
-                clust_size, pSync);
-
-  // Step 2: Send final data to PEs outside cluster
-  // Have each PE put their designated data to the other PEs
-  for (int i = 0; i < num_clust; i++) {
-    int dest_pe = team_obj->get_pe_in_world((my_pe_in_team % clust_size) +
-                                            i * clust_size);
-    int j = clust_id;
-    put_nbi_wg(&dst[j * nelems * clust_size], pAta, nelems * clust_size,
-               dest_pe);
-  }
-
-  if (is_thread_zero_in_block()) quiet();
-
-  // wait until everyone in ring has sent the data
-  internal_sync(my_pe, team_obj->get_pe_in_world(my_pe_in_team % clust_size),
-                clust_size * stride, num_clust, pSync);
-}
-
-/******************************************************************************
- ***************** SHMEM X API EXTENSION FOR BLOCK/WAVE LEVEL *****************
- *****************************************************************************/
-
-template <typename T>
-__device__ void GPUIBContext::put_wg(T *dest, const T *source, size_t nelems,
-                                     int pe) {
-  putmem_wg(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::put_wave(T *dest, const T *source, size_t nelems,
-                                       int pe) {
-  putmem_wave(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::put_nbi_wg(T *dest, const T *source,
-                                         size_t nelems, int pe) {
-  putmem_nbi_wg(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::put_nbi_wave(T *dest, const T *source,
-                                           size_t nelems, int pe) {
-  putmem_nbi_wave(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::get_wg(T *dest, const T *source, size_t nelems,
-                                     int pe) {
-  getmem_wg(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::get_wave(T *dest, const T *source, size_t nelems,
-                                       int pe) {
-  getmem_wave(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::get_nbi_wg(T *dest, const T *source,
-                                         size_t nelems, int pe) {
-  getmem_nbi_wg(dest, source, nelems * sizeof(T), pe);
-}
-
-template <typename T>
-__device__ void GPUIBContext::get_nbi_wave(T *dest, const T *source,
-                                           size_t nelems, int pe) {
-  getmem_nbi_wave(dest, source, nelems * sizeof(T), pe);
-}
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_DEVICE_HPP_
diff --git a/src/gpu_ib/context_ib_tmpl_host.hpp b/src/gpu_ib/context_ib_tmpl_host.hpp
deleted file mode 100644
index 259f158162..0000000000
--- a/src/gpu_ib/context_ib_tmpl_host.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
-#define LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../host/host_templates.hpp"
-
-namespace rocshmem {
-
-template <typename T>
-__host__ void GPUIBHostContext::p(T *dest, T value, int pe) {
-  host_interface->p<T>(dest, value, pe, context_window_info);
-}
-
-template <typename T>
-__host__ T GPUIBHostContext::g(const T *source, int pe) {
-  return host_interface->g<T>(source, pe, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::put(T *dest, const T *source, size_t nelems,
-                                    int pe) {
-  host_interface->put<T>(dest, source, nelems, pe, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::get(T *dest, const T *source, size_t nelems,
-                                    int pe) {
-  host_interface->get<T>(dest, source, nelems, pe, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::put_nbi(T *dest, const T *source, size_t nelems,
-                                        int pe) {
-  host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::get_nbi(T *dest, const T *source, size_t nelems,
-                                        int pe) {
-  host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::amo_add(void *dst, T value, int pe) {
-  host_interface->amo_add(dst, value, pe, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::amo_cas(void *dst, T value, T cond, int pe) {
-  host_interface->amo_cas(dst, value, cond, pe, context_window_info);
-}
-
-template <typename T>
-__host__ T GPUIBHostContext::amo_fetch_add(void *dst, T value, int pe) {
-  return host_interface->amo_fetch_add(dst, value, pe, context_window_info);
-}
-
-template <typename T>
-__host__ T GPUIBHostContext::amo_fetch_cas(void *dst, T value, T cond, int pe) {
-  return host_interface->amo_fetch_cas(dst, value, cond, pe,
-                                       context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::broadcast(
-    T *dest, const T *source, int nelems, int pe_root, int pe_start,
-    int log_pe_stride, int pe_size,
-    long *p_sync) {  // NOLINT(runtime/int)
-  host_interface->broadcast<T>(dest, source, nelems, pe_root, pe_start,
-                               log_pe_stride, pe_size, p_sync);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::broadcast(rocshmem_team_t team, T *dest,
-                                          const T *source, int nelems,
-                                          int pe_root) {
-  host_interface->broadcast<T>(team, dest, source, nelems, pe_root);
-}
-
-template <typename T, ROCSHMEM_OP Op>
-__host__ void GPUIBHostContext::to_all(T *dest, const T *source, int nreduce,
-                                       int pe_start, int log_pe_stride,
-                                       int pe_size, T *p_wrk,
-                                       long *p_sync) {  // NOLINT(runtime/int)
-  host_interface->to_all<T, Op>(dest, source, nreduce, pe_start, log_pe_stride,
-                                pe_size, p_wrk, p_sync);
-}
-
-template <typename T, ROCSHMEM_OP Op>
-__host__ void GPUIBHostContext::to_all(rocshmem_team_t team, T *dest,
-                                       const T *source, int nreduce) {
-  host_interface->to_all<T, Op>(team, dest, source, nreduce);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::wait_until(T *ivars, int cmp, T val) {
-  host_interface->wait_until<T>(ivars, cmp, val, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::wait_until_all(T *ivars, size_t nelems,
-                                               const int* status,
-                                               int cmp, T val) {
-  host_interface->wait_until_all<T>(ivars, nelems, status, cmp, val, context_window_info);
-}
-
-template <typename T>
-__host__ size_t GPUIBHostContext::wait_until_any(T *ivars, size_t nelems,
-                                               const int* status,
-                                               int cmp, T val) {
-  return host_interface->wait_until_any<T>(ivars, nelems, status, cmp, val, context_window_info);
-}
-
-template <typename T>
-__host__ size_t GPUIBHostContext::wait_until_some(T *ivars, size_t nelems,
-                                                 size_t* indices,
-                                                 const int* status,
-                                                 int cmp, T val) {
-  return host_interface->wait_until_some<T>(ivars, nelems, indices, status, cmp, val, context_window_info);
-}
-
-template <typename T>
-__host__ void GPUIBHostContext::wait_until_all_vector(T *ivars, size_t nelems,
-                                                      const int* status,
-                                                      int cmp, T* vals) {
-  host_interface->wait_until_all_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
-}
-
-template <typename T>
-__host__ size_t GPUIBHostContext::wait_until_any_vector(T *ivars, size_t nelems,
-                                                        const int* status,
-                                                        int cmp, T* vals) {
-  return host_interface->wait_until_any_vector<T>(ivars, nelems, status, cmp, vals, context_window_info);
-}
-
-template <typename T>
-__host__ size_t GPUIBHostContext::wait_until_some_vector(T *ivars, size_t nelems,
-                                                         size_t* indices,
-                                                         const int* status,
-                                                         int cmp, T* vals) {
-  return host_interface->wait_until_some_vector<T>(ivars, nelems, indices, status, cmp, vals, context_window_info);
-}
-
-template <typename T>
-__host__ int GPUIBHostContext::test(T *ivars, int cmp, T val) {
-  return host_interface->test<T>(ivars, cmp, val, context_window_info);
-}
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_CONTEXT_IB_TMPL_HOST_HPP_
diff --git a/src/gpu_ib/debug.cpp b/src/gpu_ib/debug.cpp
deleted file mode 100644
index dd236e1cae..0000000000
--- a/src/gpu_ib/debug.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "rocshmem/rocshmem_debug.hpp"
-
-#include "qe_dumper.hpp"
-
-namespace rocshmem {
-
-void debug_print_cq(int dest_pe, int src_wg, int cqe_index) {
-  QeDumper dumper(dest_pe, src_wg, cqe_index);
-  dumper.dump_cq();
-}
-
-void debug_print_sq(int dest_pe, int src_wg, int wqe_index) {
-  QeDumper dumper(dest_pe, src_wg, wqe_index);
-  dumper.dump_sq();
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/dynamic_connection.cpp b/src/gpu_ib/dynamic_connection.cpp
deleted file mode 100644
index 7195601b24..0000000000
--- a/src/gpu_ib/dynamic_connection.cpp
+++ /dev/null
@@ -1,381 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "dynamic_connection.hpp"
-
-#include <mpi.h>
-
-#include "backend_ib.hpp"
-
-namespace rocshmem {
-
-DynamicConnection::DynamicConnection(GPUIBBackend* b) : Connection(b, 4) {
-  char* value = nullptr;
-
-  if ((value = getenv("ROCSHMEM_NUM_DCIs"))) {
-    num_dcis = atoi(value);
-  }
-
-  if ((value = getenv("ROCSHMEM_NUM_DCT"))) {
-    num_dct = atoi(value);
-  }
-}
-
-DynamicConnection::~DynamicConnection() {
-  CHECK_HIP(hipFree(vec_lids));
-  CHECK_HIP(hipFree(vec_dct_num));
-}
-
-ibv_qp_init_attr_ex DynamicConnection::dct_qp_init_attr(
-    ibv_cq* cq, ibv_srq* srq, [[maybe_unused]] uint8_t port) const {
-  ibv_qp_init_attr_ex attr{};
-
-  attr.comp_mask = IBV_QP_INIT_ATTR_PD;
-  attr.pd = ib_state->pd;
-  attr.recv_cq = cq;
-  attr.send_cq = cq;
-  attr.srq = srq;
-  attr.qp_type = IBV_QPT_DRIVER;
-
-  return attr;
-}
-
-mlx5dv_qp_init_attr DynamicConnection::dct_dv_init_attr() {
-  mlx5dv_qp_init_attr dv_attr{};
-  dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC;
-  dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCT;
-  dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY;
-
-  return dv_attr;
-}
-
-Connection::InitQPState DynamicConnection::initqp(uint8_t port) {
-  InitQPState initqp{};
-
-  initqp.exp_qp_attr.port_num = port;
-  initqp.exp_qp_attr.pkey_index = 0;
-  initqp.exp_qp_attr.qp_access_flags = 0;
-
-  return initqp;
-}
-
-Connection::RtrState DynamicConnection::rtr([[maybe_unused]] dest_info_t* dest,
-                                            uint8_t port) {
-  RtrState rtr{};
-
-  rtr.exp_qp_attr.ah_attr.is_global = 1;
-  rtr.exp_qp_attr.ah_attr.port_num = port;
-
-  rtr.exp_qp_attr.max_dest_rd_atomic = 0;
-  rtr.exp_qp_attr.min_rnr_timer = 0;
-
-  return rtr;
-}
-
-Connection::RtsState DynamicConnection::rts([
-    [maybe_unused]] dest_info_t* dest) {
-  RtsState rts{};
-  rts.exp_attr_mask |= IBV_QP_SQ_PSN;
-  return rts;
-}
-
-void DynamicConnection::connect_dci(ibv_qp* qp, uint8_t port) {
-  init_qp_status(qp, port);
-  change_status_rtr(qp, nullptr, port);
-  change_status_rts(qp, nullptr);
-}
-
-/*
- * create a DCT and get is to ready state
- */
-void DynamicConnection::create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq,
-                                     uint8_t port) {
-  auto init_attr = dct_qp_init_attr(cq, srq, port);
-  auto dv_attr = dct_dv_init_attr();
-  auto dct = mlx5dv_create_qp(ib_state->context, &init_attr, &dv_attr);
-
-  if (dct == nullptr) {
-    printf("Failed to create dct \n");
-    abort();
-  }
-
-  ibv_qp_attr qp_attr{};
-  qp_attr.qp_state = IBV_QPS_INIT;
-  qp_attr.port_num = port;
-  qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
-                            IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
-
-  int attr_mask =
-      IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
-
-  int ret = ibv_modify_qp(dct, &qp_attr, attr_mask);
-  if (ret) {
-    abort();
-  }
-
-  qp_attr.qp_state = IBV_QPS_RTR;
-  qp_attr.path_mtu = IBV_MTU_4096;
-  qp_attr.min_rnr_timer = 7;
-  qp_attr.ah_attr.is_global = 1;
-  qp_attr.ah_attr.grh.hop_limit = 1;
-  qp_attr.ah_attr.grh.traffic_class = 0;
-  qp_attr.ah_attr.grh.sgid_index = 0;
-  qp_attr.ah_attr.port_num = port;
-
-  attr_mask = IBV_QP_STATE | IBV_QP_MIN_RNR_TIMER | IBV_QP_AV | IBV_QP_PATH_MTU;
-
-  ret = ibv_modify_qp(dct, &qp_attr, attr_mask);
-  if (ret) {
-    abort();
-  }
-
-  *dct_num = dct->qp_num;
-}
-
-/*
- * @brief create a qp (DCI qp) using DEVX
- */
-ibv_qp* DynamicConnection::create_qp_0(ibv_context* context,
-                                       ibv_qp_init_attr_ex* qp_attr) {
-  ibv_qp* qp;
-  qp_attr->qp_type = IBV_QPT_DRIVER;
-
-  mlx5dv_qp_init_attr dv_attr{};
-  dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC;
-  dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI;
-  dv_attr.dc_init_attr.dct_access_key = DC_IB_KEY;
-
-  qp = mlx5dv_create_qp(context, qp_attr, &dv_attr);
-
-  return qp;
-}
-
-void DynamicConnection::create_qps_1() {
-  ibv_srq_init_attr srq_init_attr{};
-  srq_init_attr.attr.max_wr = 1;
-  srq_init_attr.attr.max_sge = 1;
-
-  srq = ibv_create_srq(ib_state->pd, &srq_init_attr);
-  if (!srq) {
-    abort();
-  }
-
-  dct_cq = ibv_create_cq(ib_state->context, 100, nullptr, nullptr, 0);
-  if (!dct_cq) {
-    abort();
-  }
-}
-
-void DynamicConnection::create_qps_2(int port, int my_rank,
-                                       ibv_port_attr* ib_port_att) {
-  for (int i = 0; i < num_dct; i++) {
-    int32_t dct_num;
-    create_dct(&dct_num, dct_cq, srq, port);
-    dct_num = htobe32(dct_num);
-    dcts_num[my_rank * num_dct + i] = dct_num;
-  }
-  lids[my_rank] = htobe16(ib_port_att->lid);
-}
-
-void DynamicConnection::create_qps_3(
-    int port, ibv_qp* qp, [[maybe_unused]] int offset,
-    [[maybe_unused]] ibv_port_attr* ib_port_att) {
-  return connect_dci(qp, port);
-}
-
-void DynamicConnection::get_remote_conn(int* remote_conn) {
-  *remote_conn = num_dcis;
-}
-
-void DynamicConnection::allocate_dynamic_members([
-    [maybe_unused]] int num_wg) {
-  size_t num_pes_size_bytes = sizeof(uint16_t) * backend->num_pes;
-  lids = reinterpret_cast<uint16_t*>(malloc(num_pes_size_bytes));
-  if (lids == nullptr) {
-    abort();
-  }
-
-  size_t num_dcts = num_dct * backend->num_pes;
-  size_t num_dcts_size_bytes = sizeof(uint32_t) * num_dcts;
-  dcts_num = reinterpret_cast<uint32_t*>(malloc(num_dcts_size_bytes));
-  if (dcts_num == nullptr) {
-    abort();
-  }
-}
-
-/*
- * get the wqe_av information from tyhe ibv_ah
- * rely on DEVX to extract the AV. We use the AV to create
- * the DC segment
- */
-void DynamicConnection::dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av) {
-  mlx5dv_obj dv;
-  mlx5dv_ah dah;
-
-  dv.ah.in = ah;
-  dv.ah.out = &dah;
-  mlx5dv_init_obj(&dv, MLX5DV_OBJ_AH);
-
-  memcpy(mlx5_av, dah.av, sizeof(mlx5_wqe_av));
-}
-
-void DynamicConnection::free_dynamic_members() {
-  free(lids);
-  free(dcts_num);
-}
-
-void DynamicConnection::initialize_1(int port, [[maybe_unused]] int num_wg) {
-  MPI_Allgather(MPI_IN_PLACE, sizeof(int32_t) * num_dct, MPI_CHAR, dcts_num,
-                sizeof(int32_t) * num_dct, MPI_CHAR, backend->thread_comm);
-
-  MPI_Allgather(MPI_IN_PLACE, sizeof(int16_t), MPI_CHAR, lids, sizeof(int16_t),
-                MPI_CHAR, backend->thread_comm);
-
-  hipStream_t stream;
-  CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-  CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&vec_dct_num),
-                      sizeof(int32_t) * num_dct * backend->num_pes));
-
-  CHECK_HIP(hipMemcpyAsync(vec_dct_num, dcts_num,
-                           sizeof(int32_t) * num_dct * backend->num_pes,
-                           hipMemcpyHostToDevice, stream));
-
-  CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&vec_lids),
-                      sizeof(int16_t) * backend->num_pes));
-
-  CHECK_HIP(hipMemcpyAsync(vec_lids, lids, sizeof(int16_t) * backend->num_pes,
-                           hipMemcpyHostToDevice, stream));
-
-  struct ibv_ah_attr ah_attr;
-  memset(&ah_attr, 0, sizeof(ah_attr));
-
-  ah_attr.is_global = 1;
-  ah_attr.dlid = ib_state->portinfo.lid;
-  ah_attr.sl = 1;
-  ah_attr.src_path_bits = 0;
-  ah_attr.port_num = port;
-
-  ah = ibv_create_ah(ib_state->pd, &ah_attr);
-  if (ah == nullptr) {
-    abort();
-  }
-
-  dc_get_av(ah, &mlx5_av);
-
-  CHECK_HIP(hipStreamSynchronize(stream));
-  CHECK_HIP(hipStreamDestroy(stream));
-}
-
-void DynamicConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle,
-                                                 ibv_mr* mr) {
-  CHECK_HIP(hipMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes));
-  (*heap_rkey_handle)[backend->my_pe] = htobe32(mr->rkey);
-}
-
-void DynamicConnection::free_rkey_handle(uint32_t* heap_rkey_handle) {
-  CHECK_HIP(hipFree(heap_rkey_handle));
-}
-
-Connection::QPInitAttr DynamicConnection::qpattr(ibv_qp_cap cap) {
-  QPInitAttr qpattr(cap);
-  return qpattr;
-}
-
-/*
- * Create and write the DC segment to SQ.
- * We get all the info needed from the mlx5_wqe_av that we extract from ibv_ah.
- */
-void DynamicConnection::set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg,
-                                      uint64_t dc_key, uint32_t dct_num,
-                                      uint8_t ext, mlx5_wqe_av* mlx5_av) {
-  dc_seg->av.key.dc_key = htobe64(dc_key);
-  dc_seg->av.dqp_dct = htobe32(((uint32_t)ext << 31 | dct_num));
-  dc_seg->av.stat_rate_sl = mlx5_av->stat_rate_sl;
-  dc_seg->av.fl_mlid = mlx5_av->fl_mlid;
-  dc_seg->av.rlid = mlx5_av->rlid;
-}
-
-/*
- * create a DC wqe and post it to the SQ
- * we rely on mlx5dv functions to ceate the ctrl and data
- * segments but we use our own function to write teh DC and rdma segments
- */
-void DynamicConnection::post_dv_dc_wqe(int remote_conn) {
-  mlx5_wqe_ctrl_seg* ctrl;
-  mlx5_wqe_datagram_seg* dc_seg;
-  mlx5_wqe_raddr_seg* rdma;
-  mlx5_wqe_data_seg* data;
-
-  for (int i = 0; i < remote_conn; i++) {
-    uint64_t* ptr = get_address_sq(i);
-
-    const uint32_t nb_post = 4 * sq_size;
-    for (uint16_t index = 0; index < nb_post; index++) {
-      uint8_t op_mod = 0;
-      uint8_t op_code = 8;
-      uint32_t qp_num = qps[i]->qp_num;
-      uint8_t fm_ce_se = 0;
-      uint8_t ds = 4;
-      ctrl = reinterpret_cast<mlx5_wqe_ctrl_seg*>(ptr);
-      mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds, 0,
-                          0);
-      ptr = ptr + 2;
-
-      uint32_t dct_num = dcts_num[i];
-      uint8_t ext = 1;
-      dc_seg = reinterpret_cast<mlx5_wqe_datagram_seg*>(ptr);
-      set_dgram_seg(dc_seg, (uint64_t)DC_IB_KEY, dct_num, ext, &mlx5_av);
-      ptr = ptr + 2;
-
-      uint64_t address = 0;
-      uint32_t rkey = 0;
-      rdma = reinterpret_cast<mlx5_wqe_raddr_seg*>(ptr);
-      set_rdma_seg(rdma, address, rkey);
-      ptr = ptr + 2;
-
-      uint32_t lkey = backend->networkImpl.heap_mr->lkey;
-      data = reinterpret_cast<mlx5_wqe_data_seg*>(ptr);
-      mlx5dv_set_data_seg(data, 1, lkey, 0);
-      ptr = ptr + 2;
-    }
-  }
-}
-
-// TODO(bpotter): remove redundancies with the other derived class
-void DynamicConnection::post_wqes() {
-  int remote_conn;
-  get_remote_conn(&remote_conn);
-  remote_conn *= backend->num_blocks_;
-  post_dv_dc_wqe(remote_conn);
-}
-
-void DynamicConnection::initialize_wr_fields([[maybe_unused]] ibv_send_wr* wr,
-                                             [[maybe_unused]] ibv_ah* ah,
-                                             [[maybe_unused]] int dc_key) {}
-
-int DynamicConnection::get_sq_dv_offset([[maybe_unused]] int pe_idx,
-                                        [[maybe_unused]] int num_qps,
-                                        int wg_idx) {
-  return wg_idx;
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/dynamic_connection.hpp b/src/gpu_ib/dynamic_connection.hpp
deleted file mode 100644
index ff85ef9305..0000000000
--- a/src/gpu_ib/dynamic_connection.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
-#define LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
-
-#include "connection.hpp"
-
-namespace rocshmem {
-
-class DynamicConnection : public Connection {
- public:
-  explicit DynamicConnection(GPUIBBackend* backend);
-
-  ~DynamicConnection() override;
-
-  void get_remote_conn(int* remote_conn) override;
-
-  void post_wqes() override;
-
-  void initialize_rkey_handle(uint32_t** heap_rkey_handle,
-                                ibv_mr* mr) override;
-
-  void free_rkey_handle(uint32_t* heap_rkey_handle) override;
-
-  uint32_t* get_vec_dct_num() const { return vec_dct_num; }
-
-  uint16_t* get_vec_lids() const { return vec_lids; }
-
- private:
-  InitQPState initqp(uint8_t port) override;
-
-  RtrState rtr(dest_info_t* dest, uint8_t port) override;
-
-  RtsState rts(dest_info_t* dest) override;
-
-  QPInitAttr qpattr(ibv_qp_cap cap) override;
-
-  void connect_dci(ibv_qp* qp, uint8_t port);
-
-  void create_dct(int32_t* dct_num, ibv_cq* cq, ibv_srq* srq, uint8_t port);
-
-  ibv_qp_init_attr_ex dct_qp_init_attr(ibv_cq* cq, ibv_srq* srq,
-                                       uint8_t port) const;
-
-  mlx5dv_qp_init_attr dct_dv_init_attr();
-
-  void dc_get_av(ibv_ah* ah, mlx5_wqe_av* mlx5_av);
-
-  void set_dgram_seg(mlx5_wqe_datagram_seg* dc_seg, uint64_t dc_key,
-                     uint32_t dct_num, uint8_t ext, mlx5_wqe_av* av);
-
-  void set_data_seg(mlx5_wqe_data_seg* data_seg, uint32_t lkey);
-
-  void post_dv_dc_wqe(int remote_conn);
-
-  void create_qps_1() override;
-
-  void create_qps_2(int port, int my_rank,
-                      ibv_port_attr* ib_port_att) override;
-
-  void create_qps_3(int port, ibv_qp* qp, int offset,
-                      ibv_port_attr* ib_port_att) override;
-
-  ibv_qp* create_qp_0(ibv_context* context,
-                      ibv_qp_init_attr_ex* qp_attr) override;
-
-  void allocate_dynamic_members(int num_wg) override;
-
-  void free_dynamic_members() override;
-
-  void initialize_1(int port, int num_wg) override;
-
-  void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override;
-
-  int get_sq_dv_offset(int pe_idx, int32_t num_qps, int wg_idx) override;
-
-  int num_dcis{1};
-
-  int num_dct{1};
-
-  static constexpr int DC_IB_KEY{0x1ee7a330};
-
-  uint32_t* dcts_num{nullptr};
-
-  uint16_t* lids{nullptr};
-
-  mlx5_wqe_av mlx5_av{};
-
-  ibv_ah* ah{nullptr};
-
-  ibv_srq* srq{nullptr};
-
-  ibv_cq* dct_cq{nullptr};
-
-  uint32_t* vec_dct_num{nullptr};
-
-  uint16_t* vec_lids{nullptr};
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_DYNAMIC_CONNECTION_HPP_
diff --git a/src/gpu_ib/endian.cpp b/src/gpu_ib/endian.cpp
deleted file mode 100644
index 82c64e87b3..0000000000
--- a/src/gpu_ib/endian.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "endian.hpp"
-
-namespace rocshmem {
-
-template <typename T>
-__device__ void swap_endian_store(T *dst, const T val) {
-  typedef union U {
-    T val;
-    uint8_t bytes[sizeof(T)];
-  } union_type;
-  union_type src;
-  union_type dst_tmp;
-
-  src.val = val;
-  std::reverse_copy(src.bytes, src.bytes + sizeof(T), dst_tmp.bytes);
-  *dst = dst_tmp.val;
-}
-
-template <>
-__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) {
-  uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) |
-                     ((val >> 8) & 0x00FF00FF00FF00FFULL);
-
-  new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) |
-            ((new_val >> 16) & 0x0000FFFF0000FFFFULL);
-
-  *dst = (new_val << 32) | (new_val >> 32);
-}
-
-template <>
-__device__ void swap_endian_store(int64_t *dst, const int64_t val) {
-  swap_endian_store(reinterpret_cast<uint64_t *>(dst), (const uint64_t)val);
-}
-
-template <>
-__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) {
-  uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
-
-  *dst = (new_val << 16) | (new_val >> 16);
-}
-
-template <>
-__device__ void swap_endian_store(int32_t *dst, const int32_t val) {
-  swap_endian_store(reinterpret_cast<uint32_t *>(dst), (const uint32_t)val);
-}
-
-template <>
-__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) {
-  *dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF);
-}
-
-template <>
-__device__ void swap_endian_store(int16_t *dst, const int16_t val) {
-  swap_endian_store(reinterpret_cast<uint16_t *>(dst), (const uint16_t)val);
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/endian.hpp b/src/gpu_ib/endian.hpp
deleted file mode 100644
index 01f798cd5b..0000000000
--- a/src/gpu_ib/endian.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
-#define LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
-
-#include <hip/hip_runtime.h>
-
-namespace rocshmem {
-
-template <typename T>
-__device__ void swap_endian_store(T *dst, const T val);
-
-template <>
-__device__ void swap_endian_store(uint64_t *dst, const uint64_t val);
-
-template <>
-__device__ void swap_endian_store(int64_t *dst, const int64_t val);
-
-template <>
-__device__ void swap_endian_store(uint32_t *dst, const uint32_t val);
-
-template <>
-__device__ void swap_endian_store(int32_t *dst, const int32_t val);
-
-template <>
-__device__ void swap_endian_store(uint16_t *dst, const uint16_t val);
-
-template <>
-__device__ void swap_endian_store(int16_t *dst, const int16_t val);
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_ENDIAN_HPP_
diff --git a/src/gpu_ib/gpu_ib_team.cpp b/src/gpu_ib/gpu_ib_team.cpp
deleted file mode 100644
index 0aa04c8c83..0000000000
--- a/src/gpu_ib/gpu_ib_team.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "gpu_ib_team.hpp"
-
-#include "../backend_type.hpp"
-#include "backend_ib.hpp"
-
-namespace rocshmem {
-
-GPUIBTeam::GPUIBTeam(Backend *backend, TeamInfo *team_info_parent,
-                     TeamInfo *team_info_world, int num_pes, int my_pe,
-                     MPI_Comm mpi_comm, int pool_index)
-    : Team(backend, team_info_parent, team_info_world, num_pes, my_pe,
-           mpi_comm) {
-  type = BackendType::GPU_IB_BACKEND;
-  const GPUIBBackend *b = static_cast<const GPUIBBackend *>(backend);
-
-  pool_index_ = pool_index;
-
-  barrier_pSync =
-      &(b->barrier_pSync_pool[pool_index * ROCSHMEM_BARRIER_SYNC_SIZE]);
-  reduce_pSync =
-      &(b->reduce_pSync_pool[pool_index * ROCSHMEM_REDUCE_SYNC_SIZE]);
-  bcast_pSync = &(b->bcast_pSync_pool[pool_index * ROCSHMEM_BCAST_SYNC_SIZE]);
-  alltoall_pSync =
-      &(b->alltoall_pSync_pool[pool_index * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
-
-  pWrk = reinterpret_cast<char *>(b->pWrk_pool) +
-         ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE * sizeof(double) * pool_index;
-  pAta = reinterpret_cast<char *>(b->pAta_pool) +
-         ROCSHMEM_ATA_MAX_WRKDATA_SIZE * sizeof(double) * pool_index;
-}
-
-GPUIBTeam::~GPUIBTeam() {}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/gpu_ib_team.hpp b/src/gpu_ib/gpu_ib_team.hpp
deleted file mode 100644
index 3d7b18d1a7..0000000000
--- a/src/gpu_ib/gpu_ib_team.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
-#define LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
-
-#include "../team.hpp"
-
-namespace rocshmem {
-
-class GPUIBTeam : public Team {
- public:
-  GPUIBTeam(Backend* handle, TeamInfo* team_info_wrt_parent,
-            TeamInfo* team_info_wrt_world, int num_pes, int my_pe,
-            MPI_Comm team_comm, int pool_index);
-
-  virtual ~GPUIBTeam();
-
-  long* barrier_pSync{nullptr};
-  long* reduce_pSync{nullptr};
-  long* bcast_pSync{nullptr};
-  long* alltoall_pSync{nullptr};
-  void* pWrk{nullptr};
-  void* pAta{nullptr};
-
-  int pool_index_{-1};
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_GPU_IB_TEAM_HPP_
diff --git a/src/gpu_ib/infiniband_structs.hpp b/src/gpu_ib/infiniband_structs.hpp
deleted file mode 100644
index 3726be23f8..0000000000
--- a/src/gpu_ib/infiniband_structs.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
-#define LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
-
-#include <infiniband/mlx5dv.h>
-
-namespace rocshmem {
-
-typedef struct ib_mlx5_base_av {
-  uint64_t dc_key;
-  uint32_t dqp_dct;
-  uint8_t stat_rate_sl;
-  uint8_t fl_mlid;
-  uint16_t rlid;
-} ib_mlx5_base_av_t;
-
-union mlx5_segment {
-  mlx5_wqe_ctrl_seg ctrl_seg;
-  mlx5_wqe_raddr_seg raddr_seg;
-  mlx5_wqe_atomic_seg atomic_seg;
-  mlx5_wqe_data_seg data_seg;
-  mlx5_wqe_inl_data_seg inl_data_seg;
-  ib_mlx5_base_av_t base_av;
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_INFINIBAND_STRUCTS_HPP_
diff --git a/src/gpu_ib/memory_builder_policy.hpp b/src/gpu_ib/memory_builder_policy.hpp
deleted file mode 100644
index 1c6f211c95..0000000000
--- a/src/gpu_ib/memory_builder_policy.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
-#define LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
-
-#include <hip/hip_runtime.h>
-
-#include <utility>
-
-namespace rocshmem {
-
-class GPUIBContext;
-
-class MemoryBuilderPolicyWrapper {
- public:
-  __device__ MemoryBuilderPolicyWrapper() = default;
-
-  __device__ ~MemoryBuilderPolicyWrapper() {
-    if (wrapped_policy_) {
-      delete wrapped_policy_;
-    }
-  }
-
-  template <typename T>
-  __device__ MemoryBuilderPolicyWrapper(T&& policy)
-      : wrapped_policy_(new Wrapper<T>(std::forward<T>(policy))) {}
-
-  __device__ void operator()(GPUIBContext* context) {
-    return (*wrapped_policy_)(context);
-  }
-
- private:
-  class PolicyBase {
-   public:
-    __device__ virtual void operator()(GPUIBContext* context) = 0;
-
-    __device__ virtual ~PolicyBase() {}
-  };
-
-  template <typename T>
-  class Wrapper : public PolicyBase {
-   public:
-    __device__ Wrapper(const T& t) : wrapped_policy_(t) {}
-
-    __device__ void operator()(GPUIBContext* context) override {
-      return wrapped_policy_(context);
-    }
-
-   private:
-    T wrapped_policy_;
-  };
-
-  PolicyBase* wrapped_policy_;
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_MEMORY_BUILDER_POLICY_HPP_
diff --git a/src/gpu_ib/network_policy.cpp b/src/gpu_ib/network_policy.cpp
deleted file mode 100644
index b251dd2ad2..0000000000
--- a/src/gpu_ib/network_policy.cpp
+++ /dev/null
@@ -1,500 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "network_policy.hpp"
-
-#include <mpi.h>
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../atomic_return.hpp"
-#include "../context_incl.hpp"
-#include "backend_ib.hpp"
-#include "connection.hpp"
-#include "dynamic_connection.hpp"
-#include "queue_pair.hpp"
-#include "reliable_connection.hpp"
-
-namespace rocshmem {
-
-void NetworkOnImpl::dump_backend_stats(ROCStats *globalStats) {
-  /*
-   * TODO(bpotter): Refactor this into the Stats class to remove the ifdef
-   */
-#ifdef PROFILE
-  int statblocks = connection->total_number_connections();
-
-  uint64_t cycles_ring_sq_db = 0;
-  uint64_t cycles_update_wqe = 0;
-  uint64_t cycles_poll_cq = 0;
-  uint64_t cycles_next_cq = 0;
-  uint64_t cycles_init = gpu_qps[statblocks - 1].profiler.getStat(INIT);
-  uint64_t cycles_finalize = gpu_qps[statblocks - 1].profiler.getStat(FINALIZE);
-
-  uint64_t total_quiet_count = 0;
-  uint64_t total_db_count = 0;
-  uint64_t total_wqe_count = 0;
-
-  for (int i = 0; i < statblocks; i++) {
-    cycles_ring_sq_db += gpu_qps[i].profiler.getStat(RING_SQ_DB);
-    cycles_update_wqe += gpu_qps[i].profiler.getStat(UPDATE_WQE);
-    cycles_poll_cq += gpu_qps[i].profiler.getStat(POLL_CQ);
-    cycles_next_cq += gpu_qps[i].profiler.getStat(NEXT_CQ);
-    total_quiet_count += gpu_qps[i].profiler.getStat(QUIET_COUNT);
-    total_db_count += gpu_qps[i].profiler.getStat(DB_COUNT);
-    total_wqe_count += gpu_qps[i].profiler.getStat(WQE_COUNT);
-  }
-
-  double us_ring_sq_db = cycles_ring_sq_db / gpu_clock_freq_mhz;
-  double us_update_wqe = cycles_update_wqe / gpu_clock_freq_mhz;
-  double us_poll_cq = cycles_poll_cq / gpu_clock_freq_mhz;
-  double us_next_cq = cycles_next_cq / gpu_clock_freq_mhz;
-  double us_init = cycles_init / gpu_clock_freq_mhz;
-  double us_finalize = cycles_finalize / gpu_clock_freq_mhz;
-
-  const int FIELD_WIDTH = 20;
-  const int FLOAT_PRECISION = 2;
-
-  printf("Counts: Internal Quiets %lu DB Rings %lu WQE Posts %lu\n",
-         total_quiet_count, total_db_count, total_wqe_count);
-
-  printf("\n%*s%*s%*s%*s%*s%*s\n", FIELD_WIDTH + 1, "Init (us)",
-         FIELD_WIDTH + 1, "Finalize (us)", FIELD_WIDTH + 1, "Ring SQ DB (us)",
-         FIELD_WIDTH + 1, "Update WQE (us)", FIELD_WIDTH + 1, "Poll CQ (us)",
-         FIELD_WIDTH + 1, "Next CQ (us)");
-
-  uint64_t totalFinalize = globalStats->getStat(NUM_FINALIZE);
-  printf("%*.*f %*.*f %*.*f %*.*f %*.*f %*.*f\n", FIELD_WIDTH, FLOAT_PRECISION,
-         us_init / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION,
-         us_finalize / totalFinalize, FIELD_WIDTH, FLOAT_PRECISION,
-         us_ring_sq_db / total_db_count, FIELD_WIDTH, FLOAT_PRECISION,
-         us_update_wqe / total_wqe_count, FIELD_WIDTH, FLOAT_PRECISION,
-         us_poll_cq / total_quiet_count, FIELD_WIDTH, FLOAT_PRECISION,
-         us_next_cq / total_quiet_count);
-#endif
-}
-
-void NetworkOnImpl::reset_backend_stats() {
-  int statblocks = connection->total_number_connections();
-
-  for (size_t i = 0; i < statblocks; i++) {
-    gpu_qps[i].profiler.resetStats();
-  }
-}
-
-void NetworkOnImpl::exchange_hdp_info(HdpPolicy *hdp_policy,
-                                      MPI_Comm thread_comm) {
-  /*
-   * Using Connection class, register the host-side hdp flush address
-   * with the InfiniBand network.
-   */
-  connection->reg_mr(hdp_policy->get_hdp_flush_ptr(), 32, &hdp_mr, false);
-
-  /*
-   * Allocate device-side memory for the remote HDP keys.
-   */
-  CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_rkey),
-                      num_pes * sizeof(uint32_t)));
-
-  /*
-   * Allocate device-side memory for the remote HDP addresses.
-   */
-  CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_address),
-                      num_pes * sizeof(uintptr_t)));
-
-  /*
-   * Allocate host-side memory to exchange hdp keys using MPI_Allgather.
-   */
-  uint32_t *host_hdp_cpy =
-      reinterpret_cast<uint32_t *>(malloc(num_pes * sizeof(uint32_t)));
-  if (host_hdp_cpy == nullptr) {
-    abort();
-  }
-
-  /*
-   * Allocate host-side memory to exchange hdp addresses using
-   * MPI_Allgather.
-   */
-  uint32_t **host_hdp_address_cpy =
-      reinterpret_cast<uint32_t **>(malloc(num_pes * sizeof(uint32_t *)));
-  if (host_hdp_address_cpy == nullptr) {
-    free(host_hdp_cpy);
-    abort();
-  }
-
-  /*
-   * This processing element writes its personal HDP key and HDP address
-   * into the host-side arrays which were just allocated.
-   */
-  int my_rank = my_pe;
-  host_hdp_cpy[my_rank] = htobe32(hdp_mr->rkey);
-  host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr();
-
-  /*
-   * Do all-to-all exchange of our HDP key with other processing elements.
-   */
-  MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_hdp_cpy,
-                sizeof(uint32_t), MPI_CHAR, thread_comm);
-
-  /*
-   * Do all-to-all exchange of our HDP address with other processing
-   * elements.
-   */
-  MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy,
-                sizeof(uint32_t *), MPI_CHAR, thread_comm);
-
-  /*
-   * Copy the recently exchanged HDP keys to device memory.
-   */
-  hipStream_t stream;
-  CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-  CHECK_HIP(hipMemcpyAsync(hdp_rkey, host_hdp_cpy, num_pes * sizeof(uint32_t),
-                           hipMemcpyHostToDevice, stream));
-
-  /*
-   * Copy the recently exchanged HDP addresses to device memory.
-   */
-  CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy,
-                           num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice,
-                           stream));
-  CHECK_HIP(hipStreamSynchronize(stream));
-  CHECK_HIP(hipStreamDestroy(stream));
-
-  /*
-   * Free the host-side resources used to exchange HDP resources
-   * between processing elements.
-   */
-  free(host_hdp_cpy);
-  free(host_hdp_address_cpy);
-}
-
-void NetworkOnImpl::setup_atomic_region() {
-  /*
-   * Allocate fine-grained device-side memory for the atomic return
-   * region.
-   */
-  allocate_atomic_region(&atomic_ret, num_blocks);
-
-  /*
-   * Register the atomic return region on the InfiniBand network.
-   */
-  connection->reg_mr(atomic_ret->atomic_base_ptr,
-                     sizeof(uint64_t) * max_nb_atomic * num_blocks, &mr, false);
-
-  /*
-   * Set member variable from class.
-   */
-  atomic_ret->atomic_lkey = htobe32(mr->lkey);
-}
-
-void NetworkOnImpl::heap_memory_rkey(char *local_heap_base, size_t heap_size,
-                                     MPI_Comm thread_comm, bool is_managed) {
-  /*
-   * Allocate host-side memory to hold remote keys for all processing
-   * elements.
-   */
-  const size_t rkeys_size = sizeof(uint32_t) * num_pes;
-  uint32_t *host_rkey_cpy = reinterpret_cast<uint32_t *>(malloc(rkeys_size));
-  if (host_rkey_cpy == nullptr) {
-    abort();
-  }
-
-  /*
-   * Using the Connection class, register the symmetric heap with the
-   * InfiniBand network.
-   */
-  void *base_heap = local_heap_base;
-  connection->reg_mr(base_heap, heap_size, &heap_mr, is_managed);
-
-  /*
-   * Using the memory region from the prior heap memory registration,
-   * allocate and initialize some device-side memory to hold the remote
-   * keys for the symmetric heap base.
-   *
-   * Only the device-side memory entry for this processing element will be
-   * updated with the key for the heap memory region.
-   */
-  connection->initialize_rkey_handle(&heap_rkey, heap_mr);
-
-  /*
-   * Copy the device-side heap base remote key array to the host-side
-   * heap base remote key array.
-   */
-  hipStream_t stream;
-  CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-  CHECK_HIP(hipMemcpyAsync(host_rkey_cpy, heap_rkey, rkeys_size,
-                           hipMemcpyDeviceToHost, stream));
-  CHECK_HIP(hipStreamSynchronize(stream));
-
-  /*
-   * Do all-to-all exchange of symmetric heap base remote key between the
-   * processing elements.
-   */
-  MPI_Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_rkey_cpy,
-                sizeof(uint32_t), MPI_CHAR, thread_comm);
-
-  /*
-   * Copy the recently updated host-side heap base remote key array back
-   * to the device-side memory.
-   */
-  CHECK_HIP(hipMemcpyAsync(heap_rkey, host_rkey_cpy, rkeys_size,
-                           hipMemcpyHostToDevice, stream));
-  CHECK_HIP(hipStreamSynchronize(stream));
-  CHECK_HIP(hipStreamDestroy(stream));
-
-  /*
-   * Free the host-side resources used to do the processing element
-   * exchange of keys and addresses for the symmetric heap base.
-   */
-  free(host_rkey_cpy);
-
-  /*
-   * Initialize this member variable to hold the InfiniBand memory
-   * region's local key.
-   */
-  lkey = heap_mr->lkey;
-}
-
-void NetworkOnImpl::setup_gpu_qps(GPUIBBackend *B) {
-  /*
-   * Determine how many connections are needed.
-   * The number of connections depends on the connection type and the
-   * number of workgroups.
-   */
-  int connections;
-  connection->get_remote_conn(&connections);
-  connections *= num_blocks;
-
-  /*
-   * Allocate device-side memory for the queue pairs.
-   */
-  CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * connections));
-
-  /*
-   * For every connection, initialize the QueuePair.
-   */
-  for (int i = 0; i < connections; i++) {
-    new (&gpu_qps[i]) QueuePair(B);
-    connection->init_gpu_qp_from_connection(&gpu_qps[i], i);
-  }
-}
-
-void NetworkOnImpl::rocshmem_g_init(SymmetricHeap *heap_handle,
-                                     MPI_Comm thread_comm) {
-  init_g_ret(heap_handle, thread_comm, num_blocks, &g_ret);
-}
-
-__host__ void NetworkOnImpl::networkHostSetup(GPUIBBackend *B) {
-  num_pes = B->num_pes;
-  my_pe = B->my_pe;
-  num_blocks = B->num_blocks_;
-
-#ifdef USE_DC
-  connection = new DynamicConnection(B);
-#else
-  connection = new ReliableConnection(B);
-#endif
-
-  connection->initialize(B->num_blocks_);
-  exchange_hdp_info(B->hdp_policy, B->thread_comm);
-
-  const auto &heap_bases{B->heap.get_heap_bases()};
-  heap_memory_rkey(heap_bases[my_pe], B->heap.get_size(), B->thread_comm,
-                   B->heap.is_managed());
-  // The earliest we can allow the main thread to launch a kernel to
-  // avoid potential deadlock
-  network_init_done = true;
-
-  setup_atomic_region();
-
-  connection->initialize_gpu_policy(&connection_policy, heap_rkey);
-
-  rocshmem_g_init(&B->heap, B->thread_comm);
-
-  connection->post_wqes();
-
-  setup_gpu_qps(B);
-}
-
-__host__ void NetworkOnImpl::networkHostFinalize() {
-  CHECK_HIP(hipFree(hdp_rkey));
-  hdp_rkey = nullptr;
-
-  CHECK_HIP(hipFree(hdp_address));
-  hdp_address = nullptr;
-
-  CHECK_HIP(hipFree(atomic_ret));
-  atomic_ret = nullptr;
-
-  CHECK_HIP(hipFree(gpu_qps));
-  gpu_qps = nullptr;
-
-  CHECK_HIP(hipFree(connection_policy));
-  connection_policy = nullptr;
-
-  connection->free_rkey_handle(heap_rkey);
-
-  connection->finalize();
-  delete connection;
-  connection = nullptr;
-}
-
-__host__ void NetworkOnImpl::networkHostInit(GPUIBContext *ctx, int buffer_id) {
-  int remote_conn = getNumQueuePairs();
-
-  CHECK_HIP(hipMalloc(&ctx->device_qp_proxy, remote_conn * sizeof(QueuePair)));
-
-  for (int i = 0; i < getNumQueuePairs(); i++) {
-    /*
-     * RC gpu_qp is actually [NUM_PE][NUM_BLOCK] qps but is flattened.
-     * Each num_pe entry contains num_block QPs connected to that PE.
-     * For RC, we need to iterate gpu_qp[i][buffer_id] to collect a
-     * single QP for each connected PE in order to build context.
-     * For DC, NUM_PE = 1 so can just use buffer_id directly.
-     */
-    int offset = num_blocks * i + buffer_id;
-    new (ctx->getQueuePair(i)) QueuePair(gpu_qps[offset]);
-
-    auto *qp = ctx->getQueuePair(i);
-    qp->global_qp = &gpu_qps[offset];
-    qp->num_cqs = getNumQueuePairs();
-    qp->atomic_ret.atomic_base_ptr =
-        &atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id];
-    qp->base_heap = ctx->base_heap;
-  }
-  ctx->g_ret = g_ret;
-}
-
-__device__ void NetworkOnImpl::networkGpuInit(GPUIBContext *ctx,
-                                              int buffer_id) {
-  for (int i = 0; i < getNumQueuePairs(); i++) {
-    int offset = num_blocks * i + buffer_id;
-
-    auto *qp = ctx->getQueuePair(i);
-    new (qp) QueuePair(gpu_qps[offset]);
-
-    qp->global_qp = &gpu_qps[offset];
-    qp->num_cqs = getNumQueuePairs();
-    qp->atomic_ret.atomic_base_ptr =
-        &atomic_ret->atomic_base_ptr[max_nb_atomic * buffer_id];
-    qp->base_heap = ctx->base_heap;
-  }
-  ctx->g_ret = g_ret;
-}
-
-__device__ __host__ QueuePair *NetworkOnImpl::getQueuePair(QueuePair *qp_handle,
-                                                           int pe) {
-#ifdef USE_DC
-  return qp_handle;
-#else
-  return &qp_handle[pe];
-#endif
-}
-
-__device__ __host__ int NetworkOnImpl::getNumQueuePairs() {
-#ifdef USE_DC
-  return 1;
-#else
-  return num_pes;
-#endif
-}
-
-void NetworkOffImpl::networkHostSetup(GPUIBBackend *B) {
-  num_pes = B->num_pes;
-  my_pe = B->my_pe;
-  num_blocks = B->num_blocks_;
-
-  exchange_hdp_info(B->hdp_policy, B->thread_comm);
-}
-void NetworkOffImpl::exchange_hdp_info(HdpPolicy *hdp_policy,
-                                       MPI_Comm thread_comm) {
-#ifdef USE_SINGLE_NODE
-  // We are using the symmetric heap for the HDP flush ptr
-  hdp_address = reinterpret_cast<uintptr_t *>(hdp_policy->get_hdp_flush_ptr());
-#else
-  /*
-   * Allocate device-side memory for the remote HDP addresses.
-   */
-  CHECK_HIP(hipMalloc(reinterpret_cast<void **>(&hdp_address),
-                      num_pes * sizeof(uintptr_t)));
-
-  /*
-   * Allocate host-side memory to exchange hdp keys using MPI_Allgather.
-   */
-  uint32_t *host_hdp_cpy =
-      reinterpret_cast<uint32_t *>(malloc(num_pes * sizeof(uint32_t)));
-  if (host_hdp_cpy == nullptr) {
-    abort();
-  }
-
-  /*
-   * Allocate host-side memory to exchange hdp addresses using
-   * MPI_Allgather.
-   */
-  uint32_t **host_hdp_address_cpy =
-      reinterpret_cast<uint32_t **>(malloc(num_pes * sizeof(uint32_t *)));
-  if (host_hdp_address_cpy == nullptr) {
-    free(host_hdp_cpy);
-    abort();
-  }
-
-  /*
-   * This processing element writes its personal HDP address
-   * into the host-side array which were just allocated.
-   */
-  int my_rank = my_pe;
-  host_hdp_address_cpy[my_rank] = hdp_policy->get_hdp_flush_ptr();
-
-  /*
-   * Do all-to-all exchange of our HDP address with other processing
-   * elements.
-   */
-  MPI_Allgather(MPI_IN_PLACE, sizeof(uintptr_t), MPI_CHAR, host_hdp_address_cpy,
-                sizeof(uint32_t *), MPI_CHAR, thread_comm);
-
-  /*
-   * Copy the recently exchanged HDP addresses to device memory.
-   */
-  hipStream_t stream;
-  CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-  CHECK_HIP(hipMemcpyAsync(hdp_address, host_hdp_address_cpy,
-                           num_pes * sizeof(uint32_t *), hipMemcpyHostToDevice,
-                           stream));
-  CHECK_HIP(hipStreamSynchronize(stream));
-  CHECK_HIP(hipStreamDestroy(stream));
-
-  /*
-   * Free the host-side resources used to exchange HDP resources
-   * between processing elements.
-   */
-  free(host_hdp_cpy);
-  free(host_hdp_address_cpy);
-#endif
-}
-
-void NetworkOffImpl::networkHostFinalize() {
-#ifndef USE_SINGLE_NODE
-  CHECK_HIP(hipFree(hdp_address));
-#endif
-  hdp_address = nullptr;
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/network_policy.hpp b/src/gpu_ib/network_policy.hpp
deleted file mode 100644
index ab3c75a589..0000000000
--- a/src/gpu_ib/network_policy.hpp
+++ /dev/null
@@ -1,357 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
-#define LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
-
-#include <hip/hip_runtime.h>
-#include <mpi.h>
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "rocshmem/rocshmem.hpp"
-#include "connection_policy.hpp"
-#include "queue_pair.hpp"
-#include "../hdp_policy.hpp"
-#include "../memory/symmetric_heap.hpp"
-#include "../stats.hpp"
-#include "../util.hpp"
-
-struct ibv_mr;
-struct hdp_reg_t;
-
-namespace rocshmem {
-
-struct atomic_ret_t;
-class GPUIBBackend;
-class GPUIBContext;
-class GPUIBHostContext;
-class Connection;
-
-class NetworkOnImpl {
- public:
-  void dump_backend_stats(ROCStats *globalStats);
-
-  void reset_backend_stats();
-
-  /**
-   * @brief setup the network resources and initialization for the
-   * GPUIBBackend
-   */
-  __host__ void networkHostSetup(GPUIBBackend *B);
-
-  /**
-   * @brief deallocate and close the network resources
-   */
-  __host__ void networkHostFinalize();
-
-  /**
-   * @brief initialize the network resources for each context
-   */
-  __host__ void networkHostInit(GPUIBContext *ctx, int buffer_id);
-
-  /**
-   * @brief initialize the network resources for each context on GPU side
-   */
-  __device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id);
-
-  /**
-   * @brief returns the QP for the targeted pe
-   */
-  __device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe);
-
-  /**
-   * @brief returns the numbers of QPs used per the calling PE
-   */
-  __device__ __host__ int getNumQueuePairs();
-
-  /**
-   * @brief returns the number of PEs accessible via network
-   */
-  __device__ __host__ int getNumDest() { return num_pes; }
-
-  static uint32_t externSharedBytes(int num_pes) {
-    int remote_conn{1};
-#ifndef USE_DC
-    remote_conn = num_pes;
-#endif
-    return remote_conn * sizeof(QueuePair);
-  }
-
- protected:
-  /**
-   * @brief flag to indicated that the helper thread reach this milestone
-   */
-  volatile bool network_init_done{false};
-
-  void heap_memory_rkey(char *local_heap_base, size_t heap_size,
-                        MPI_Comm thread_comm, bool is_managed);
-
-  /**
-   * @brief Exchange HDP information between all processing elements.
-   *
-   * Each device has a Host Data Path (HDP) associated with it must be
-   * manually controlled when using fine-grained memory accesses. (The
-   * symmetric heap is allocated with fine-grained memory to support both
-   * host memory accesses and device memory accesses.) The HDP can be
-   * cleared by accessing an address on the device. These addresses must be
-   * shared across the network (to support updates on remote accesses).
-   *
-   * These HDPs are visible to the network by registering them as
-   * InfiniBand memory regions. Every memory region has a remote key
-   * which needs to be shared across the network (to access the memory
-   * region).
-   *
-   * This method is responsible to allocating and initializing the
-   * library's HDP device-side memory and running the all-to-all exchange
-   * to share both the keys and addresses.
-   *
-   * @todo Implement HDP policy class methods to hide most of this
-   * method. The guts should be encapsulated in the policy class and
-   * not exposed here in the backend. Within the policy class methods,
-   * create helper function to improve code reuse regarding the many
-   * data transfers.
-   */
-  void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm);
-
-  /**
-   * @brief Allocate and initialize the atomic region.
-   *
-   * The atomic region is used by the atomic operations which have return
-   * values. The library user does not need to provide an address for the
-   * return value so we are forced to do it on their behalf.
-   *
-   * The atomic_ret member is initialized upon completion of this method.
-   */
-  void setup_atomic_region();
-
-  /**
-   * @brief Allocate and initialize device-side queue pair objects.
-   *
-   * Upon completion, the gpu_qps member will be initialized.
-   */
-  void setup_gpu_qps(GPUIBBackend *B);
-
-  /**
-   * @brief Allocate and initialize device-side memory that will be used for
-   * the return of g shmem ops (eg: shmem_int_g)
-   */
-  void rocshmem_g_init(SymmetricHeap *heap_handle, MPI_Comm thread_comm);
-
-  /**
-   * @brief The backend delegates some InfiniBand connection setup to
-   * the Connection class.
-   */
-  Connection *connection{nullptr};
-
- public:
-  /**
-   * @brief Number of PEs. Get directly from the GPUIBBackend.
-   */
-  int num_pes{0};
-
-  /**
-   * @brief This PE's rank.
-   */
-  int my_pe{-1};
-
-  /**
-   * @brief Number of WG that will be performing communication
-   */
-  int num_blocks{0};
-
-  /**
-   * @brief Holds InfiniBand remote keys for HDP memory regions.
-   *
-   * The member holds a C-array allocation for remote keys (from
-   * InfiniBand memory registrations) for remote HDP registers. The C-array
-   * has one entry for each processing element (indexed by processing
-   * element ID).
-   *
-   * @todo Remove duplication between the backend class and the QueuePair
-   * class. QueuePair stores a copy of this member too. The backend
-   * class does not do much besides initialize this data structure and
-   * hold it until the QueuePair can consume it.
-   */
-  uint32_t *hdp_rkey{nullptr};
-
-  /**
-   * @brief Holds HDP register addresses for each processing element.
-   *
-   * The Host Data Path (HDP) addresses are used to clear a buffer
-   * which interferes with memory visibility of accesses to fine-grained
-   * allocations.
-   *
-   * The member holds a C-array allocation for the register addresses.
-   * The C-array has one entry for each processing element (indexed by
-   * processing element ID).
-   *
-   * @todo Remove duplication between the backend class and the QueuePair
-   * class. QueuePair stores a copy of this member too. The backend
-   * class does not do much besides initialize this data structure and
-   * hold it until the QueuePair can consume it.
-   */
-  uintptr_t *hdp_address{nullptr};
-
-  /**
-   * @brief Handle for the HDP memory region.
-   */
-  ibv_mr *hdp_mr{nullptr};
-
-  /**
-   * @brief Set of QueuePairs used by device to do networking.
-   *
-   * The member is used during Context creation.
-   *
-   * @todo What we really need here is a collection of Contexts that can
-   * either be copied into LDS or used directly by the GPU depending on
-   * what type of context it is (shareable, serialized, or private).
-   * No need to pool up QueuePairs, they can just be managed by their
-   * owning Context. Should then consider pushing into base class since
-   * it's not gpu-ib specific.
-   */
-  QueuePair *gpu_qps{nullptr};
-
-  /**
-   * @brief C-array of symmetric heap base pointers.
-   *
-   * A C-array of char* pointers corresponding to the heap base pointers
-   * virtual address for each processing element that we can communicate
-   * with.
-   */
-  uint32_t *heap_rkey{nullptr};
-
-  /**
-   * @brief Handle for the symmetric heap memory region.
-   */
-  ibv_mr *heap_mr{nullptr};
-
-  /**
-   * @brief Local key for the symmetric heap memory region.
-   */
-  uint32_t lkey{0};
-
-  /**
-   * @brief Control struct for atomic memory region.
-   *
-   * The atomic region is used by the atomic operations which have return
-   * values. The library user does not need to provide an address for the
-   * return value so we are forced to do it on their behalf.
-   */
-  atomic_ret_t *atomic_ret{nullptr};
-
-  /**
-   * @brief Handle for the atomic memory region.
-   *
-   * @todo Provide more descriptive variable name.
-   */
-  ibv_mr *mr{nullptr};
-
-  /**
-   * @brief Buffer used to store the results of a *_g operation.
-   *
-   * These operations do not provide a destination buffer so the runtime
-   * must manage one.
-   */
-  char *g_ret{nullptr};
-
-  /**
-   * @brief Compile-time configuration policy for InfiniBand connections.
-   *
-   * The configuration option "USE_DC" can be enabled to create
-   * Dynamic connection types. By default, Reliable connections are
-   * created.
-   */
-  ConnectionImpl *connection_policy{nullptr};
-};
-
-// clang-format off
-NOWARN(-Wunused-parameter,
-class NetworkOffImpl {
- public:
-  void dump_backend_stats(ROCStats *globalStats) { }
-
-  void reset_backend_stats() { }
-
-  __host__ void networkHostSetup(GPUIBBackend *B);
-
-  __host__ void exchange_hdp_info(HdpPolicy *hdp_policy, MPI_Comm thread_comm);
-
-  __host__ void networkHostFinalize();
-
-  __host__ void networkHostInit(GPUIBContext *ctx, int buffer_id) {}
-
-  __device__ void networkGpuInit(GPUIBContext *ctx, int buffer_id) {}
-
-  __device__ __host__ QueuePair *getQueuePair(QueuePair *qp, int pe) {
-    return nullptr;
-  }
-
-  __device__ __host__ int getNumQueuePairs() { return 0; }
-
-  __device__ __host__ int getNumDest() { return 0; }
-
-  static uint32_t externSharedBytes(int num_pes) { return 0; }
-
- public:
-  int num_pes{0};
-
-  int my_pe{-1};
-
-  int num_blocks{0};
-
-  uint32_t *hdp_rkey{nullptr};
-
-  uintptr_t *hdp_address{nullptr};
-
-  ibv_mr *hdp_mr{nullptr};
-
-  QueuePair *gpu_qps{nullptr};
-
-  uint32_t *heap_rkey{nullptr};
-
-  ibv_mr *heap_mr{nullptr};
-
-  uint32_t lkey{0};
-
-  atomic_ret_t *atomic_ret{nullptr};
-
-  ibv_mr *mr{nullptr};
-
-  char *g_ret{nullptr};
-
-  ConnectionImpl *connection_policy{nullptr};
-};
-)
-// clang-format on
-
-/*
- * Select which one of our IPC policies to use at compile time.
- */
-#ifdef USE_SINGLE_NODE
-typedef NetworkOffImpl NetworkImpl;
-#else
-typedef NetworkOnImpl NetworkImpl;
-#endif
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_NETWORK_POLICY_HPP_
diff --git a/src/gpu_ib/qe_dumper.cpp b/src/gpu_ib/qe_dumper.cpp
deleted file mode 100644
index e2360bd482..0000000000
--- a/src/gpu_ib/qe_dumper.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "qe_dumper.hpp"
-
-namespace rocshmem {
-
-QeDumper::QeDumper(int dest_pe, int src_wg, int index)
-    : dest_pe_(dest_pe), src_wg_(src_wg), index_(index) {
-  void* temp = malloc(sizeof(GPUIBBackend*));
-  gpu_backend_ = static_cast<GPUIBBackend*>(temp);
-
-  GPUIBBackend* device_backend_proxy_address;
-  CHECK_HIP(hipGetSymbolAddress(
-      reinterpret_cast<void**>(&device_backend_proxy_address),
-      HIP_SYMBOL(device_backend_proxy)));
-
-  CHECK_HIP(hipMemcpy(&gpu_backend_, device_backend_proxy_address,
-                      sizeof(GPUIBBackend*), hipMemcpyDeviceToHost));
-
-  int qp_offset = gpu_backend_->num_blocks_ * dest_pe_ + src_wg_;
-
-  qp_ = &(gpu_backend_->networkImpl.gpu_qps[qp_offset]);
-}
-
-QeDumper::~QeDumper() {
-  /*if (gpu_backend_) {
-      free(gpu_backend_);
-  }*/
-}
-
-void QeDumper::dump_cq() {
-  type_ = "CQ";
-
-  auto* raw_cqe = &(qp_->current_cq_q_H[index_]);
-  raw_u64_ = reinterpret_cast<uint64_t*>(raw_cqe);
-
-  dump_uint64_(8);
-}
-
-void QeDumper::dump_sq() {
-  type_ = "SQ";
-
-  auto* raw_sqe = &(qp_->current_sq_H[index_ * 8]);
-  raw_u64_ = reinterpret_cast<uint64_t*>(raw_sqe);
-
-  dump_uint64_(8);
-}
-
-void QeDumper::dump_uint64_(size_t num_elems) const {
-  printf("%s(%d, %d, %d) *** = ", type_.c_str(), dest_pe_, src_wg_, index_);
-
-  for (size_t i = 0; i < num_elems; i++) {
-    printf(" %lx ", raw_u64_[i]);
-  }
-
-  printf("done %s\n", type_.c_str());
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/qe_dumper.hpp b/src/gpu_ib/qe_dumper.hpp
deleted file mode 100644
index c613a190c0..0000000000
--- a/src/gpu_ib/qe_dumper.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
-#define LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
-
-#include <hip/hip_runtime.h>
-#include <infiniband/mlx5dv.h>
-
-#include <string>
-
-#include "backend_ib.hpp"
-#include "queue_pair.hpp"
-
-namespace rocshmem {
-
-class QeDumper {
- public:
-  QeDumper(int dest_pe, int src_wg, int index);
-
-  ~QeDumper();
-
-  void dump_cq();
-
-  void dump_sq();
-
- private:
-  void dump_uint64_(size_t num_elems) const;
-
-  int dest_pe_{-1};
-
-  int src_wg_{-1};
-
-  int index_{-1};
-
-  GPUIBBackend* gpu_backend_{nullptr};
-
-  std::string type_{};
-
-  QueuePair* qp_{nullptr};
-
-  uint64_t* raw_u64_{nullptr};
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_QE_DUMPER_HPP_
diff --git a/src/gpu_ib/queue_pair.cpp b/src/gpu_ib/queue_pair.cpp
deleted file mode 100644
index 49357ff66c..0000000000
--- a/src/gpu_ib/queue_pair.cpp
+++ /dev/null
@@ -1,437 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "queue_pair.hpp"
-
-#include <hip/hip_runtime.h>
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "backend_ib.hpp"
-#include "endian.hpp"
-#include "segment_builder.hpp"
-#include "../util.hpp"
-
-namespace rocshmem {
-
-QueuePair::QueuePair(GPUIBBackend *backend)
-    : hdp_policy(backend->hdp_policy),
-      connection_policy(*backend->networkImpl.connection_policy) {
-  hdp_rkey = backend->networkImpl.hdp_rkey;
-  hdp_address = backend->networkImpl.hdp_address;
-
-  atomic_ret.atomic_lkey = backend->networkImpl.atomic_ret->atomic_lkey;
-  atomic_ret.atomic_counter = 0;
-}
-
-__device__ QueuePair::~QueuePair() {
-  uint64_t start = profiler.startTimer();
-
-  global_qp->sq_counter = sq_counter;
-  global_qp->local_sq_cnt = local_sq_cnt;
-  global_qp->cq_consumer_counter = cq_consumer_counter;
-  global_qp->current_sq = current_sq;
-  global_qp->current_cq_q = current_cq_q;
-  global_qp->sq_overflow = sq_overflow;
-  global_qp->quiet_counter = quiet_counter;
-  profiler.endTimer(start, FINALIZE);
-
-  global_qp->profiler.accumulateStats(profiler);
-
-  __syncthreads();
-}
-
-__device__ uint8_t QueuePair::get_cq_error_syndrome(mlx5_cqe64 *cqe_entry) {
-  mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
-  return cqe_err->syndrome;
-}
-
-__device__ void QueuePair::ring_doorbell(uint64_t db_val) {
-  swap_endian_store(const_cast<uint32_t *>(dbrec_send),
-                    reinterpret_cast<uint32_t>(sq_counter));
-  STORE(db.ptr, db_val);
-  db.uint ^= 256;
-}
-
-__device__ void QueuePair::set_completion_flag_on_wqe(int num_wqes) {
-  uint64_t *wqe = &current_sq[8 * ((sq_counter - num_wqes) % max_nwqe)];
-  uint8_t *wqe_ce = reinterpret_cast<uint8_t *>(wqe) + 11;
-  *wqe_ce = 8;
-}
-
-template <>
-__device__ void QueuePair::update_wqe_ce_single<false>(int num_wqes) {
-  if (sq_counter % max_nwqe == (max_nwqe - 2)) {
-    set_completion_flag_on_wqe(num_wqes);
-    quiet_counter++;
-  }
-}
-
-template <>
-__device__ void QueuePair::update_wqe_ce_single<true>(int num_wqes) {
-  set_completion_flag_on_wqe(num_wqes);
-  quiet_counter++;
-}
-
-template <>
-__device__ void QueuePair::update_wqe_ce_thread<false>(int num_wqes) {}
-
-template <>
-__device__ void QueuePair::update_wqe_ce_thread<true>(int num_wqes) {
-  set_completion_flag_on_wqe(num_wqes);
-  atomicAdd(&quiet_counter, 1);
-}
-
-__device__ void QueuePair::compute_db_val_opcode(uint64_t *db_val,
-                                                 uint16_t dbrec_val,
-                                                 uint8_t opcode) {
-  uint64_t opcode64 = opcode;
-  opcode64 = opcode64 << 24 & 0x000000FFFF000000;
-
-  uint64_t dbrec = dbrec_val << 8;
-  dbrec = dbrec & 0x0000000000FFFF00;
-
-  uint64_t val = *db_val;
-  val = val & 0xFFFFFFFFFF0000FF;
-
-  *db_val = val | dbrec | opcode64;
-}
-
-template <class level>
-__device__ void QueuePair::quiet_internal() {
-  /*
-   * If there are nothing to quiet, just return early.
-   */
-  uint32_t quiet_val = quiet_counter;
-  if (!quiet_val) {
-    return;
-  }
-
-  profiler.incStat(QUIET_COUNT);
-  uint64_t start = profiler.startTimer();
-
-  /*
-   * Generate a pointer to the completion queue entry.
-   */
-  cq_consumer_counter = cq_consumer_counter + quiet_val - 1;
-  uint32_t indx = (cq_consumer_counter % cq_size);
-  mlx5_cqe64 *cqe_entry = &current_cq_q[indx];
-
-  /*
-   * Access the op_own value in the completion queue entry.
-   */
-  int val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
-  uint8_t val_op_own = val_ld;
-
-  /*
-   * If the completion queue entry is not valid, wait for it to become so.
-   */
-  while (!((val_op_own & 0x1) == ((cq_consumer_counter >> cq_log_size) & 1)) ||
-         ((val_op_own) >> 4) == 0xF) {
-    val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
-    val_op_own = val_ld;
-  }
-
-  /*
-   * Grab the opcode from the op_own field and report if it is an error.
-   */
-  uint8_t opcode = val_op_own >> 4;
-  if (opcode != 0) {
-    uint8_t syndrome = get_cq_error_syndrome(cqe_entry);
-    mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
-    GPU_DPRINTF("QUIET ERROR: signature %d opcode_qpn %llx wqe_cnt %llx \n",
-                syndrome, cqe_err->s_wqe_opcode_qpn, cqe_err->wqe_counter);
-  }
-
-  /*
-   * Decrement the quiet count by the amount determined at the beginning
-   * of this method.
-   *
-   * bpotter - There are two areas of concern in this method for me.
-   * 1) In multithreaded builds, we may need to make this method a critical
-   * section to prevent data races on these variables.
-   *
-   * 2) Is there a data race in the API if a one remote process calls quiet
-   * while another process continues adding events? Is it ever possible for
-   * a quiet to complete, but the quiet_counter decrement here is not set
-   * to zero?
-   */
-  level L;
-  L.decQuietCounter(&quiet_counter, quiet_val);
-
-  profiler.endTimer(start, POLL_CQ);
-  start = profiler.startTimer();
-
-  /*
-   * Increment the trailing index counter which tracks our spot in the
-   * completion queue.
-   */
-  cq_consumer_counter++;
-  swap_endian_store(const_cast<uint32_t *>(dbrec_cq), cq_consumer_counter);
-
-  profiler.endTimer(start, NEXT_CQ);
-}
-
-template <class level>
-__device__ void QueuePair::quiet_single() {
-  level L;
-  L.quiet(this);
-}
-
-template <class level>
-__device__ void QueuePair::quiet_single_heavy(int pe) {
-  level L;
-  L.quiet_heavy(this, pe);
-}
-
-template <class level, bool cqe>
-__device__ void QueuePair::update_posted_wqe_generic(
-    int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode,
-    int64_t atomic_data, int64_t atomic_cmp, bool ring_db,
-    uint64_t atomic_ret_pos, bool zero_byte_rd) {
-  uint64_t start = profiler.startTimer();
-
-  level L;
-  L.postLock(this, pe);
-  uint32_t num_wqes = connection_policy.getNumWqes(opcode);
-
-  // Get the index for my thread's put in the SQ.
-  uint64_t my_sq_counter = L.threadAtomicAdd(&sq_counter, num_wqes);
-  uint64_t my_sq_index = my_sq_counter % max_nwqe;
-
-  // 16-bit little endian version of the SQ index needed to build the cntrl
-  // segment in the WQE.
-  uint16_t le_sq_counter;
-  uint16_t sq_counter_u16 = my_sq_counter;
-  swap_endian_store(&le_sq_counter, sq_counter_u16);
-
-  bool flag = sq_overflow;
-  uint32_t lkey_in_stack_frame = lkey;
-  uint32_t rkey_in_stack_frame = rkey;
-  uint32_t ctrl_qp_sq_in_stack_frame = ctrl_qp_sq;
-  uint64_t ctrl_sig_in_stack_frame = ctrl_sig;
-
-  connection_policy.setRkey(&rkey_in_stack_frame, pe);
-
-  if (opcode == MLX5_OPCODE_RDMA_WRITE && !size) {
-    rkey_in_stack_frame = hdp_rkey[pe];
-    size = 4;
-  }
-
-  /*
-   * Build out all the segments required for my WQE(s) based on the
-   * operation, starting at my_sq_index into the SQ. SegmentBuilder will
-   * keep track of placing the segments in the correct location.
-   */
-  SegmentBuilder seg_build(my_sq_index, current_sq);
-  seg_build.update_cntrl_seg(opcode, le_sq_counter, ctrl_qp_sq_in_stack_frame,
-                             ctrl_sig_in_stack_frame, &connection_policy,
-                             zero_byte_rd);
-  seg_build.update_connection_seg(pe, &connection_policy);
-  seg_build.update_rdma_seg(raddr, rkey_in_stack_frame);
-
-  if (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS) {
-    seg_build.update_atomic_data_seg(atomic_data, atomic_cmp);
-    size = 8;
-    lkey_in_stack_frame = atomic_ret.atomic_lkey;
-    laddr = &atomic_ret.atomic_base_ptr[atomic_ret_pos];
-  }
-
-  if (size <= inline_threshold && opcode == MLX5_OPCODE_RDMA_WRITE) {
-    seg_build.update_inl_data_seg(laddr, size);
-  } else {
-    seg_build.update_data_seg(laddr, size, lkey_in_stack_frame);
-  }
-
-  profiler.incStat(WQE_COUNT);
-  profiler.endTimer(start, UPDATE_WQE);
-  start = profiler.startTimer();
-
-  L.template finishPost<cqe>(this, ring_db, num_wqes, pe, le_sq_counter,
-                             opcode);
-
-  profiler.incStat(DB_COUNT);
-  profiler.endTimer(start, RING_SQ_DB);
-}
-
-/******************************************************************************
- ****************************** SHMEM INTERFACE *******************************
- *****************************************************************************/
-template <class level>
-__device__ void QueuePair::put_nbi(void *dest, const void *source,
-                                   size_t nelems, int pe, bool db_ring) {
-  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
-  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
-
-  update_posted_wqe_generic<level, false>(
-      pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
-}
-
-template <class level>
-__device__ void QueuePair::put_nbi_cqe(void *dest, const void *source,
-                                       size_t nelems, int pe, bool db_ring) {
-  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
-  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
-
-  update_posted_wqe_generic<level, true>(
-      pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
-}
-
-template <class level>
-__device__ void QueuePair::get_nbi(void *dest, const void *source,
-                                   size_t nelems, int pe, bool db_ring) {
-  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
-  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
-
-  update_posted_wqe_generic<level, false>(
-      pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
-}
-
-template <class level>
-__device__ void QueuePair::get_nbi_cqe(void *dest, const void *source,
-                                       size_t nelems, int pe, bool db_ring) {
-  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
-  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
-
-  update_posted_wqe_generic<level, true>(
-      pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
-}
-
-template <class level>
-__device__ void QueuePair::zero_b_rd(int pe) {
-  uintptr_t *dst = reinterpret_cast<uintptr_t *>(base_heap[pe]);
-
-  update_posted_wqe_generic<level, true>(pe, 0, nullptr, dst,
-                                         MLX5_OPCODE_RDMA_READ, 0, 0, true, 0,
-                                         true);  // enable 0_byte read op
-}
-
-__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t value,
-                                           int64_t cond, int pe, bool db_ring,
-                                           uint8_t atomic_op) {
-  THREAD TH;
-  uint64_t pos = TH.threadAtomicAdd(
-      reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
-                                             &atomic_ret.atomic_counter));
-
-  pos = pos % max_nb_atomic;
-
-  int64_t *atomic_base_ptr =
-      reinterpret_cast<int64_t *>(atomic_ret.atomic_base_ptr);
-
-  int64_t *load_address = &atomic_base_ptr[pos];
-
-  *load_address = -100;
-
-  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
-
-  update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
-                                          atomic_op, value, cond, db_ring, pos);
-  quiet_single<THREAD>();
-
-  while (uncached_load(load_address) == -100) {
-  }
-
-  int64_t ret = *load_address;
-
-  __threadfence();
-
-  return ret;
-}
-
-__device__ void QueuePair::atomic_nofetch(void *dest, int64_t value,
-                                          int64_t cond, int pe, bool db_ring,
-                                          uint8_t atomic_op) {
-  THREAD TH;
-  uint64_t pos = TH.threadAtomicAdd(
-      reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
-                                             &atomic_ret.atomic_counter));
-  pos = pos % max_nb_atomic;
-  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);
-
-  update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
-                                          atomic_op, value, cond, db_ring, pos);
-
-  quiet_single<THREAD>();
-}
-
-__device__ void QueuePair::fence(int pe) {
-  // TODO(khamidou): should this be replaced by a zero_byte_rd?
-  // FIXME: the relaxed ordering requires an intervening read to order
-  // prior operations.
-  auto remote_hdp_uncast = hdp_address[pe];
-  uintptr_t *remote_hdp = reinterpret_cast<uintptr_t *>(remote_hdp_uncast);
-  update_posted_wqe_generic<THREAD, true>(
-      pe, 0, nullptr, remote_hdp, MLX5_OPCODE_RDMA_WRITE, 0, 0, true, 0);
-}
-
-__device__ void QueuePair::waitCQSpace(int num_msgs) {
-  // We cannot post more outstanding requests than the completion queue
-  // size.  Force a quiet if we are out of space.
-  if ((quiet_counter + num_msgs) >= cq_size) {
-    GPU_DPRINTF(
-        "*** inside post_cq forcing flush: outstanding %d "
-        "adding %d cq_size %d\n",
-        quiet_counter, num_msgs, cq_size);
-
-    // TODO(khamidou): More targeted flush would be better here.
-    quiet_single<THREAD>();
-  }
-}
-
-__device__ void QueuePair::waitSQSpace(int num_msgs) {
-  // We cannot post more outstanding requests than the Send queue
-  // size.  Force a quiet if we are out of space.
-  local_sq_cnt += num_msgs;
-  int div = local_sq_cnt / max_nwqe;
-
-  if (div > 0) {
-    GPU_DPRINTF(
-        "*** inside waitSQSpace forcing flush to overrun the SQ"
-        " sq_counter %d  adding %d quiet_conter %d \n",
-        sq_counter, num_msgs, max_nwqe, quiet_counter);
-
-    quiet_single<THREAD>();
-    local_sq_cnt = local_sq_cnt % max_nwqe;
-  }
-}
-
-void QueuePair::setDBval(uint64_t val) { db_val = val; }
-
-#define THREAD_LEVEL_GEN(T)                                                 \
-  template __device__ void QueuePair::put_nbi<T>(                           \
-      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
-  template __device__ void QueuePair::put_nbi_cqe<T>(                       \
-      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
-  template __device__ void QueuePair::get_nbi<T>(                           \
-      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
-  template __device__ void QueuePair::get_nbi_cqe<T>(                       \
-      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
-  template __device__ void QueuePair::zero_b_rd<T>(int pe);                 \
-  template __device__ void QueuePair::quiet_single<T>();                    \
-  template __device__ void QueuePair::quiet_single_heavy<T>(int pe);        \
-  template __device__ void QueuePair::quiet_internal<T>();
-
-THREAD_LEVEL_GEN(THREAD)
-THREAD_LEVEL_GEN(WG)
-THREAD_LEVEL_GEN(WAVE)
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/queue_pair.hpp b/src/gpu_ib/queue_pair.hpp
deleted file mode 100644
index f17c51b3fe..0000000000
--- a/src/gpu_ib/queue_pair.hpp
+++ /dev/null
@@ -1,431 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
-#define LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
-
-/**
- * @file queue_pair.hpp
- *
- * @section DESCRIPTION
- * An IB QueuePair (SQ and CQ) that the device can use to perform network
- * operations. Most important rocSHMEM operations are performed by this
- * class.
- */
-
-#include <infiniband/mlx5dv.h>
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../atomic_return.hpp"
-#include "connection_policy.hpp"
-#include "thread_policy.hpp"
-#include "../hdp_policy.hpp"
-#include "../stats.hpp"
-
-namespace rocshmem {
-
-class GPUIBBackend;
-
-enum gpu_ib_stats {
-  RING_SQ_DB = 0,
-  UPDATE_WQE,
-  POLL_CQ,
-  NEXT_CQ,
-  QUIET_COUNT,
-  DB_COUNT,
-  WQE_COUNT,
-  MEM_WAIT,
-  INIT,
-  FINALIZE,
-  GPU_IB_NUM_STATS
-};
-
-typedef union db_reg {
-  uint64_t *ptr;
-  uintptr_t uint;
-} db_reg_t;
-
-class QueuePair {
- public:
-  /**
-   * @brief Constructor.
-   *
-   * @param[in] backend Backend needed for member access.
-   */
-  explicit QueuePair(GPUIBBackend *backend);
-
-  /**
-   * @brief Destructor.
-   */
-  __device__ ~QueuePair();
-
-  /**
-   * @brief Inspect completion queue and possibly wait for free space.
-   *
-   * @param[in] num_msgs Number of entries needing space in completion queue.
-   */
-  __device__ void waitCQSpace(int num_msgs);
-
-  /**
-   * @brief Inspect send queue and possibly wait for free space.
-   *
-   * @param[in] num_msgs Number of entries needing space in send queue.
-   */
-  __device__ void waitSQSpace(int num_msgs);
-
-  /**
-   * @brief Create and enqueue a non-blocking put work queue entry (wqe).
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   *
-   * @param[in] dest Destination address for data transmission.
-   * @param[in] source Source address for data transmission.
-   * @param[in] nelems Size in bytes of data transmission.
-   * @param[in] pe Destination processing element of data transmission.
-   * @param[in] db_ring Denotes whether send queue door bell should be rung.
-   */
-  template <class level>
-  __device__ void put_nbi(void *dest, const void *source, size_t nelems, int pe,
-                          bool db_ring);
-
-  /**
-   * @brief Create and enqueue a non-blocking put work queue entry (wqe).
-   *
-   * @note This variant differs from put_nbi by requesting that a completion
-   * queue entry is generated in the completion queue.
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   *
-   * @param[in] dest Destination address for data transmission.
-   * @param[in] source Source address for data transmission.
-   * @param[in] nelems Size in bytes of data transmission.
-   * @param[in] pe Destination processing element of data transmission.
-   * @param[in] db_ring Denotes whether send queue door bell should be rung.
-   */
-  template <class level>
-  __device__ void put_nbi_cqe(void *dest, const void *source, size_t nelems,
-                              int pe, bool db_ring);
-
-  /**
-   * @brief Consume a completion queue entry from this queue pair's
-   * completion queue.
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   */
-  template <class level>
-  __device__ void quiet_single();
-
-  /**
-   * @brief Send a zero-byte read to enforce ordering and then consume
-   * a completion queue entry from this queue pair's completion queue.
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   *
-   * @param[in] pe Processing element id to send the zero_b_rd.
-   */
-  template <class level>
-  __device__ void quiet_single_heavy(int pe);
-
-  /**
-   * @brief Create and enqueue a HDP flush work queue entry on the remote PE.
-   *
-   * @param[in] pe Processing element id to send the HDP flush operation.
-   *
-   * TODO(@khamidou): does this require a zero_b_rd to enforce write ordering
-   * The HDP flush is itself a write. Could this write be reordered with
-   * respect to other write on the network and arrive out-of-order?
-   */
-  __device__ void fence(int pe);
-
-  /**
-   * @brief Create and enqueue a non-blocking get work queue entry (wqe).
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   *
-   * @param[in] dest Destination address for data transmission.
-   * @param[in] source Source address for data transmission.
-   * @param[in] nelems Size in bytes of data transmission.
-   * @param[in] pe Destination processing element of data transmission.
-   * @param[in] db_ring Denotes whether send queue door bell should be rung.
-   */
-  template <class level>
-  __device__ void get_nbi(void *dest, const void *source, size_t nelems, int pe,
-                          bool db_ring);
-
-  /**
-   * @brief Create and enqueue a non-blocking get work queue entry (wqe).
-   *
-   * @note This variant differs from get_nbi by requesting that a completion
-   * queue entry is generated in the completion queue.
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   *
-   * @param[in] dest Destination address for data transmission.
-   * @param[in] source Source address for data transmission.
-   * @param[in] nelems Size in bytes of data transmission.
-   * @param[in] pe Destination processing element of data transmission.
-   * @param[in] db_ring Denotes whether send queue door bell should be rung.
-   */
-  template <class level>
-  __device__ void get_nbi_cqe(void *dest, const void *source, size_t nelems,
-                              int pe, bool db_ring);
-
-  /**
-   * @brief Create and enqueue a zero-byte read to enforce write ordering.
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   *
-   * @param[in] pe Processing element id to send the zero_b_rd.
-   */
-  template <class level>
-  __device__ void zero_b_rd(int pe);
-
-  /**
-   * @brief Create and enqueue an atomic fetch work queue entry (wqe).
-   *
-   * @param[in] dest Destination address for data transmission.
-   * @param[in] value Data value for the atomic operation.
-   * @param[in] cond Used in atomic comparisons.
-   * @param[in] pe Destination processing element of data transmission.
-   * @param[in] db_ring Denotes whether send queue door bell should be rung.
-   * @param[in] atomic_op The atomic operation to perform.
-   *
-   * @return An atomic value
-   */
-  __device__ int64_t atomic_fetch(void *dest, int64_t value, int64_t cond,
-                                  int pe, bool db_ring, uint8_t atomic_op);
-
-  /**
-   * @brief Create and enqueue an atomic fetch work queue entry (wqe).
-   *
-   * @param[in] dest Destination address for data transmission.
-   * @param[in] value Data value for the atomic operation.
-   * @param[in] cond Used in atomic comparisons.
-   * @param[in] pe Destination processing element of data transmission.
-   * @param[in] db_ring Denotes whether send queue door bell should be rung.
-   * @param[in] atomic_op The atomic operation to perform.
-   */
-  __device__ void atomic_nofetch(void *dest, int64_t value, int64_t cond,
-                                 int pe, bool db_ring, uint8_t atomic_op);
-
-  /**
-   * @brief Helper method to set the doorbell's value.
-   *
-   * @param[in] val Desired value for the doorbell.
-   */
-  void setDBval(uint64_t val);
-
- protected:
-  /**
-   * @brief Helper method to build work requests for the send queue.
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   * @tparam cqe Flag to optionally generate cqes.
-   *
-   * @param[in] pe Destination processing element of data transmission.
-   * @param[in] size Size in bytes of data transmission.
-   * @param[in] laddr Local address.
-   * @param[in] raddr Remote address.
-   * @param[in] opcode Operation to be performed.
-   * @param[in] atomic_data An atomic data value to be used.
-   * @param[in] atomic_cmp An atomic comparison operation to be performed.
-   * @param[in] ring_db Boolean denoting if doorbell should be rung.
-   * @param[in] atomic_ret_pos Index into atomic return structure.
-   * @param[in] zero_byte_rd Boolean if zero byte read should be used.
-   */
-  template <class level, bool cqe>
-  __device__ __attribute__((noinline)) void update_posted_wqe_generic(
-      int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode,
-      int64_t atomic_data, int64_t atomic_cmp, bool ring_db,
-      uint64_t atomic_ret_pos, bool zero_byte_rd = false);
-
-  /**
-   * @brief Helper method to drain completion queue entries.
-   *
-   * @tparam level Implements specific behaviors for thread, warp, block access.
-   *
-   */
-  template <class level>
-  __device__ __attribute__((noinline)) void quiet_internal();
-
-  /**
-   * @brief Helper method to compute doorbell value opcode which is used to
-   * ring the doorbell.
-   *
-   * @param[in,out] db_val
-   * @param[in] dbrec_val
-   * @param[in] opcode
-   */
-  __device__ void compute_db_val_opcode(uint64_t *db_val, uint16_t dbrec_val,
-                                        uint8_t opcode);
-
-  /**
-   * @brief Helper method that sets the field in a work queue entry to
-   * generate a completion entry in the completion queue.
-   *
-   * @param num_wqes Number of work entries this completion entry represents.
-   */
-  __device__ void set_completion_flag_on_wqe(int num_wqes);
-
-  /**
-   * @brief Helper method to update fields for the work queue entry.
-   *
-   * @tparam cqe Flag to optionally generate cqes.
-   *
-   * @note Single variant is meant to be callable by a block leader.
-   */
-  template <bool cqe>
-  __device__ void update_wqe_ce_single(int num_wqes);
-
-  /**
-   * @brief Helper method to update fields for the work queue entry.
-   *
-   * @tparam cqe Flag to optionally generate cqes.
-   *
-   * @note Thread variant is meant to be callable by multiple threads.
-   */
-  template <bool cqe>
-  __device__ void update_wqe_ce_thread(int num_wqes);
-
-  /**
-   * @brief Helper method to ring the doorbell
-   *
-   * @param[in] db_val Doorbell value is written by method.
-   */
-  __device__ void ring_doorbell(uint64_t db_val);
-
-  /**
-   * @brief Helper method to extract syndrome field from cqe.
-   *
-   * @param[in] cq_entry Completion queue entry.
-   */
-  __device__ uint8_t get_cq_error_syndrome(mlx5_cqe64 *cq_entry);
-
- private:
-  const int inline_threshold{8};
-
-  /* TODO(bpotter): Most of these should be private/protected */
- public:
-#ifdef PROFILE
-  typedef Stats<GPU_IB_NUM_STATS> GPUIBStats;
-#else
-  typedef NullStats<GPU_IB_NUM_STATS> GPUIBStats;
-#endif
-
-  /*
-   * Pointer to the hardware doorbell register for the QP.
-   */
-  db_reg_t db{};
-
-  /*
-   * Base pointer of this QP's SQ
-   * TODO(bpotter): Use the correct struct type for this.
-   */
-  uint64_t *current_sq{nullptr};
-  uint64_t *current_sq_H{nullptr};
-
-  /*
-   * Base pointer of this QP's CQ
-   */
-  mlx5_cqe64 *current_cq_q{nullptr};
-  mlx5_cqe64 *current_cq_q_H{nullptr};
-
-  /*
-   * Pointer to the doorbell record for this SQ.
-   */
-  volatile uint32_t *dbrec_send{nullptr};
-
-  /*
-   * Pointer to the doorbell record for the CQ.
-   */
-  volatile uint32_t *dbrec_cq{nullptr};
-
-  uint32_t *hdp_rkey{nullptr};
-
-  uintptr_t *hdp_address{nullptr};
-
-  HdpPolicy *hdp_policy{};
-
-  atomic_ret_t atomic_ret{};
-
-  ThreadImpl threadImpl{};
-
-  ConnectionImpl connection_policy;
-
-  char *const *base_heap{nullptr};
-  /*
-   * Current index into the SQ (non-modulo size).
-   */
-  uint32_t sq_counter{0};
-  uint32_t local_sq_cnt{0};
-
-  /*
-   * Number of outstanding messages on this QP that need to be completed
-   * during a quiet operation.
-   */
-  uint32_t quiet_counter{0};
-
-  int num_cqs{0};
-
-  /*
-   * Current index into the SQ (non-module size).
-   */
-  uint32_t cq_consumer_counter{0};
-
-  uint16_t cq_log_size{0};
-
-  uint16_t cq_size{0};
-
-  uint32_t ctrl_qp_sq{0};
-
-  uint64_t ctrl_sig{0};
-
-  uint32_t rkey{0};
-
-  uint32_t lkey{0};
-
-  GPUIBStats profiler{};
-
-  uint16_t max_nwqe{0};
-
-  bool sq_overflow{0};
-
-  uint64_t db_val{};
-  /*
-   * Pointer to the QP in global memory that this QP is copied from.  When
-   * this QP is destroyed, the dynamic (indicies, stats, etc) in the
-   * global_qp are updated.
-   */
-  QueuePair *global_qp{nullptr};
-
-  friend SingleThreadImpl;
-  friend MultiThreadImpl;
-  friend THREAD;
-  friend WG;
-  friend WAVE;
-  friend RCConnectionImpl;
-  friend DCConnectionImpl;
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_QUEUE_PAIR_HPP_
diff --git a/src/gpu_ib/reliable_connection.cpp b/src/gpu_ib/reliable_connection.cpp
deleted file mode 100644
index 7988b6ad00..0000000000
--- a/src/gpu_ib/reliable_connection.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "reliable_connection.hpp"
-
-#include <mpi.h>
-
-#include "backend_ib.hpp"
-
-namespace rocshmem {
-
-ReliableConnection::ReliableConnection(GPUIBBackend* b) : Connection(b, 0) {}
-
-ReliableConnection::~ReliableConnection() {}
-
-Connection::InitQPState ReliableConnection::initqp(uint8_t port) {
-  InitQPState init{};
-
-  init.exp_qp_attr.qp_access_flags =
-      IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
-      IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
-  init.exp_qp_attr.port_num = port;
-
-  init.exp_attr_mask |= IBV_QP_ACCESS_FLAGS;
-
-  return init;
-}
-
-Connection::RtrState ReliableConnection::rtr(dest_info_t* dest, uint8_t port) {
-  RtrState rtr{};
-
-  rtr.exp_qp_attr.dest_qp_num = dest->qpn;
-  rtr.exp_qp_attr.rq_psn = dest->psn;
-  rtr.exp_qp_attr.ah_attr.port_num = port;
-  if (ib_state->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND) {
-    rtr.exp_qp_attr.ah_attr.dlid = dest->lid;
-  } else {
-    rtr.exp_qp_attr.ah_attr.is_global = 1;
-    rtr.exp_qp_attr.ah_attr.grh.dgid = dest->gid;
-    rtr.exp_qp_attr.ah_attr.grh.sgid_index = 0;
-    rtr.exp_qp_attr.ah_attr.grh.hop_limit = 1;
-  }
-
-  rtr.exp_attr_mask |= IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
-                       IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
-
-  return rtr;
-}
-
-Connection::RtsState ReliableConnection::rts(dest_info_t* dest) {
-  RtsState rts{};
-
-  rts.exp_qp_attr.sq_psn = dest->psn;
-
-  rts.exp_attr_mask |= IBV_QP_SQ_PSN;
-
-  return rts;
-}
-
-ibv_qp* ReliableConnection::create_qp_0(ibv_context* context,
-                                        ibv_qp_init_attr_ex* qp_attr) {
-  return ibv_create_qp_ex(context, qp_attr);
-}
-
-void ReliableConnection::create_qps_1() { }
-
-void ReliableConnection::create_qps_2(int port, int my_rank,
-                                        ibv_port_attr* ib_port_att) { }
-
-void ReliableConnection::create_qps_3(int port, ibv_qp* qp, int offset,
-                                        ibv_port_attr* ib_port_att) {
-  init_qp_status(qp, port);
-
-  all_qp[offset].lid = ib_port_att->lid;
-  all_qp[offset].qpn = qp->qp_num;
-  all_qp[offset].psn = 0;
-  union ibv_gid gid;
-  ibv_query_gid(ib_state->context, port, 0, &gid);
-  all_qp[offset].gid = gid;
-}
-
-void ReliableConnection::get_remote_conn(int* remote_conn) {
-  *remote_conn = backend->num_pes;
-}
-
-void ReliableConnection::allocate_dynamic_members(int num_blocks) {
-  all_qp.resize(backend->num_pes * num_blocks);
-}
-
-void ReliableConnection::free_dynamic_members() {
-}
-
-void ReliableConnection::initialize_1(int port, int num_blocks) {
-  MPI_Alltoall(MPI_IN_PLACE, sizeof(dest_info_t) * num_blocks, MPI_CHAR,
-               all_qp.data(), sizeof(dest_info_t) * num_blocks, MPI_CHAR,
-               backend->thread_comm);
-
-  for (int i = 0; i < qps.size(); i++) {
-    change_status_rtr(qps[i], &all_qp[i], port);
-  }
-
-  MPI_Barrier(backend->thread_comm);
-
-  for (int i = 0; i < qps.size(); i++) {
-    change_status_rts(qps[i], &all_qp[i]);
-  }
-}
-
-void ReliableConnection::initialize_rkey_handle(uint32_t** heap_rkey_handle,
-                                                  ibv_mr* mr) {
-  CHECK_HIP(
-      hipHostMalloc(heap_rkey_handle, sizeof(uint32_t) * backend->num_pes));
-  (*heap_rkey_handle)[backend->my_pe] = mr->rkey;
-}
-
-void ReliableConnection::free_rkey_handle(uint32_t* heap_rkey_handle) {
-  CHECK_HIP(hipHostFree(heap_rkey_handle));
-}
-
-Connection::QPInitAttr ReliableConnection::qpattr(ibv_qp_cap cap) {
-  QPInitAttr qpattr(cap);
-  qpattr.attr.qp_type = IBV_QPT_RC;
-  return qpattr;
-}
-
-void ReliableConnection::post_dv_rc_wqe(int remote_conn) {
-  mlx5_wqe_ctrl_seg* ctrl;
-  mlx5_wqe_raddr_seg* rdma;
-  mlx5_wqe_data_seg* data;
-
-  for (int i = 0; i < remote_conn; i++) {
-    int num_blocks = backend->num_blocks_;
-    for (int j = 0; j < num_blocks; j++) {
-      int qp_index = i * num_blocks + j;
-      uint64_t* ptr = get_address_sq(qp_index);
-
-      const uint16_t nb_post = 1;  // 4 * sq_size;
-      for (uint16_t index = 0; index < nb_post; index++) {
-        uint8_t op_mod = 0;
-        uint8_t op_code = 8;
-        uint32_t qp_num = qps[qp_index]->qp_num;
-        uint8_t fm_ce_se = 0;
-        uint8_t ds = 3;
-        ctrl = reinterpret_cast<mlx5_wqe_ctrl_seg*>(ptr);
-        mlx5dv_set_ctrl_seg(ctrl, index, op_code, op_mod, qp_num, fm_ce_se, ds,
-                            0, 0);
-        ptr = ptr + 2;
-
-        rdma = reinterpret_cast<mlx5_wqe_raddr_seg*>(ptr);
-        const auto& heap_bases = backend->heap.get_heap_bases();
-        auto temp = heap_bases[(backend->my_pe + 1) % 2];
-        uint64_t r_address = reinterpret_cast<uint64_t>(temp);
-        uint32_t rkey = backend->networkImpl.heap_rkey[i];
-        set_rdma_seg(rdma, r_address, rkey);
-        ptr = ptr + 2;
-
-        data = reinterpret_cast<mlx5_wqe_data_seg*>(ptr);
-        uint32_t lkey = backend->networkImpl.heap_mr->lkey;
-        temp = heap_bases[backend->my_pe];
-        uint64_t address = reinterpret_cast<uint64_t>(temp);
-        mlx5dv_set_data_seg(data, 1, lkey, address);
-        ptr = ptr + 4;
-      }
-    }
-  }
-}
-
-// TODO(bpotter): remove redundancies with the other derived class
-void ReliableConnection::post_wqes() {
-  int remote_conn;
-  get_remote_conn(&remote_conn);
-  post_dv_rc_wqe(remote_conn);
-}
-
-void ReliableConnection::initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah,
-                                              int dc_key) {}
-
-int ReliableConnection::get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) {
-  return pe_idx * num_qps + wg_idx;
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/reliable_connection.hpp b/src/gpu_ib/reliable_connection.hpp
deleted file mode 100644
index e642ab7990..0000000000
--- a/src/gpu_ib/reliable_connection.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
-#define LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
-
-#include <vector>
-
-#include "connection.hpp"
-
-namespace rocshmem {
-
-class ReliableConnection : public Connection {
- public:
-  explicit ReliableConnection(GPUIBBackend* backend);
-
-  ~ReliableConnection() override;
-
-  void get_remote_conn(int* remote_conn) override;
-
-  void post_wqes() override;
-
-  void initialize_rkey_handle(uint32_t** heap_rkey_handle,
-                                ibv_mr* mr) override;
-
-  void free_rkey_handle(uint32_t* heap_rkey_handle) override;
-
- private:
-  InitQPState initqp(uint8_t port) override;
-
-  RtrState rtr(dest_info_t* dest, uint8_t port) override;
-
-  RtsState rts(dest_info_t* dest) override;
-
-  QPInitAttr qpattr(ibv_qp_cap cap) override;
-
-  void create_qps_1() override;
-
-  void create_qps_2(int port, int my_rank,
-                      ibv_port_attr* ib_port_att) override;
-
-  void create_qps_3(int port, ibv_qp* qp, int offset,
-                      ibv_port_attr* ib_port_att) override;
-
-  ibv_qp* create_qp_0(ibv_context* context,
-                      ibv_qp_init_attr_ex* qp_attr) override;
-
-  void allocate_dynamic_members(int num_wg) override;
-
-  void free_dynamic_members() override;
-
-  void initialize_1(int port, int num_wg) override;
-
-  void initialize_wr_fields(ibv_send_wr* wr, ibv_ah* ah, int dc_key) override;
-
-  int get_sq_dv_offset(int pe_idx, int num_qps, int wg_idx) override;
-
-  std::vector<dest_info_t> all_qp;
-
-  void post_dv_rc_wqe(int remote_conn);
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_RELIABLE_CONNECTION_HPP_
diff --git a/src/gpu_ib/segment_builder.cpp b/src/gpu_ib/segment_builder.cpp
deleted file mode 100644
index 6f2a77a23e..0000000000
--- a/src/gpu_ib/segment_builder.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "segment_builder.hpp"
-
-#include "../util.hpp"
-#include "endian.hpp"
-
-namespace rocshmem {
-
-__device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) {
-  mlx5_segment *base_ptr = static_cast<mlx5_segment *>(base);
-  size_t segment_offset = SEGMENTS_PER_WQE * wqe_idx;
-  seg_ptr = &base_ptr[segment_offset];
-}
-
-__device__ void SegmentBuilder::update_cntrl_seg(
-    uint8_t opcode, uint16_t wqe_idx, uint32_t ctrl_qp_sq, uint64_t ctrl_sig,
-    ConnectionImpl *connection_policy, bool zero_byte_rd) {
-  mlx5_wqe_ctrl_seg ctrl_seg;
-
-  ctrl_seg.opmod_idx_opcode = (opcode << 24) | (wqe_idx << 8);
-
-  uint32_t DS = 2;
-  if (zero_byte_rd == false) {
-    DS = (opcode == MLX5_OPCODE_RDMA_WRITE || opcode == MLX5_OPCODE_RDMA_READ)
-             ? 3
-             : 4;
-  }
-
-  DS += connection_policy->wqeCntrlOffset();
-
-  ctrl_seg.qpn_ds = (DS << 24) | ctrl_qp_sq;
-
-  ctrl_seg.signature = ctrl_sig;
-
-  ctrl_seg.fm_ce_se = ctrl_sig >> 24;
-
-  ctrl_seg.imm = ctrl_sig >> 32;
-
-  memcpy(&seg_ptr->ctrl_seg, &ctrl_seg, sizeof(mlx5_wqe_ctrl_seg));
-
-  seg_ptr++;
-}
-
-__device__ void SegmentBuilder::update_atomic_data_seg(uint64_t atomic_data,
-                                                       uint64_t atomic_cmp) {
-  mlx5_wqe_atomic_seg atomic_seg;
-
-  swap_endian_store(reinterpret_cast<uint64_t *>(&atomic_seg.swap_add),
-                    atomic_data);
-
-  swap_endian_store(reinterpret_cast<uint64_t *>(&atomic_seg.compare),
-                    atomic_cmp);
-
-  memcpy(&seg_ptr->atomic_seg, &atomic_seg, sizeof(mlx5_wqe_atomic_seg));
-  seg_ptr++;
-}
-
-__device__ void SegmentBuilder::update_rdma_seg(uintptr_t *raddr,
-                                                uint32_t rkey) {
-  mlx5_wqe_raddr_seg raddr_seg;
-
-  raddr_seg.rkey = rkey;
-
-  swap_endian_store(reinterpret_cast<uint64_t *>(&raddr_seg.raddr),
-                    reinterpret_cast<uint64_t>(raddr));
-
-  memcpy(&seg_ptr->raddr_seg, &raddr_seg, sizeof(mlx5_wqe_raddr_seg));
-  seg_ptr++;
-}
-
-__device__ void SegmentBuilder::update_data_seg(uintptr_t *laddr, int32_t size,
-                                                uint32_t lkey) {
-  if (laddr == nullptr) {
-    return;
-  }
-
-  mlx5_wqe_data_seg data_seg;
-  data_seg.lkey = lkey;
-
-  swap_endian_store(&data_seg.byte_count, size & 0x7FFFFFFFU);
-  swap_endian_store(reinterpret_cast<uint64_t *>(&data_seg.addr),
-                    reinterpret_cast<uint64_t>(laddr));
-
-  memcpy(&seg_ptr->data_seg, &data_seg, sizeof(mlx5_wqe_data_seg));
-  seg_ptr++;
-}
-
-__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr,
-                                                    int32_t size) {
-  mlx5_wqe_inl_data_seg inl_data_seg;
-
-  swap_endian_store(&inl_data_seg.byte_count, (size & 0x3FF) | 0x80000000);
-
-  // Assume fence HDP flush
-  // TODO(khamidou): Rework fence interface to avoid this
-  size_t field_size{sizeof(mlx5_wqe_inl_data_seg)};
-  if (!laddr) {
-    uint8_t flush_val = 1;
-    memcpy(&inl_data_seg + 1, &flush_val, sizeof(flush_val));
-    field_size += sizeof(flush_val);
-  } else {
-    memcpy(&inl_data_seg + 1, laddr, size);
-    field_size += size;
-  }
-
-  memcpy(&seg_ptr->inl_data_seg, &inl_data_seg, field_size);
-  seg_ptr++;
-}
-
-__device__ void SegmentBuilder::update_connection_seg(
-    int pe, ConnectionImpl *conn_policy) {
-  if (conn_policy->updateConnectionSegmentImpl(&seg_ptr->base_av, pe)) {
-    seg_ptr++;
-  }
-}
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/segment_builder.hpp b/src/gpu_ib/segment_builder.hpp
deleted file mode 100644
index 866ff2c75b..0000000000
--- a/src/gpu_ib/segment_builder.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
-#define LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
-
-#include <infiniband/mlx5dv.h>
-
-#include "connection_policy.hpp"
-#include "infiniband_structs.hpp"
-#include "../util.hpp"
-
-namespace rocshmem {
-
-class SegmentBuilder {
- public:
-  __device__ SegmentBuilder(uint64_t wqe_idx, void *base);
-
-  __device__ void update_cntrl_seg(uint8_t opcode, uint16_t wqe_idx,
-                                   uint32_t ctrl_qp_sq, uint64_t ctrl_sig,
-                                   ConnectionImpl *connection_policy,
-                                   bool zero_byte_rd);
-
-  __device__ void update_connection_seg(int pe,
-                                        ConnectionImpl *connection_policy);
-
-  __device__ void update_atomic_data_seg(uint64_t atomic_data,
-                                         uint64_t atomic_cmp);
-
-  __device__ void update_rdma_seg(uintptr_t *raddr, uint32_t rkey);
-
-  __device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size);
-
-  __device__ void update_data_seg(uintptr_t *laddr, int32_t size,
-                                  uint32_t lkey);
-
- private:
-  const int SEGMENTS_PER_WQE = 4;
-
-  mlx5_segment *seg_ptr;
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_SEGMENT_BUILDER_HPP_
diff --git a/src/gpu_ib/thread_policy.cpp b/src/gpu_ib/thread_policy.cpp
deleted file mode 100644
index 80dbcf5482..0000000000
--- a/src/gpu_ib/thread_policy.cpp
+++ /dev/null
@@ -1,358 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "thread_policy.hpp"
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "queue_pair.hpp"
-
-namespace rocshmem {
-
-__device__ void SingleThreadImpl::quiet(QueuePair *handle) {
-  handle->quiet_internal<THREAD>();
-}
-
-__device__ void SingleThreadImpl::quiet_heavy(QueuePair *handle, int pe) {
-  handle->zero_b_rd<THREAD>(pe);
-  handle->quiet_internal<THREAD>();
-}
-
-__device__ void MultiThreadImpl::quiet(QueuePair *handle) {
-  int thread_id = get_flat_block_id();
-  /*
-   * Each WF selects one thread to perform the quiet.  Only one thread
-   * per WG is allowed to do a quiet at once to avoid races with the CQ.
-   */
-  if (thread_id % WF_SIZE == lowerID()) {
-    while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
-    }
-    handle->quiet_internal<THREAD>();
-    __threadfence();
-    handle->threadImpl.cq_lock = 0;
-  }
-}
-
-__device__ void MultiThreadImpl::quiet_heavy(QueuePair *handle, int pe) {
-  int thread_id = get_flat_block_id();
-  /*
-   * Each WF selects one thread to perform the quiet.  Only one thread
-   * per WG is allowed to do a quiet at once to avoid races with the CQ.
-   */
-  if (thread_id % WF_SIZE == lowerID()) {
-    // zero_byte read
-    handle->zero_b_rd<THREAD>(pe);
-
-    while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
-    }
-    handle->quiet_internal<THREAD>();
-    __threadfence();
-    handle->threadImpl.cq_lock = 0;
-  }
-}
-
-__device__ void WG::quiet(QueuePair *handle) { handle->quiet_internal<WG>(); }
-
-__device__ void WG::quiet_heavy(QueuePair *handle, int pe) {
-  handle->zero_b_rd<THREAD>(pe);
-  handle->quiet_internal<WG>();
-}
-
-__device__ void WAVE::quiet(QueuePair *handle) {
-  int thread_id = get_flat_block_id();
-  /*
-   * Each WF selects one thread to perform the quiet.  Only one thread
-   * per WG is allowed to do a quiet at once to avoid races with the CQ.
-   */
-
-  if (thread_id % WF_SIZE == 0) {
-    while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
-    }
-    handle->quiet_internal<WAVE>();
-    __threadfence();
-    handle->threadImpl.cq_lock = 0;
-  }
-}
-
-__device__ void WAVE::quiet_heavy(QueuePair *handle, int pe) {
-  int thread_id = get_flat_block_id();
-  /*
-   * Each WF selects one thread to perform the quiet.  Only one thread
-   * per WG is allowed to do a quiet at once to avoid races with the CQ.
-   */
-  if (thread_id % WF_SIZE == 0) {
-    // post a zero-byte read
-    handle->zero_b_rd<THREAD>(pe);
-    while (atomicCAS(&(handle->threadImpl.cq_lock), 0, 1) == 1) {
-    }
-    handle->quiet_internal<WAVE>();
-    __threadfence();
-    handle->threadImpl.cq_lock = 0;
-  }
-}
-
-__device__ void SingleThreadImpl::decQuietCounter(uint32_t *quiet_counter,
-                                                  int num) {
-  *quiet_counter -= num;
-}
-
-__device__ void MultiThreadImpl::decQuietCounter(uint32_t *quiet_counter,
-                                                 int num) {
-  atomicSub(quiet_counter, num);
-}
-
-__device__ void WG::decQuietCounter(uint32_t *quiet_counter, int num) {
-  *quiet_counter -= num;
-}
-
-__device__ void WAVE::decQuietCounter(uint32_t *quiet_counter, int num) {
-  *quiet_counter -= num;
-}
-
-template <bool cqe>
-__device__ void SingleThreadImpl::finishPost(QueuePair *handle, bool ring_db,
-                                             int num_wqes, int pe,
-                                             uint16_t le_sq_counter,
-                                             uint8_t opcode) {
-  if (ring_db) {
-    uint64_t db_val = handle->db_val;
-    handle->compute_db_val_opcode(&db_val, le_sq_counter, opcode);
-    handle->update_wqe_ce_single<cqe>(num_wqes);
-    handle->ring_doorbell(db_val);
-  }
-}
-
-template <bool cqe>
-__device__ void MultiThreadImpl::finishPost(QueuePair *handle, bool ring_db,
-                                            int num_wqes, int pe,
-                                            uint16_t le_sq_counter,
-                                            uint8_t opcode) {
-  /*
-   * For RC, we can't allow a wave to have different PEs in it, else the
-   * doorbell ringing logic will not work.  This little for loop forces
-   * control flow divergence based on the PE.  It works well for small
-   * numbers of PEs, but we might want a different solution for large
-   * numbers.
-   */
-  if (handle->connection_policy.forcePostDivergence()) {
-    for (int i = 0; i < handle->num_cqs; i++) {
-      if (i != pe) {
-        continue;
-      }
-
-      finishPost_internal<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
-                               opcode);
-    }
-  } else {
-    finishPost_internal<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
-                             opcode);
-  }
-}
-
-template <bool cqe>
-__device__ void MultiThreadImpl::finishPost_internal(QueuePair *handle,
-                                                     bool ring_db, int num_wqes,
-                                                     int pe,
-                                                     uint16_t le_sq_counter,
-                                                     uint8_t opcode) {
-  /*
-   * Assuming here that postLock locks out all wavefronts in this WG but
-   * one, and that this will select a single thread in the wavefront.
-   */
-  __threadfence();
-  if (get_flat_block_id() % WF_SIZE == lowerID()) {
-    if (ring_db) {
-      uint64_t db_val =
-          handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
-                                  handle->max_nwqe)];
-      handle->update_wqe_ce_thread<true>(num_wqes);
-      handle->ring_doorbell(db_val);
-    }
-
-    handle->threadImpl.sq_lock = 0;
-  }
-}
-
-template <bool cqe>
-__device__ void WG::finishPost(QueuePair *handle, bool ring_db, int num_wqes,
-                               int pe, uint16_t le_sq_counter, uint8_t opcode) {
-  if (ring_db) {
-    uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
-                                              handle->max_nwqe)];
-    handle->update_wqe_ce_single<cqe>(num_wqes);
-    handle->ring_doorbell(db_val);
-  }
-}
-
-template <bool cqe>
-__device__ void WAVE::finishPost(QueuePair *handle, bool ring_db, int num_wqes,
-                                 int pe, uint16_t le_sq_counter,
-                                 uint8_t opcode) {
-  if (ring_db) {
-    uint64_t db_val = handle->current_sq[8 * ((handle->sq_counter - num_wqes) %
-                                              handle->max_nwqe)];
-    handle->update_wqe_ce_thread<cqe>(num_wqes);
-    handle->ring_doorbell(db_val);
-  }
-  handle->threadImpl.sq_lock = 0;
-}
-
-__device__ void SingleThreadImpl::postLock(QueuePair *handle, int pe) {
-  handle->hdp_policy->hdp_flush();
-  // handle->waitCQSpace(1);
-  handle->waitSQSpace(1);
-}
-
-__device__ void MultiThreadImpl::postLock_internal(QueuePair *handle) {
-  int thread_id = get_flat_block_id();
-  int active_threads = wave_SZ();
-
-  if (thread_id % WF_SIZE == lowerID()) {
-    handle->hdp_policy->hdp_flush();
-    /*
-     * Don't let more than one wave in this WG go any further or a
-     * horrible variety of impossible to debug race conditions can occur.
-     */
-    while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) {
-    }
-
-    /*
-     * This is a tiny bit over-aggressive as it assumes that all of the
-     * active_threads are going to the same PE when calculating whether
-     * we are full.
-     */
-    // handle->waitCQSpace(active_threads);
-    handle->waitSQSpace(active_threads);
-  }
-
-  /*
-   * Double check we've got the same exec mask (assuming divergence after
-   * the previous if.
-   */
-  if (active_threads != wave_SZ()) {
-    __builtin_trap();
-  }
-}
-
-__device__ void MultiThreadImpl::postLock(QueuePair *handle, int pe) {
-  /*
-   * For RC, we can't allow a wave to have different PEs in it, else the
-   * doorbell ringing logic will not work.  This little for loop forces
-   * control flow divergence based on the PE.  It works well for small
-   * numbers of PEs, but we might want a different solution for large
-   * numbers.
-   */
-  if (handle->connection_policy.forcePostDivergence()) {
-    for (int i = 0; i < handle->num_cqs; i++) {
-      if (i != pe) {
-        continue;
-      }
-
-      postLock_internal(handle);
-    }
-  } else {
-    postLock_internal(handle);
-  }
-}
-
-__device__ void WG::postLock(QueuePair *handle, int pe) {
-  handle->hdp_policy->hdp_flush();
-  // handle->waitCQSpace(1);
-  handle->waitSQSpace(1);
-}
-
-__device__ void WAVE::postLock(QueuePair *handle, int pe) {
-  handle->hdp_policy->hdp_flush();
-
-  /*
-   * Don't let more than one wave in this WG go any further or a horrible
-   * variety of impossible to debug race conditions can occur.
-   */
-  while (atomicCAS(&(handle->threadImpl.sq_lock), 0, 1) == 1) {
-  }
-
-  /*
-   * This is a tiny bit over-aggressive as it assumes that all of the
-   * active_threads are going to the same PE when calculating whether
-   * we are full.
-   */
-  // handle->waitCQSpace(1);
-  handle->waitSQSpace(1);
-}
-
-template <typename T>
-__device__ T SingleThreadImpl::threadAtomicAdd(T *val, T value) {
-  T old_val = *val;
-  *val += value;
-  return old_val;
-}
-
-template <typename T>
-__device__ T MultiThreadImpl::threadAtomicAdd(T *val, T value) {
-  return atomicAdd(val, value);
-}
-
-template <typename T>
-__device__ T WG::threadAtomicAdd(T *val, T value) {
-  T old_val = *val;
-  *val += value;
-  return old_val;
-}
-
-template <typename T>
-__device__ T WAVE::threadAtomicAdd(T *val, T value) {
-  return atomicAdd(val, value);
-}
-
-#define TYPE_GEN(T)                                                            \
-  template __device__ T SingleThreadImpl::threadAtomicAdd<T>(T * val,          \
-                                                             T value);         \
-  template __device__ T MultiThreadImpl::threadAtomicAdd<T>(T * val, T value); \
-  template __device__ T WG::threadAtomicAdd<T>(T * val, T value);              \
-  template __device__ T WAVE::threadAtomicAdd<T>(T * val, T value);
-
-TYPE_GEN(float)
-TYPE_GEN(double)
-TYPE_GEN(int)
-TYPE_GEN(unsigned int)
-TYPE_GEN(unsigned long long)  // NOLINT(runtime/int)
-
-#define TYPE_BOOL(T)                                                \
-  template __device__ void SingleThreadImpl::finishPost<T>(         \
-      QueuePair * handle, bool ring_db, int num_wqes, int pe,       \
-      uint16_t le_sq_counter, uint8_t opcode);                      \
-  template __device__ void MultiThreadImpl::finishPost<T>(          \
-      QueuePair * handle, bool ring_db, int num_wqes, int pe,       \
-      uint16_t le_sq_counter, uint8_t opcode);                      \
-  template __device__ void WG::finishPost<T>(                       \
-      QueuePair * handle, bool ring_db, int num_wqes, int pe,       \
-      uint16_t le_sq_counter, uint8_t opcode);                      \
-  template __device__ void WAVE::finishPost<T>(                     \
-      QueuePair * handle, bool ring_db, int num_wqes, int pe,       \
-      uint16_t le_sq_counter, uint8_t opcode);                      \
-  template __device__ void MultiThreadImpl::finishPost_internal<T>( \
-      QueuePair * handle, bool ring_db, int num_wqes, int pe,       \
-      uint16_t le_sq_counter, uint8_t opcode);
-
-TYPE_BOOL(true)
-TYPE_BOOL(false)
-
-}  // namespace rocshmem
diff --git a/src/gpu_ib/thread_policy.hpp b/src/gpu_ib/thread_policy.hpp
deleted file mode 100644
index be79cb350e..0000000000
--- a/src/gpu_ib/thread_policy.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
-#define LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
-
-#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
-#include "../util.hpp"
-
-namespace rocshmem {
-
-class QueuePair;
-
-/*
- * GPU single-thread policy class. Only a single work-item per work-group
- * is allowed to call into a rocSHMEM function (unless it is specifically
- * called out as a collective API. This thread policy is the fastest but
- * is not as flexible.
- */
-class SingleThreadImpl {
- public:
-  uint32_t cq_lock = 0;
-  uint32_t sq_lock = 0;
-
-  __device__ void quiet(QueuePair *handle);
-
-  __device__ void quiet_heavy(QueuePair *handle, int pe);
-
-  __device__ void decQuietCounter(uint32_t *quiet_counter, int num);
-
-  template <bool cqe>
-  __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
-                             int pe, uint16_t le_sq_counter, uint8_t opcode);
-
-  __device__ void postLock(QueuePair *handle, int pe);
-
-  template <typename T>
-  __device__ T threadAtomicAdd(T *val, T value = 1);
-};
-
-/*
- * GPU multi-thread policy class. Multiple work-items per work-group are
- * allowed to call into a rocSHMEM function.  A bit slower than its
- * single-thread counterpart but it enables a much more flexible user-facing
- * API.
- */
-class MultiThreadImpl {
-  /*
-   * Per-wg locks for the CQ and the SQ, respectively.
-   */
-  template <bool cqe>
-  __device__ void finishPost_internal(QueuePair *handle, bool ring_db,
-                                      int num_wqes, int pe,
-                                      uint16_t le_sq_counter, uint8_t opcode);
-
-  __device__ void postLock_internal(QueuePair *handle);
-
- public:
-  uint32_t cq_lock = 0;
-  uint32_t sq_lock = 0;
-
-  __device__ void quiet(QueuePair *handle);
-
-  __device__ void quiet_heavy(QueuePair *handle, int pe);
-
-  __device__ void decQuietCounter(uint32_t *quiet_counter, int num);
-
-  template <bool cqe>
-  __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
-                             int pe, uint16_t le_sq_counter, uint8_t opcode);
-
-  __device__ void postLock(QueuePair *handle, int pe);
-
-  template <typename T>
-  __device__ T threadAtomicAdd(T *val, T value = 1);
-};
-
-/*
- * Select which one of our thread policies to use at compile time.
- */
-#ifdef USE_THREADS
-typedef MultiThreadImpl ThreadImpl;
-#else
-typedef SingleThreadImpl ThreadImpl;
-#endif
-
-class THREAD {
- public:
-  ThreadImpl threadImpl;
-
-  __device__ void quiet(QueuePair *handle) { threadImpl.quiet(handle); }
-
-  __device__ void quiet_heavy(QueuePair *handle, int pe) {
-    threadImpl.quiet_heavy(handle, pe);
-  }
-
-  __device__ void decQuietCounter(uint32_t *quiet_counter, int num) {
-    threadImpl.decQuietCounter(quiet_counter, num);
-  }
-
-  template <bool cqe>
-  __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
-                             int pe, uint16_t le_sq_counter, uint8_t opcode) {
-    threadImpl.finishPost<cqe>(handle, ring_db, num_wqes, pe, le_sq_counter,
-                               opcode);
-  }
-
-  __device__ void postLock(QueuePair *handle, int pe) {
-    threadImpl.postLock(handle, pe);
-  }
-
-  template <typename T>
-  __device__ T threadAtomicAdd(T *val, T value = 1) {
-    T tmp = threadImpl.threadAtomicAdd(val, value);
-    return tmp;
-  }
-};
-
-class WAVE {
- public:
-  __device__ void quiet(QueuePair *handle);
-
-  __device__ void quiet_heavy(QueuePair *handle, int pe);
-
-  __device__ void decQuietCounter(uint32_t *quiet_counter, int num);
-
-  template <bool cqe>
-  __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
-                             int pe, uint16_t le_sq_counter, uint8_t opcode);
-
-  __device__ void postLock(QueuePair *handle, int pe);
-
-  template <typename T>
-  __device__ T threadAtomicAdd(T *val, T value = 1);
-};
-
-class WG {
- public:
-  __device__ void quiet(QueuePair *handle);
-
-  __device__ void quiet_heavy(QueuePair *handle, int pe);
-
-  __device__ void decQuietCounter(uint32_t *quiet_counter, int num);
-
-  template <bool cqe>
-  __device__ void finishPost(QueuePair *handle, bool ring_db, int num_wqes,
-                             int pe, uint16_t le_sq_counter, uint8_t opcode);
-
-  __device__ void postLock(QueuePair *handle, int pe);
-
-  template <typename T>
-  __device__ T threadAtomicAdd(T *val, T value = 1);
-};
-
-}  // namespace rocshmem
-
-#endif  // LIBRARY_SRC_GPU_IB_THREAD_POLICY_HPP_
diff --git a/src/ipc/backend_ipc.cpp b/src/ipc/backend_ipc.cpp
index 9cd99f9329..172472b474 100644
--- a/src/ipc/backend_ipc.cpp
+++ b/src/ipc/backend_ipc.cpp
@@ -35,7 +35,7 @@ namespace rocshmem {
 
 extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
 
-rocshmem_team_t get_external_team(GPUIBTeam *team) {
+rocshmem_team_t get_external_team(IPCTeam *team) {
   return reinterpret_cast<rocshmem_team_t>(team);
 }
 
@@ -223,7 +223,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team,
    * Allocate device-side memory for team_world and
    * construct a IPC team in it
    */
-  GPUIBTeam *new_team_obj;
+  IPCTeam *new_team_obj;
   CHECK_HIP(hipMalloc(&new_team_obj, sizeof(IPCTeam)));
   new (new_team_obj)
       IPCTeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes,
diff --git a/src/ipc/context_ipc_device.hpp b/src/ipc/context_ipc_device.hpp
index a2ab824fc5..d2e0f49565 100644
--- a/src/ipc/context_ipc_device.hpp
+++ b/src/ipc/context_ipc_device.hpp
@@ -298,4 +298,4 @@ class IPCContext : public Context {
 
 }  // namespace rocshmem
 
-#endif  // LIBRARY_SRC_GPU_IB_CONTEXT_IB_DEVICE_HPP_
+#endif // LIBRARY_SRC_IPC_CONTEXT_DEVICE_HPP_
diff --git a/src/memory/symmetric_heap.hpp b/src/memory/symmetric_heap.hpp
index ac36108273..4c6a39396c 100644
--- a/src/memory/symmetric_heap.hpp
+++ b/src/memory/symmetric_heap.hpp
@@ -35,7 +35,7 @@
  * both host access and device access to the memory space.
  *
  * The symmetric heaps are visible to network by registering them as
- * InfiniBand memory regions. Every memory region has a remote key
+ * memory regions. Every memory region has a remote key
  * which needs to be shared across the network (to access the memory
  * region).
  */
diff --git a/src/reverse_offload/context_ro_tmpl_host.hpp b/src/reverse_offload/context_ro_tmpl_host.hpp
index fb9a319d18..e989f2c5a9 100644
--- a/src/reverse_offload/context_ro_tmpl_host.hpp
+++ b/src/reverse_offload/context_ro_tmpl_host.hpp
@@ -29,14 +29,14 @@ namespace rocshmem {
 
 template <typename T>
 __host__ void ROHostContext::p(T *dest, T value, int pe) {
-  DPRINTF("Function: gpu_ib_host_p\n");
+  DPRINTF("Function: ro_host_p\n");
 
   host_interface->p<T>(dest, value, pe, context_window_info);
 }
 
 template <typename T>
 __host__ T ROHostContext::g(const T *source, int pe) {
-  DPRINTF("Function: gpu_ib_host_g\n");
+  DPRINTF("Function: ro_host_g\n");
 
   return host_interface->g<T>(source, pe, context_window_info);
 }
@@ -44,7 +44,7 @@ __host__ T ROHostContext::g(const T *source, int pe) {
 template <typename T>
 __host__ void ROHostContext::put(T *dest, const T *source, size_t nelems,
                                  int pe) {
-  DPRINTF("Function: gpu_ib_host_put\n");
+  DPRINTF("Function: ro_host_put\n");
 
   host_interface->put<T>(dest, source, nelems, pe, context_window_info);
 }
@@ -52,7 +52,7 @@ __host__ void ROHostContext::put(T *dest, const T *source, size_t nelems,
 template <typename T>
 __host__ void ROHostContext::get(T *dest, const T *source, size_t nelems,
                                  int pe) {
-  DPRINTF("Function: gpu_ib_host_get\n");
+  DPRINTF("Function: ro_host_get\n");
 
   host_interface->get<T>(dest, source, nelems, pe, context_window_info);
 }
@@ -60,7 +60,7 @@ __host__ void ROHostContext::get(T *dest, const T *source, size_t nelems,
 template <typename T>
 __host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems,
                                      int pe) {
-  DPRINTF("Function: gpu_ib_host_put_nbi\n");
+  DPRINTF("Function: ro_host_put_nbi\n");
 
   host_interface->put_nbi<T>(dest, source, nelems, pe, context_window_info);
 }
@@ -68,7 +68,7 @@ __host__ void ROHostContext::put_nbi(T *dest, const T *source, size_t nelems,
 template <typename T>
 __host__ void ROHostContext::get_nbi(T *dest, const T *source, size_t nelems,
                                      int pe) {
-  DPRINTF("Function: gpu_ib_host_get_nbi\n");
+  DPRINTF("Function: ro_host_get_nbi\n");
 
   host_interface->get_nbi<T>(dest, source, nelems, pe, context_window_info);
 }
@@ -107,7 +107,7 @@ __host__ void ROHostContext::broadcast(T *dest, const T *source, int nelems,
                                        int pe_root, int pe_start,
                                        int log_pe_stride, int pe_size,
                                        long *p_sync) {
-  DPRINTF("Function: gpu_ib_host_broadcast\n");
+  DPRINTF("Function: ro_host_broadcast\n");
 
   host_interface->broadcast<T>(dest, source, nelems, pe_root, pe_start,
                                log_pe_stride, pe_size, p_sync);
diff --git a/src/rocshmem.cpp b/src/rocshmem.cpp
index e9106c8fb4..931e4c83a0 100644
--- a/src/rocshmem.cpp
+++ b/src/rocshmem.cpp
@@ -36,10 +36,7 @@
 
 #include "backend_bc.hpp"
 #include "context_incl.hpp"
-#ifdef USE_GPU_IB
-#include "gpu_ib/backend_ib.hpp"
-#include "gpu_ib/context_ib_tmpl_host.hpp"
-#elif defined(USE_RO)
+#ifdef USE_RO
 #include "reverse_offload/backend_ro.hpp"
 #include "reverse_offload/context_ro_tmpl_host.hpp"
 #else
@@ -86,10 +83,7 @@ rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
 
   rocshmem_env_config_init();
 
-#ifdef USE_GPU_IB
-  CHECK_HIP(hipHostMalloc(&backend, sizeof(GPUIBBackend)));
-  backend = new (backend) GPUIBBackend(comm);
-#elif defined(USE_RO)
+#ifdef USE_RO
   CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
   backend = new (backend) ROBackend(comm);
 #else
diff --git a/src/rocshmem_gpu.cpp b/src/rocshmem_gpu.cpp
index 5517da4b47..fc78f0931e 100644
--- a/src/rocshmem_gpu.cpp
+++ b/src/rocshmem_gpu.cpp
@@ -49,9 +49,7 @@
 #include "templates.hpp"
 #include "util.hpp"
 
-#ifdef USE_GPU_IB
-#include "gpu_ib/context_ib_tmpl_device.hpp"
-#elif defined(USE_RO)
+#ifdef USE_RO
 #include "reverse_offload/context_ro_tmpl_device.hpp"
 #else
 #include "ipc/context_ipc_tmpl_device.hpp"
diff --git a/src/team.cpp b/src/team.cpp
index b162df67e4..2980337794 100644
--- a/src/team.cpp
+++ b/src/team.cpp
@@ -36,10 +36,6 @@ __host__ __device__ Team* get_internal_team(rocshmem_team_t team) {
   return reinterpret_cast<Team*>(team);
 }
 
-GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team) {
-  return reinterpret_cast<GPUIBTeam*>(team);
-}
-
 ROTeam* get_internal_ro_team(rocshmem_team_t team) {
   return reinterpret_cast<ROTeam*>(team);
 }
diff --git a/src/team.hpp b/src/team.hpp
index 249c1b123b..77aa63750d 100644
--- a/src/team.hpp
+++ b/src/team.hpp
@@ -33,7 +33,6 @@ namespace rocshmem {
 class Backend;
 class Team;
 class ROTeam;
-class GPUIBTeam;
 class IPCTeam;
 
 class TeamInfo {
@@ -154,13 +153,11 @@ class Team {
    *
    * @note This is required to do some reinterpret_casts.
    */
-  BackendType type{BackendType::GPU_IB_BACKEND};
+  BackendType type{BackendType::RO_BACKEND};
 };
 
 __host__ __device__ Team* get_internal_team(rocshmem_team_t team);
 
-GPUIBTeam* get_internal_gpu_ib_team(rocshmem_team_t team);
-
 ROTeam* get_internal_ro_team(rocshmem_team_t team);
 
 IPCTeam* get_internal_ipc_team(rocshmem_team_t team);