Runtime selection of IONIC (#272)

* Split ionic code to a subdirectory; dyld libionicl; move the fntable to provider_gda_xxx.hpp pass the pattr to ionic_setup_pd, include endian.hpp Enable building IONIC conduit for runtime selection * Uniform style for the fntable between ionic and the rest * Move mlx5 gda conduit to a subdir; resolve conflict with backend_can_run function * Don't forget to init qp for ionic, move mlx5 specialized init qp code to the mlx5 subdir * Don't add cmakecaches... Typo: GDA_BXNT * Add gda-ionic to all_backends build scripts * Apply suggestion from reviews Co-authored-by: Omri Mor <omri50@gmail.com> Co-authored-by: Edgar Gabriel <edgar.gabriel@amd.com> * Remove duplicate definitiion of DLSYM macros --------- Co-authored-by: Omri Mor <omri50@gmail.com> Co-authored-by: Edgar Gabriel <edgar.gabriel@amd.com> [ROCm/rocshmem commit: 3cfe76522e]
2025-10-16 15:53:01 -04:00
@@ -25,7 +25,6 @@
 find_package(PkgConfig QUIET)
 if (PkgConfig_FOUND)
 if (IBVerbs_ROOT )
-  # We don't use IBVerbs_DIR as this is supposed to be used when finding hwloc-config.cmake only
  set(ENV{PKG_CONFIG_PATH} "${IBVerbs_ROOT}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
 endif()
 pkg_check_modules(PC_IBVerbs QUIET libibverbs)
@@ -42,32 +41,11 @@ find_library(IBVerbs_LIBRARY
  PATH_SUFFIXES lib lib64
 )

-if (GDA_IONIC)
-list(APPEND provider_vars IBVerbs_IONIC_LIBRARY IBVerbs_IONIC_INCLUDE_DIR)
-find_path(IBVerbs_IONIC_INCLUDE_DIR infiniband/ionic_dv.h
-  HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS}
-  PATH_SUFFIXES include
-)
-
-find_library(IBVerbs_IONIC_LIBRARY
-  NAMES ionic libionic
-  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
-  PATH_SUFFIXES lib lib64
-)
-
-add_library(IBVerbs::verbs_ionic UNKNOWN IMPORTED)
-set_target_properties(IBVerbs::verbs_ionic PROPERTIES
-  IMPORTED_LOCATION "${IBVerbs_IONIC_LIBRARY}"
-  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_IONIC_INCLUDE_DIR}"
-)
-endif()
-
 find_package_handle_standard_args(IBVerbs DEFAULT_MSG
  IBVerbs_LIBRARY
  IBVerbs_INCLUDE_DIR
-  ${provider_vars}
 )
-mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR ${provider_vars})
+mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR)

 if (IBVerbs_FOUND)
 add_library(IBVerbs::verbs UNKNOWN IMPORTED)
@@ -76,9 +54,4 @@ set_target_properties(IBVerbs::verbs PROPERTIES
  INTERFACE_COMPILE_OPTIONS "${PC_IBVerbs_CFLAGS_OTHER}"
  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}"
 )
-
-target_link_libraries(IBVerbs::verbs INTERFACE
-  $<TARGET_NAME_IF_EXISTS:IBVerbs::verbs_ionic>
-)
-
 endif()
@@ -40,7 +40,7 @@ cmake \
    -DUSE_GDA=ON \
    -DGDA_MLX5=ON \
    -DGDA_BNXT=ON \
-    -DGDA_IONIC=OFF \
+    -DGDA_IONIC=ON \
    -DUSE_RO=ON \
    -DUSE_IPC=ON \
    -DUSE_THREADS=OFF \
@@ -35,7 +35,6 @@ target_sources(
    gda_team.cpp
    queue_pair.cpp
    endian.cpp
-    segment_builder.cpp
    topology.cpp
 )

@@ -48,4 +47,6 @@ target_link_libraries(
    numa
 )

+add_subdirectory(mlx5)
 add_subdirectory(bnxt)
+add_subdirectory(ionic)
@@ -122,11 +122,7 @@ GDABackend::~GDABackend() {
  cleanup_heap_memory_rkey();
  cleanup_ibv();

-  if (bnxtdv_handle_ != nullptr)
-    dlclose(bnxtdv_handle_);
-
-  if (mlx5dv_handle_ != nullptr)
-    dlclose(mlx5dv_handle_);
+  close_dv_libs();
 }

 void GDABackend::read_env() {
@@ -532,52 +528,45 @@ void GDABackend::rte_barrier() {
  }
 }

-int GDABackend::mlx5_dv_dl_init () {
-  mlx5dv_handle_ = dlopen("libmlx5.so", RTLD_NOW);
-  if (!mlx5dv_handle_) {
-    DPRINTF("Could not open libmlx5.so. Returning\n");
-    return ROCSHMEM_ERROR;
-  }
-
-  DLSYM_HELPER(mlx5dv_ftable_, mlx5dv_, mlx5dv_handle_, init_obj);
-  return ROCSHMEM_SUCCESS;
-}
-
 /* Currently we only check whether we can dlopen a Direct Verbs library.
-** We might need to extend this logic to check whether we have interfaces that
-** can use those DV libraries
-*/
+ * We might need to extend this logic to check whether we have interfaces that
+ * can use those DV libraries
+ */
 int GDABackend::backend_can_run() {
  void *handle{nullptr};

  /* Try opening bnxt DV libraries */
-  handle = dlopen("libbnxt_re.so", RTLD_NOW);
+#if defined(GDA_BNXT)
+  handle = bnxt_dv_dlopen();
  if (handle) {
    dlclose(handle);
    return ROCSHMEM_SUCCESS;
-  } else {
-    /* Try hard-coded PATH */
-    handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW);
-    if (handle) {
-      dlclose(handle);
-      return ROCSHMEM_SUCCESS;
-    }
  }
+#endif //defined(GDA_BNXT)
+
+  /* Try opening ionic DV libraries */
+#if defined(GDA_IONIC)
+  handle = ionic_dv_dlopen();
+  if (handle) {
+    dlclose(handle);
+    return ROCSHMEM_SUCCESS;
+  }
+#endif //defined(GDA_IONIC)

  /* Try opening mlx5 DV libraries */
-  handle = dlopen("libmlx5.so", RTLD_NOW);
+#if defined(GDA_MLX5)
+  handle = mlx5_dv_dlopen();
  if (handle) {
    dlclose(handle);
    return ROCSHMEM_SUCCESS;
  }
-
-  /* ToDo: opening ionic DV libraries */
+#endif //defined(GDA_MLX5)

  return ROCSHMEM_ERROR;
 }

 void GDABackend::setup_ibv() {
-  autodetect_dv_libs();
+  open_dv_libs();

  open_ib_device();

@@ -599,22 +588,22 @@ void GDABackend::cleanup_ibv() {
    CHECK_HIP(hipHostUnregister(db_region_attr.dbr));

    for (int i = 0; i < qps.size(); i++) {
-      err = bnxtdv_ftable_.destroy_qp(qps[i]);
+      err = bnxt_re_dv.destroy_qp(qps[i]);
      CHECK_ZERO(err, "bnxt_re_dv_destroy_qp");

-      err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.rq_umem_handle);
+      err = bnxt_re_dv.umem_dereg(bnxt_qps[i].attr.rq_umem_handle);
      CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (RQ)");

-      err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.sq_umem_handle);
+      err = bnxt_re_dv.umem_dereg(bnxt_qps[i].attr.sq_umem_handle);
      CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (SQ)");

      CHECK_HIP(hipFree(bnxt_qps[i].sq_buf));
      CHECK_HIP(hipFree(bnxt_qps[i].rq_buf));

-      err = bnxtdv_ftable_.destroy_cq(cqs[i]);
+      err = bnxt_re_dv.destroy_cq(cqs[i]);
      CHECK_ZERO(err, "bnxt_re_dv_destroy_cq");

-      err = bnxtdv_ftable_.umem_dereg(bnxt_cqs[i].umem_handle);
+      err = bnxt_re_dv.umem_dereg(bnxt_cqs[i].umem_handle);
      CHECK_ZERO(err, "bnxt_re_dv_umem_dereg");

      CHECK_HIP(hipFree(bnxt_cqs[i].buf));
@@ -647,13 +636,14 @@ void GDABackend::cleanup_ibv() {
  CHECK_ZERO(err, "ibv_close_device");
 }

-void GDABackend::autodetect_dv_libs() {
+void GDABackend::open_dv_libs() {
  int ret;

-#ifdef GDA_IONIC
-  gda_vendor = GDAVendor::IONIC;
-#endif
+  //TODO: environment variable selection/deselection
+  //this hardcoded init order will always prefer BNXT>IONIC>MLX5
+  //if all three drivers are installed

+#if defined(GDA_BNXT)
  if (gda_vendor == GDAVendor::NONE) {
    ret = bnxt_dv_dl_init();

@@ -663,7 +653,21 @@ void GDABackend::autodetect_dv_libs() {
      DPRINTF("Initializing rocSHMEM BNXT GDA support failed\n");
    }
  }
+#endif // defined(GDA_BNXT)

+#if defined(GDA_IONIC)
+  if (gda_vendor == GDAVendor::NONE) {
+    ret = ionic_dv_dl_init();
+
+    if (ret == ROCSHMEM_SUCCESS) {
+      gda_vendor = GDAVendor::IONIC;
+    } else {
+      DPRINTF("Initializing rocSHMEM IONIC GDA support failed\n");
+    }
+  }
+#endif // defined(GDA_IONIC)
+
+#if defined(GDA_MLX5)
  if (gda_vendor == GDAVendor::NONE) {
    ret = mlx5_dv_dl_init();

@@ -673,13 +677,26 @@ void GDABackend::autodetect_dv_libs() {
      DPRINTF("Initializing rocSHMEM MLX5 GDA support failed\n");
    }
  }
+#endif // defined(GDA_MLX5)

  if (gda_vendor == GDAVendor::NONE) {
-    printf("Initializing rocSHMEM with IONIC, BNXT, or MLX5 GDA support failed\n");
+    printf("Initializing rocSHMEM with IONIC, BNXT, or MLX5 GDA support failed: no DV library found\n");
    abort();
  }
 }

+void GDABackend::close_dv_libs() {
+  if (ionicdv_handle_ != nullptr)
+    dlclose(ionicdv_handle_);
+
+  if (bnxtdv_handle_ != nullptr)
+    dlclose(bnxtdv_handle_);
+
+  if (mlx5dv_handle_ != nullptr)
+    dlclose(mlx5dv_handle_);
+
+  gda_vendor = GDAVendor::NONE;
+}

 void GDABackend::exchange_qp_dest_info() {
  for (int i = 0; i < qps.size(); i++) {
@@ -752,11 +769,7 @@ void GDABackend::setup_gpu_qps() {
    new (&host_qps[i]) QueuePair(pd_orig, gda_vendor);
    CHECK_HIP(hipMemcpy(&gpu_qps[i], &host_qps[i], sizeof(QueuePair), hipMemcpyDefault));

-    if (gda_vendor == GDAVendor::BNXT) {
-      bnxt_initialize_gpu_qp(&gpu_qps[i], i);
-    } else {
-      initialize_gpu_qp(&gpu_qps[i], i);
-    }
+    initialize_gpu_qp(&gpu_qps[i], i);
  }
 }

@@ -844,7 +857,7 @@ void GDABackend::modify_qps_reset_to_init() {

  for (int i =0; i < qps.size() ; i++) {
    if (gda_vendor == GDAVendor::BNXT) {
-      err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
+      err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0);
    } else {
      err = ibv_modify_qp(qps[i], &attr, attr_mask);
    }
@@ -896,7 +909,7 @@ void GDABackend::modify_qps_init_to_rtr() {
    }

    if (gda_vendor == GDAVendor::BNXT) {
-      err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
+      err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0);
    } else {
      err = ibv_modify_qp(qps[i], &attr, attr_mask);
    }
@@ -932,7 +945,7 @@ void GDABackend::modify_qps_rtr_to_rts() {
    attr.sq_psn = dest_info[i].psn;

    if (gda_vendor == GDAVendor::BNXT) {
-      err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
+      err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0);
    } else {
      err = ibv_modify_qp(qps[i], &attr, attr_mask);
    }
@@ -1057,19 +1070,9 @@ void GDABackend::create_parent_domain() {
  CHECK_NNULL(pd_parent, "ibv_alloc_parent_domain");
  dump_ibv_pd(pd_parent);

-#ifdef GDA_IONIC
-  ionic_dv_pd_set_sqcmb(pd_parent, false, false, false);
-  ionic_dv_pd_set_rqcmb(pd_parent, false, false, false);
-
-  for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) {
-    pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(context, &pattr);
-    CHECK_NNULL(pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)");
-
-    ionic_dv_pd_set_sqcmb(pd_uxdma[uxdma_i], false, false, false);
-    ionic_dv_pd_set_rqcmb(pd_uxdma[uxdma_i], false, false, false);
-    ionic_dv_pd_set_udma_mask(pd_uxdma[uxdma_i], 1u << uxdma_i);
+  if (gda_vendor == GDAVendor::IONIC) {
+    ionic_setup_parent_domain(&pattr);
  }
-#endif /* GDA_IONIC */
 }

 void GDABackend::create_cqs(int cqe) {
@@ -1099,123 +1102,18 @@ void GDABackend::create_cqs(int cqe) {
 }

 void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
-  int hip_dev_id{-1};
-  CHECK_HIP(hipGetDevice(&hip_dev_id));
-
-#ifdef GDA_IONIC
-  ionic_dv_ctx dvctx;
-  ionic_dv_get_ctx(&dvctx, context);
-
-  void* gpu_db_page = nullptr;
-  rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id);
-
-  uint64_t *db_page_u64 = reinterpret_cast<uint64_t*>(dvctx.db_page);
-  uint64_t *gpu_db_page_u64 = reinterpret_cast<uint64_t*>(gpu_db_page);
-
-  uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64];
-
-  gpu_db_page = gpu_db_page;
-  gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype];
-  gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype];
-
-  uint8_t udma_idx = ionic_dv_qp_get_udma_idx(qps[conn_num]);
-
-  ionic_dv_cq dvcq;
-  ionic_dv_get_cq(&dvcq, cqs[conn_num], udma_idx);
-
-  gpu_qp->cq_dbreg = gpu_db_cq;
-  gpu_qp->cq_dbval = dvcq.q.db_val;
-  gpu_qp->cq_mask = dvcq.q.mask;
-
-  gpu_qp->ionic_cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
-
-  ionic_dv_qp dvqp;
-  ionic_dv_get_qp(&dvqp, qps[conn_num]);
-
-  gpu_qp->sq_dbreg = gpu_db_sq;
-  gpu_qp->sq_dbval = dvqp.sq.db_val;
-  gpu_qp->sq_mask = dvqp.sq.mask;
-  gpu_qp->ionic_sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
-
-  strncpy(gpu_qp->dev_name,
-          qps[conn_num]->context->device->name,
-          sizeof(gpu_qp->dev_name));
-  gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0;
-
-  gpu_qp->qp_num = qps[conn_num]->qp_num;
-  gpu_qp->lkey = heap_mr->lkey;
-  gpu_qp->rkey = heap_rkey[conn_num % num_pes];
-  gpu_qp->inline_threshold = 32;
-#endif /* GDA_IONIC */
-  if (gda_vendor == GDAVendor::MLX5) {
-    mlx5dv_cq cq_out;
-    mlx5dv_obj mlx_obj;
-    mlx_obj.cq.in = cqs[conn_num];
-    mlx_obj.cq.out = &cq_out;
-    mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
-    dump_mlx5dv_cq(&cq_out, conn_num);
-
-    /*
-     * struct mlx5dv_cq {
-     *   void                    *buf;
-     *   __be32                  *dbrec;
-     *   uint32_t                cqe_cnt;
-     *   uint32_t                cqe_size;
-     *   void                    *cq_uar;
-     *   uint32_t                cqn;
-     *   uint64_t                comp_mask;
-     * };
-    */
-
-    gpu_qp->cq_buf = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
-    gpu_qp->cq_cnt = cq_out.cqe_cnt;
-    gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt);
-    gpu_qp->cq_dbrec = cq_out.dbrec;
-
-    mlx5dv_qp qp_out;
-    mlx_obj.qp.in = qps[conn_num];
-    mlx_obj.qp.out = &qp_out;
-    mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_QP);
-    dump_mlx5dv_qp(&qp_out, conn_num);
-
-    /*
-     * struct mlx5dv_qp {
-     *   __be32 *dbrec;
-     *   struct {
-     *     void *buf;
-     *     uint32_t wqe_cnt;
-     *     uint32_t stride;
-     *   } sq;
-     *   struct {
-     *     void *buf;
-     *     uint32_t wqe_cnt;
-     *     uint32_t stride;
-     *   } rq;
-     *   struct {
-     *     void *reg;
-     *     uint32_t size;
-     *   } bf;
-     *   uint64_t comp_mask;
-     *   off_t uar_mmap_offset;
-     *   uint32_t tirn;
-     *   uint32_t tisn;
-     *   uint32_t rqn;
-     *   uint32_t sqn;
-     *   uint64_t tir_icm_addr;
-     * };
-     */
-
-    gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR
-    gpu_qp->sq_buf = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
-    gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt;
-    gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
-    gpu_qp->lkey = htobe32(heap_mr->lkey);
-    gpu_qp->qp_num = qps[conn_num]->qp_num;
-    gpu_qp->inline_threshold = inline_threshold;
-    // The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
-    void* gpu_ptr{nullptr};
-    rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
-    gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
+  switch (gda_vendor) {
+  case GDAVendor::IONIC:
+    ionic_initialize_gpu_qp(gpu_qp, conn_num);
+    break;
+  case GDAVendor::BNXT:
+    bnxt_initialize_gpu_qp(gpu_qp, conn_num);
+    break;
+  case GDAVendor::MLX5:
+    mlx5_initialize_gpu_qp(gpu_qp, conn_num);
+    break;
+  default:
+    assert(false /* GDAVendor initialize_gpu_qp */);
  }
 }

@@ -40,32 +40,6 @@
 #include "gda/bnxt/provider_gda_bnxt.hpp"
 #include "gda/mlx5/provider_gda_mlx5.hpp"

-struct bnxtdv_funcs_t {
-  int (*init_obj)(struct bnxt_re_dv_obj *obj, uint64_t obj_type);
-  struct ibv_qp* (*create_qp)(struct ibv_pd *pd,
-                              struct bnxt_re_dv_qp_init_attr *qp_attr);
-  int (*destroy_qp)(struct ibv_qp *ibvqp);
-  int (*modify_qp)(struct ibv_qp *ibv_qp, struct ibv_qp_attr *attr,
-                   int attr_mask, uint32_t type, uint32_t value);
-  int (*qp_mem_alloc)(struct ibv_pd *ibvpd,
-                      struct ibv_qp_init_attr *attr,
-                      struct bnxt_re_dv_qp_mem_info *dv_qp_mem);
-  struct ibv_cq* (*create_cq)(struct ibv_context *ibvctx,
-                              struct bnxt_re_dv_cq_init_attr *cq_attr);
-  int (*destroy_cq)(struct ibv_cq *ibv_cq);
-  void* (*cq_mem_alloc)(struct ibv_context *ibvctx, int num_cqe,
-                        struct bnxt_re_dv_cq_attr *cq_attr);
-  void* (*umem_reg)(struct ibv_context *ibvctx,
-                    struct bnxt_re_dv_umem_reg_attr *in);
-  int (*umem_dereg)(void *umem_handle);
-  int (*get_default_db_region)(struct ibv_context *ibvctx,
-                               struct bnxt_re_dv_db_region_attr *out);
-};
-
-struct mlx5dv_funcs_t {
-  int (*init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type);
-};
-
 namespace rocshmem {

 class GDAContext;
@@ -325,6 +299,8 @@ class GDABackend : public Backend {

  void initialize_gpu_qp(QueuePair* qp, int conn_num);
  void bnxt_initialize_gpu_qp(QueuePair* qp, int conn_num);
+  void ionic_initialize_gpu_qp(QueuePair* qp, int conn_num);
+  void mlx5_initialize_gpu_qp(QueuePair* qp, int conn_num);

  /**
   * @brief Setup InfiniBand Resources
@@ -337,9 +313,14 @@ class GDABackend : public Backend {
  void cleanup_ibv();

  /**
-   * @brief Detect the available direct verbs libraries
+   * @brief Detect and load the available direct verbs libraries
   */
-  void autodetect_dv_libs();
+  void open_dv_libs();
+
+  /**
+   * @ brief Close opened direct verbs libraries
+   */
+  void close_dv_libs();

  /**
   * @brief Open InfiniBand Device and create common structures
@@ -404,6 +385,7 @@ class GDABackend : public Backend {
  static void pd_release(ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type);

  void create_parent_domain();
+  void ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* pattr);

  void setup_gpu_qps();
  void cleanup_gpu_qps();
@@ -509,7 +491,7 @@ class GDABackend : public Backend {
   * @brief structures holding the function pointers to the direct verbs functionality
   * of each network driver.
   */
-  bnxtdv_funcs_t bnxtdv_ftable_;
+  bnxtdv_funcs_t bnxt_re_dv;

  /**
   * @brief handle used for the dlopen of the BCOM library
@@ -521,11 +503,16 @@ class GDABackend : public Backend {
   */
  int bnxt_dv_dl_init();

+  /**
+   * @brief open bnxt dv lib
+   */
+  static void* bnxt_dv_dlopen();
+
  /**
   * @brief structures holding the function pointers to the direct verbs functionality
   * of each network driver.
   */
-  mlx5dv_funcs_t mlx5dv_ftable_;
+  mlx5dv_funcs_t mlx5dv;

  /**
   * @brief handle used for the dlopen of the MLX5 library
@@ -536,6 +523,32 @@ class GDABackend : public Backend {
   * @brief initialize function table for MLNX direct verbs support
   */
  int mlx5_dv_dl_init();
+
+  /**
+   * @brief open mlx5 dv lib
+   */
+  static void* mlx5_dv_dlopen();
+
+  /**
+   * @brief structures holding the function pointers to the direct verbs functionality
+   * of each network driver.
+   */
+  ionicdv_funcs_t ionic_dv;
+
+  /**
+   * @brief handle used for the dlopen of the IONIC library
+   */
+  void *ionicdv_handle_{nullptr};
+
+  /**
+   * @brief initialize function table for IONIC direct verbs support
+   */
+  int ionic_dv_dl_init();
+
+  /**
+   * @brief open ionic dv lib
+   */
+  static void* ionic_dv_dlopen();
 };

 }  // namespace rocshmem
@@ -42,7 +42,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
  dv_obj.cq.in  = cqs[conn_num];
  dv_obj.cq.out = &dv_cq;

-  err = bnxtdv_ftable_.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
+  err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
  CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)");

  memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq));
@@ -56,7 +56,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
  dv_obj.qp.in  = ib_qp;
  dv_obj.qp.out = &dv_qp;

-  err = bnxtdv_ftable_.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP);
+  err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP);
  CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");

  memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq));
@@ -76,7 +76,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
  gpu_qp->sq.mtu         = ibv_mtu_to_int(portinfo.active_mtu);

  /* Export DB */
-  err = bnxtdv_ftable_.get_default_db_region(context, &db_region_attr);
+  err = bnxt_re_dv.get_default_db_region(context, &db_region_attr);
  CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");

  CHECK_HIP(hipHostRegister(db_region_attr.dbr, getpagesize(), hipHostRegisterDefault));
@@ -98,7 +98,7 @@ void GDABackend::bnxt_create_cqs(int cqe) {
  for (int i = 0; i < qps.size(); i++) {
    /* Allocate CQ mem */
    memset(&cq_attr, 0, sizeof(struct bnxt_re_dv_cq_attr));
-    bnxt_cqs[i].handle = bnxtdv_ftable_.cq_mem_alloc(context, cqe, &cq_attr);
+    bnxt_cqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr);
    CHECK_NNULL(bnxt_cqs[i].handle, "bnxt_re_dv_cq_mem_alloc");

    /* Allocate CQ UMEM */
@@ -112,7 +112,7 @@ void GDABackend::bnxt_create_cqs(int cqe) {
    umem_attr.size         = bnxt_cqs[i].length;
    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;

-    bnxt_cqs[i].umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr);
+    bnxt_cqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
    CHECK_NNULL(bnxt_cqs[i].umem_handle, "bnxt_re_dv_umem_reg(cq_buf)");

    /* Create CQ */
@@ -121,7 +121,7 @@ void GDABackend::bnxt_create_cqs(int cqe) {
    cq_init_attr.umem_handle = bnxt_cqs[i].umem_handle;
    cq_init_attr.ncqe        = cq_attr.ncqe;

-    cqs[i] = bnxtdv_ftable_.create_cq(context, &cq_init_attr);
+    cqs[i] = bnxt_re_dv.create_cq(context, &cq_init_attr);
    CHECK_NNULL(cqs[i], "bnxt_re_dv_create_cq");
  }
 }
@@ -152,7 +152,7 @@ void GDABackend::bnxt_create_qps(int sq_length) {

    /* Alloc qp_mem_info */
    memset(&bnxt_qps[i].mem_info, 0, sizeof(struct bnxt_re_dv_qp_mem_info));
-    err = bnxtdv_ftable_.qp_mem_alloc(pd_orig, &ib_qp_attr, &bnxt_qps[i].mem_info);
+    err = bnxt_re_dv.qp_mem_alloc(pd_orig, &ib_qp_attr, &bnxt_qps[i].mem_info);
    CHECK_ZERO(err, "bnxt_re_dv_qp_mem_alloc");

    /* Alloc SQ */
@@ -177,7 +177,7 @@ void GDABackend::bnxt_create_qps(int sq_length) {
    umem_attr.size         = bnxt_qps[i].mem_info.sq_len;
    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;

-    sq_umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr);
+    sq_umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
    CHECK_NNULL(sq_umem_handle, "bnxt_re_dv_umem_reg(sq)");

    memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
@@ -185,7 +185,7 @@ void GDABackend::bnxt_create_qps(int sq_length) {
    umem_attr.size         = bnxt_qps[i].mem_info.rq_len;
    umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;

-    rq_umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr);
+    rq_umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
    CHECK_NNULL(rq_umem_handle, "bnxt_re_dv_umem_reg(rq)");

    /* IB DV QP Init Attr */
@@ -214,33 +214,40 @@ void GDABackend::bnxt_create_qps(int sq_length) {
    bnxt_qps[i].attr.comp_mask = bnxt_qps[i].mem_info.comp_mask;

    /* Alloc QP */
-    qps[i] = bnxtdv_ftable_.create_qp(pd_orig, &bnxt_qps[i].attr);
+    qps[i] = bnxt_re_dv.create_qp(pd_orig, &bnxt_qps[i].attr);
    CHECK_NNULL(qps[i], "bnxt_re_dv_create_qp");
  }
 }

-int GDABackend::bnxt_dv_dl_init() {
-  bnxtdv_handle_ = dlopen("libbnxt_re.so", RTLD_NOW);
-  if (!bnxtdv_handle_) {
+void* GDABackend::bnxt_dv_dlopen() {
+  void* dv_handle{nullptr};
+  dv_handle = dlopen("libbnxt_re.so", RTLD_NOW);
+  if (!dv_handle) {
    // Try hard-coded PATH
-    bnxtdv_handle_ = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW);
-    if (!bnxtdv_handle_) {
+    dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW);
+    if (!dv_handle) {
      DPRINTF("Could not open libbnxt_re.so. Returning\n");
-      return ROCSHMEM_ERROR;
    }
  }
+  return dv_handle;
+}

-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, init_obj);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, create_qp);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, destroy_qp);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, modify_qp);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, qp_mem_alloc);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, create_cq);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, destroy_cq);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, cq_mem_alloc);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, umem_reg);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, umem_dereg);
-  DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, get_default_db_region);
+int GDABackend::bnxt_dv_dl_init() {
+  bnxtdv_handle_ = bnxt_dv_dlopen();
+  if (!bnxtdv_handle_)
+    return ROCSHMEM_ERROR;
+
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, init_obj);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, create_qp);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, destroy_qp);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, modify_qp);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, qp_mem_alloc);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, create_cq);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, destroy_cq);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, cq_mem_alloc);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_reg);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_dereg);
+  DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, get_default_db_region);

  return ROCSHMEM_SUCCESS;
 }
@@ -83,4 +83,26 @@ struct bnxt_host_qp {

 /*****************************************************************************/

+struct bnxtdv_funcs_t {
+  int (*init_obj)(struct bnxt_re_dv_obj *obj, uint64_t obj_type);
+  struct ibv_qp* (*create_qp)(struct ibv_pd *pd,
+                              struct bnxt_re_dv_qp_init_attr *qp_attr);
+  int (*destroy_qp)(struct ibv_qp *ibvqp);
+  int (*modify_qp)(struct ibv_qp *ibv_qp, struct ibv_qp_attr *attr,
+                   int attr_mask, uint32_t type, uint32_t value);
+  int (*qp_mem_alloc)(struct ibv_pd *ibvpd,
+                      struct ibv_qp_init_attr *attr,
+                      struct bnxt_re_dv_qp_mem_info *dv_qp_mem);
+  struct ibv_cq* (*create_cq)(struct ibv_context *ibvctx,
+                              struct bnxt_re_dv_cq_init_attr *cq_attr);
+  int (*destroy_cq)(struct ibv_cq *ibv_cq);
+  void* (*cq_mem_alloc)(struct ibv_context *ibvctx, int num_cqe,
+                        struct bnxt_re_dv_cq_attr *cq_attr);
+  void* (*umem_reg)(struct ibv_context *ibvctx,
+                    struct bnxt_re_dv_umem_reg_attr *in);
+  int (*umem_dereg)(void *umem_handle);
+  int (*get_default_db_region)(struct ibv_context *ibvctx,
+                               struct bnxt_re_dv_db_region_attr *out);
+};
+
 #endif  //LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
@@ -0,0 +1,36 @@
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+target_sources(
+  ${PROJECT_NAME}
+  PRIVATE
+    backend_gda_ionic.cpp
+)
+
+if(GDA_IONIC)
+target_sources(
+  ${PROJECT_NAME}
+  PRIVATE
+    queue_pair_ionic.cpp
+)
+endif()
@@ -0,0 +1,122 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "gda/backend_gda.hpp"
+#include "util.hpp"
+
+namespace rocshmem {
+
+void GDABackend::ionic_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
+  ionic_dv_ctx dvctx;
+  ionic_dv.get_ctx(&dvctx, context);
+
+  int hip_dev_id{-1};
+  CHECK_HIP(hipGetDevice(&hip_dev_id));
+
+  void* gpu_db_page = nullptr;
+  rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id);
+
+  uint64_t *db_page_u64 = reinterpret_cast<uint64_t*>(dvctx.db_page);
+  uint64_t *gpu_db_page_u64 = reinterpret_cast<uint64_t*>(gpu_db_page);
+
+  uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64];
+
+  gpu_db_page = gpu_db_page;
+  gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype];
+  gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype];
+
+  uint8_t udma_idx = ionic_dv.qp_get_udma_idx(qps[conn_num]);
+
+  ionic_dv_cq dvcq;
+  ionic_dv.get_cq(&dvcq, cqs[conn_num], udma_idx);
+
+  gpu_qp->cq_dbreg = gpu_db_cq;
+  gpu_qp->cq_dbval = dvcq.q.db_val;
+  gpu_qp->cq_mask = dvcq.q.mask;
+
+  gpu_qp->ionic_cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
+
+  ionic_dv_qp dvqp;
+  ionic_dv.get_qp(&dvqp, qps[conn_num]);
+
+  gpu_qp->sq_dbreg = gpu_db_sq;
+  gpu_qp->sq_dbval = dvqp.sq.db_val;
+  gpu_qp->sq_mask = dvqp.sq.mask;
+  gpu_qp->ionic_sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
+
+  strncpy(gpu_qp->dev_name,
+          qps[conn_num]->context->device->name,
+          sizeof(gpu_qp->dev_name));
+  gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0;
+
+  gpu_qp->qp_num = qps[conn_num]->qp_num;
+  gpu_qp->lkey = heap_mr->lkey;
+  gpu_qp->rkey = heap_rkey[conn_num % num_pes];
+  gpu_qp->inline_threshold = 32;
+}
+
+void GDABackend::ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* pattr) {
+  ionic_dv.pd_set_sqcmb(pd_parent, false, false, false);
+  ionic_dv.pd_set_rqcmb(pd_parent, false, false, false);
+
+  for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) {
+    pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(context, pattr);
+    CHECK_NNULL(pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)");
+
+    ionic_dv.pd_set_sqcmb(pd_uxdma[uxdma_i], false, false, false);
+    ionic_dv.pd_set_rqcmb(pd_uxdma[uxdma_i], false, false, false);
+    ionic_dv.pd_set_udma_mask(pd_uxdma[uxdma_i], 1u << uxdma_i);
+  }
+}
+
+void* GDABackend::ionic_dv_dlopen() {
+  void* dv_handle{nullptr};
+  dv_handle = dlopen("libionic.so", RTLD_NOW);
+  if (!dv_handle) {
+    // Try hard-coded PATH
+    dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_NOW);
+    if (!dv_handle) {
+      DPRINTF("Could not open libionic.so. Returning\n");
+    }
+  }
+  return dv_handle;
+}
+
+int GDABackend::ionic_dv_dl_init() {
+  ionicdv_handle_ = ionic_dv_dlopen();
+  if (!ionicdv_handle_)
+    return ROCSHMEM_ERROR;
+
+  DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_ctx);
+  DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, qp_get_udma_idx);
+  DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_cq);
+  DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_qp);
+  DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_sqcmb);
+  DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_rqcmb);
+  DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_udma_mask);
+
+  return ROCSHMEM_SUCCESS;
+}
+
+}  // namespace rocshmem
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.  All rights reserved.
+ */
+
+#ifndef IONIC_DV_H
+#define IONIC_DV_H
+
+#include <stdbool.h>
+#include <infiniband/verbs.h>
+
+struct ibv_cq;
+struct ibv_qp;
+
+/** IONIC_PD_TAG - tag used for parent domain resource allocation. */
+#define IONIC_PD_TAG		((uint64_t)RDMA_DRIVER_IONIC << 32)
+#define IONIC_PD_TAG_CQ		(IONIC_PD_TAG | 1)
+#define IONIC_PD_TAG_SQ		(IONIC_PD_TAG | 2)
+#define IONIC_PD_TAG_RQ		(IONIC_PD_TAG | 3)
+#define IONIC_PD_TAG_RCQ	(IONIC_PD_TAG | 4)
+
+/* deprecated */
+#define IONIC_SQ_SIG_ALL	1
+/* deprecated */
+#define IONIC_SQ_SIG_HACK_HIGH	2
+
+/** IONIC_UDMA_MASK_LOW - flag represents the udma0 pipeline in the udma mask. */
+#define IONIC_UDMA_MASK_LOW	1
+/** IONIC_UDMA_MASK_HIGH - flag represents the udma1 pipeline in the udma mask. */
+#define IONIC_UDMA_MASK_HIGH	2
+
+#define IONIC_DV_PUEC_NPORTS_MAX 8
+
+/** struct ionic_dv_ctx - Context information for gpu-initiated rdma. */
+struct ionic_dv_ctx {
+	void			*db_page;
+	uint64_t		*db_ptr;
+	uint8_t			sq_qtype;
+	uint8_t			rq_qtype;
+	uint8_t			cq_qtype;
+};
+
+/** struct ionic_dv_ctx - Queue information for gpu-initiated rdma. */
+struct ionic_dv_queue {
+	void			*ptr;
+	size_t			size;
+	uint64_t		db_val;
+	uint16_t		mask;
+	uint8_t			depth_log2;
+	uint8_t			stride_log2;
+};
+
+/** struct ionic_dv_ctx - CQ information for gpu-initiated rdma. */
+struct ionic_dv_cq {
+	struct ionic_dv_queue	q;
+};
+
+/** struct ionic_dv_ctx - QP information for gpu-initiated rdma. */
+struct ionic_dv_qp {
+	struct ionic_dv_queue	rq;
+	struct ionic_dv_queue	sq;
+};
+
+/** struct ionic_puec_route - Info needed to setup a PUEC plane route. */
+struct ionic_dv_puec_route {
+        union ibv_gid	dgid;
+        union ibv_gid	sgid;
+        uint32_t	flow_label;
+        uint8_t		hop_limit;
+        uint8_t		traffic_class;
+	uint8_t		sl;
+	uint8_t		rsvd[5];
+	uint32_t	flags;
+};
+
+/**
+ * ionic_dv_is_ionic_ctx - Test if context belongs to ionic provider.
+ */
+bool ionic_dv_is_ionic_ctx(struct ibv_context *ibctx);
+
+/**
+ * ionic_dv_is_ionic_pd - Test if pd belongs to ionic provider.
+ */
+bool ionic_dv_is_ionic_pd(struct ibv_pd *ibpd);
+
+/**
+ * ionic_dv_is_ionic_cq - Test if cq belongs to ionic provider.
+ */
+bool ionic_dv_is_ionic_cq(struct ibv_cq *ibcq);
+
+/**
+ * ionic_dv_is_ionic_qp - Test if qp belongs to ionic provider.
+ */
+bool ionic_dv_is_ionic_qp(struct ibv_qp *ibqp);
+
+
+/**
+ * ionic_dv_ctx_get_udma_count - Get number of udma pipelines.
+ */
+uint8_t ionic_dv_ctx_get_udma_count(struct ibv_context *ibctx);
+
+/**
+ * ionic_dv_ctx_get_udma_mask - Get mask of udma pipeline ids.
+ */
+uint8_t ionic_dv_ctx_get_udma_mask(struct ibv_context *ibctx);
+
+/**
+ * ionic_dv_pd_get_udma_mask - Get mask of udma pipeline ids of pd or parent domain.
+ */
+uint8_t ionic_dv_pd_get_udma_mask(struct ibv_pd *ibpd);
+
+/**
+ * ionic_dv_pd_set_udma_mask - Restrict pipeline ids of pd or parent domain.
+ *
+ * Queues associated with this pd will be restricted to one of the pipelines enabled by
+ * the mask at the time of queue creation.
+ *
+ * Recommended usage is to create a pd, then parent domains of that pd for each different
+ * udma mask.  Set the desired udma mask on each parent domain.  Then, create queues
+ * associated with the parent domain with the desired udma mask.
+ *
+ * Alternative usage is to create a pd, and set the desired udma mask prior to creating
+ * each queue.  Changing the udma mask of the pd has no effect on previously created
+ * queues.
+ */
+int ionic_dv_pd_set_udma_mask(struct ibv_pd *ibpd, uint8_t udma_mask);
+
+/**
+ * ionic_dv_cq_get_udma_mask - Get mask of udma pipeline ids of completion queueue.
+ */
+uint8_t ionic_dv_cq_get_udma_mask(struct ibv_cq *ibcq);
+
+/**
+ * ionic_dv_qp_get_udma_idx - Get udma pipeline id of queueue pair.
+ */
+uint8_t ionic_dv_qp_get_udma_idx(struct ibv_qp *ibqp);
+
+
+/**
+ * ionic_dv_pd_set_sqcmb - Specify send queue preference for controller memory bar.
+ *
+ * Send queues associated with this pd will use the controller memory bar according to
+ * this preference at the time of queue creation.
+ *
+ * @enable - Allow the use of the controller memory bar.
+ * @expdb - Allow the use of express doorbell optimizations.
+ * @require - Require preferences to be met, no fallback.
+ */
+int ionic_dv_pd_set_sqcmb(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
+
+/**
+ * ionic_dv_pd_set_rqcmb - Specify receive queue preference for controller memory bar.
+ *
+ * See ionic_dv_pd_set_sqcmb().
+ */
+int ionic_dv_pd_set_rqcmb(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
+
+
+/**
+ * ionic_dv_qp_set_gda - Enable or disable GPU-Direct Async (GDA) mode.
+ *
+ * In GDA mode, when the application calls ibv_post_send() or ibv_post_recv(), the
+ * provider writes WQEs in the descriptor ring without ringing the doorbell.
+ *
+ * To ring the doorbell, after posting the work the application should query to get the
+ * doorbell data, and later write that data to the memory mapped doorbell register.
+ *
+ * See also: ionic_dv_get_ctx()
+ * See also: ionic_dv_qp_get_send_dbell_data()
+ * See also: ionic_dv_qp_get_recv_dbell_data()
+ *
+ * @ibqp - Set GDA mode for this queue pair.
+ * @enable_send - Enable GDA mode for the send queue.
+ * @enable_recv - Enable GDA mode for the recv queue.
+ */
+int ionic_dv_qp_set_gda(struct ibv_qp *ibqp, bool enable_send, bool enable_recv);
+
+/**
+ * ionic_dv_qp_get_send_dbell_data - Get send queue doorbell data.
+ *
+ * In GDA mode, when the application calls ibv_post_send() the provider writes WQEs in
+ * the descriptor ring without ringing the doorbell.  The application should query the
+ * doorbell data immediately after posting the work.  The application requests the
+ * GPU to fill the source buffers of the data transfer with the result of computation.
+ * The application requests the GPU to write the doorbell data to the memory mapped
+ * doorbell register immediately when the computation is complete, triggering the data
+ * transfer.
+ *
+ * It is important that the GPU ring the doorbell in sequential order.  If work requests
+ * are posted in batches A, B, and C, with respective doorbell data, the data path must
+ * not write B or C before A, and must not write C before B.  It is ok to skip writing a
+ * doorbell, like writing only C, which will trigger the data transfer for all of the
+ * work up to that point in the sequence.
+ *
+ * @ibqp - Get send doorbell data for this queue pair.
+ * @dbdata - Output parameter for doorbell data.
+ */
+int ionic_dv_qp_get_send_dbell_data(struct ibv_qp *ibqp, uint64_t *dbdata);
+
+/**
+ * ionic_dv_qp_get_recv_dbell_data - Get recv queue doorbell data.
+ *
+ * In GDA mode, when the application calls ibv_post_recv() the provider writes WQEs in
+ * the descriptor ring without ringing the doorbell.  After polling recv completions, the
+ * application can immediately re-post the receive buffers without ringing the doorbell.
+ * The application should query the doorbell data immediately after posting the buffers.
+ * The application requests the GPU consume the data from the receive buffers.  The
+ * application requests the GPU to write the doorbell data to the memory mapped doorbell
+ * register immediately after the received data is consumed, making the buffers available
+ * for the next data transfer.
+ *
+ * It is important that the GPU ring the doorbell in sequential order.  If work requests
+ * are posted in batches A, B, and C, with respective doorbell data, the data path must
+ * not write B or C before A, and must not write C before B.  It is ok to skip writing a
+ * doorbell, like writing only C, which will make buffers available up to that point in
+ * the sequence.
+ *
+ * @ibqp - Get recv doorbell data for this queue pair.
+ * @dbdata - Output parameter for doorbell data.
+ */
+int ionic_dv_qp_get_recv_dbell_data(struct ibv_qp *ibqp, uint64_t *dbdata);
+
+
+/**
+ * ionic_dv_get_ctx - Extract context information for gpu-initiated rdma.
+ */
+int ionic_dv_get_ctx(struct ionic_dv_ctx *dvctx, struct ibv_context *ibctx);
+
+/**
+ * ionic_dv_get_cq - Extract cq information for gpu-initiated rdma.
+ */
+int ionic_dv_get_cq(struct ionic_dv_cq *dvcq, struct ibv_cq *ibcq, uint8_t udma_idx);
+
+/**
+ * ionic_dv_get_qp - Extract qp information for gpu-initiated rdma.
+ */
+int ionic_dv_get_qp(struct ionic_dv_qp *dvqp, struct ibv_qp *ibqp);
+
+/**
+ * ionic_dv_qp_set_puec_plane_route - set route info for a PUEC plane.
+ */
+int ionic_dv_qp_set_puec_plane_route(struct ibv_qp *ibqp, uint8_t plane_idx,
+				     struct ionic_dv_puec_route *ipr);
+
+#endif /* IONIC_DV_H */
@@ -0,0 +1,534 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2018-2022 Pensando Systems, Inc.  All rights reserved.
+ *               2022-2024 Advanced Micro Devices, Inc.  All rights reserved.
+ */
+
+#ifndef IONIC_FW_H
+#define IONIC_FW_H
+
+#if !defined(__cplusplus)
+#include <util/util.h>
+#else
+#ifndef BIT
+#define BIT(n) (1u << (n))
+#endif
+#endif
+
+#define IONIC_EXP_DBELL_SZ 8
+
+/* common to all versions */
+
+/* wqe scatter gather element */
+struct ionic_sge {
+	__be64				va;
+	__be32				len;
+	__be32				lkey;
+};
+
+/* admin queue mr type */
+enum ionic_mr_flags {
+	/* bits that determine mr access */
+	IONIC_MRF_LOCAL_WRITE		= BIT(0),
+	IONIC_MRF_REMOTE_WRITE		= BIT(1),
+	IONIC_MRF_REMOTE_READ		= BIT(2),
+	IONIC_MRF_REMOTE_ATOMIC		= BIT(3),
+	IONIC_MRF_MW_BIND		= BIT(4),
+	IONIC_MRF_ZERO_BASED		= BIT(5),
+	IONIC_MRF_ON_DEMAND		= BIT(6),
+	IONIC_MRF_PB			= BIT(7),
+	IONIC_MRF_ACCESS_MASK		= BIT(12) - 1,
+
+	/* bits that determine mr type */
+	IONIC_MRF_IS_MW			= BIT(14),
+	IONIC_MRF_INV_EN		= BIT(15),
+
+	/* base flags combinations for mr types */
+	IONIC_MRF_USER_MR		= 0,
+	IONIC_MRF_PHYS_MR		= IONIC_MRF_INV_EN,
+	IONIC_MRF_MW_1			= IONIC_MRF_IS_MW,
+	IONIC_MRF_MW_2			= IONIC_MRF_IS_MW | IONIC_MRF_INV_EN,
+};
+
+static inline int to_ionic_mr_flags(int access)
+{
+	int flags = 0;
+
+	if (access & IBV_ACCESS_LOCAL_WRITE)
+		flags |= IONIC_MRF_LOCAL_WRITE;
+
+	if (access & IBV_ACCESS_REMOTE_READ)
+		flags |= IONIC_MRF_REMOTE_READ;
+
+	if (access & IBV_ACCESS_REMOTE_WRITE)
+		flags |= IONIC_MRF_REMOTE_WRITE;
+
+	if (access & IBV_ACCESS_REMOTE_ATOMIC)
+		flags |= IONIC_MRF_REMOTE_ATOMIC;
+
+	if (access & IBV_ACCESS_MW_BIND)
+		flags |= IONIC_MRF_MW_BIND;
+
+	if (access & IBV_ACCESS_ZERO_BASED)
+		flags |= IONIC_MRF_ZERO_BASED;
+
+	return flags;
+}
+
+/* cqe status indicated in status_length field when err bit is set */
+enum ionic_status {
+	IONIC_STS_OK,
+	IONIC_STS_LOCAL_LEN_ERR,
+	IONIC_STS_LOCAL_QP_OPER_ERR,
+	IONIC_STS_LOCAL_PROT_ERR,
+	IONIC_STS_WQE_FLUSHED_ERR,
+	IONIC_STS_MEM_MGMT_OPER_ERR,
+	IONIC_STS_BAD_RESP_ERR,
+	IONIC_STS_LOCAL_ACC_ERR,
+	IONIC_STS_REMOTE_INV_REQ_ERR,
+	IONIC_STS_REMOTE_ACC_ERR,
+	IONIC_STS_REMOTE_OPER_ERR,
+	IONIC_STS_RETRY_EXCEEDED,
+	IONIC_STS_RNR_RETRY_EXCEEDED,
+	IONIC_STS_XRC_VIO_ERR,
+};
+
+static inline int ionic_to_ibv_status(int sts)
+{
+	switch (sts) {
+	case IONIC_STS_OK:
+		return IBV_WC_SUCCESS;
+	case IONIC_STS_LOCAL_LEN_ERR:
+		return IBV_WC_LOC_LEN_ERR;
+	case IONIC_STS_LOCAL_QP_OPER_ERR:
+		return IBV_WC_LOC_QP_OP_ERR;
+	case IONIC_STS_LOCAL_PROT_ERR:
+		return IBV_WC_LOC_PROT_ERR;
+	case IONIC_STS_WQE_FLUSHED_ERR:
+		return IBV_WC_WR_FLUSH_ERR;
+	case IONIC_STS_MEM_MGMT_OPER_ERR:
+		return IBV_WC_MW_BIND_ERR;
+	case IONIC_STS_BAD_RESP_ERR:
+		return IBV_WC_BAD_RESP_ERR;
+	case IONIC_STS_LOCAL_ACC_ERR:
+		return IBV_WC_LOC_ACCESS_ERR;
+	case IONIC_STS_REMOTE_INV_REQ_ERR:
+		return IBV_WC_REM_INV_REQ_ERR;
+	case IONIC_STS_REMOTE_ACC_ERR:
+		return IBV_WC_REM_ACCESS_ERR;
+	case IONIC_STS_REMOTE_OPER_ERR:
+		return IBV_WC_REM_OP_ERR;
+	case IONIC_STS_RETRY_EXCEEDED:
+		return IBV_WC_RETRY_EXC_ERR;
+	case IONIC_STS_RNR_RETRY_EXCEEDED:
+		return IBV_WC_RNR_RETRY_EXC_ERR;
+	case IONIC_STS_XRC_VIO_ERR:
+	default:
+		return IBV_WC_GENERAL_ERR;
+	}
+}
+
+/* fw abi v1 */
+
+/* data payload part of v1 wqe */
+union ionic_v1_pld {
+	struct ionic_sge	sgl[2];
+	__be32			spec32[8];
+	__be16			spec16[16];
+	__u8			data[32];
+};
+
+/* completion queue v1 cqe */
+struct ionic_v1_cqe {
+	union {
+		struct {
+			__le64		wqe_idx_timestamp;
+			__be32		src_qpn_op;
+			__u8		src_mac[6];
+			__be16		vlan_tag;
+			__be32		imm_data_rkey;
+		} recv;
+		struct {
+			__u8		rsvd[4];
+			__be32		msg_msn;
+			__u8		rsvd2[8];
+			__le64		npg_wqe_idx_timestamp;
+		} send;
+	};
+	__be32				status_length;
+	__be32				qid_type_flags;
+};
+
+/* bits for cqe wqe_idx and timestamp */
+enum ionic_v1_cqe_wqe_idx_timestamp_bits {
+	IONIC_V1_CQE_WQE_IDX_MASK	= 0xffff,
+	IONIC_V1_CQE_TIMESTAMP_SHIFT	= 16,
+};
+
+/* bits for cqe recv */
+enum ionic_v1_cqe_src_qpn_bits {
+	IONIC_V1_CQE_RECV_QPN_MASK	= 0xffffff,
+	IONIC_V1_CQE_RECV_OP_SHIFT	= 24,
+
+	/* MASK could be 0x3, but need 0x1f for makeshift values:
+	 * OP_TYPE_RDMA_OPER_WITH_IMM, OP_TYPE_SEND_RCVD
+	 */
+	IONIC_V1_CQE_RECV_OP_MASK	= 0x1f,
+	IONIC_V1_CQE_RECV_OP_SEND	= 0,
+	IONIC_V1_CQE_RECV_OP_SEND_INV	= 1,
+	IONIC_V1_CQE_RECV_OP_SEND_IMM	= 2,
+	IONIC_V1_CQE_RECV_OP_RDMA_IMM	= 3,
+
+	IONIC_V1_CQE_RECV_IS_IPV4	= BIT(7 + IONIC_V1_CQE_RECV_OP_SHIFT),
+	IONIC_V1_CQE_RECV_IS_VLAN	= BIT(6 + IONIC_V1_CQE_RECV_OP_SHIFT),
+};
+
+/* bits for cqe qid_type_flags */
+enum ionic_v1_cqe_qtf_bits {
+	IONIC_V1_CQE_COLOR		= BIT(0),
+	IONIC_V1_CQE_ERROR		= BIT(1),
+	IONIC_V1_CQE_TYPE_SHIFT		= 5,
+	IONIC_V1_CQE_TYPE_MASK		= 0x7,
+	IONIC_V1_CQE_QID_SHIFT		= 8,
+
+	IONIC_V1_CQE_TYPE_RECV		= 1,
+	IONIC_V1_CQE_TYPE_SEND_MSN	= 2,
+	IONIC_V1_CQE_TYPE_SEND_NPG	= 3,
+	IONIC_V1_CQE_TYPE_RECV_INDIR	= 4,
+};
+
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_HCC__)
+
+static inline bool ionic_v1_cqe_color(struct ionic_v1_cqe *cqe)
+{
+	return !!(cqe->qid_type_flags & htobe32(IONIC_V1_CQE_COLOR));
+}
+
+static inline bool ionic_v1_cqe_error(struct ionic_v1_cqe *cqe)
+{
+	return !!(cqe->qid_type_flags & htobe32(IONIC_V1_CQE_ERROR));
+}
+
+static inline bool ionic_v1_cqe_recv_is_ipv4(struct ionic_v1_cqe *cqe)
+{
+	return !!(cqe->recv.src_qpn_op &
+		  htobe32(IONIC_V1_CQE_RECV_IS_IPV4));
+}
+
+static inline bool ionic_v1_cqe_recv_is_vlan(struct ionic_v1_cqe *cqe)
+{
+	return !!(cqe->recv.src_qpn_op &
+		  htobe32(IONIC_V1_CQE_RECV_IS_VLAN));
+}
+
+static inline void ionic_v1_cqe_clean(struct ionic_v1_cqe *cqe)
+{
+	cqe->qid_type_flags |= htobe32(~0u << IONIC_V1_CQE_QID_SHIFT);
+}
+
+static inline uint32_t ionic_v1_cqe_qtf(struct ionic_v1_cqe *cqe)
+{
+	return be32toh(cqe->qid_type_flags);
+}
+
+#endif // !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_HCC__)
+
+static inline uint8_t ionic_v1_cqe_qtf_type(uint32_t qtf)
+{
+	return (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
+}
+
+static inline uint32_t ionic_v1_cqe_qtf_qid(uint32_t qtf)
+{
+	return qtf >> IONIC_V1_CQE_QID_SHIFT;
+}
+
+/* v1 base wqe header */
+struct ionic_v1_base_hdr {
+	__le64				wqe_idx;
+	__u8				op;
+	__u8				num_sge_key;
+	__be16				flags;
+	__be32				imm_data_key;
+};
+
+/* v1 receive wqe body */
+struct ionic_v1_recv_bdy {
+	__u8				rsvd[16];
+	union ionic_v1_pld		pld;
+};
+
+/* v1 send/rdma wqe body (common, has sgl) */
+struct ionic_v1_common_bdy {
+	union {
+		struct {
+			__be32		ah_id;
+			__be32		dest_qpn;
+			__be32		dest_qkey;
+		} send;
+		struct {
+			__be32		remote_va_high;
+			__be32		remote_va_low;
+			__be32		remote_rkey;
+		} rdma;
+	};
+	__be32				length;
+	union ionic_v1_pld		pld;
+};
+
+/* v1 atomic wqe body */
+struct ionic_v1_atomic_bdy {
+	__be32				remote_va_high;
+	__be32				remote_va_low;
+	__be32				remote_rkey;
+	__be32				swap_add_high;
+	__be32				swap_add_low;
+	__be32				compare_high;
+	__be32				compare_low;
+	__u8				rsvd[4];
+	struct ionic_sge		sge;
+};
+
+/* v2 atomic wqe body */
+struct ionic_v2_atomic_bdy {
+	__be32				remote_va_high;
+	__be32				remote_va_low;
+	__be32				remote_rkey;
+	__be32				swap_add_high;
+	__be32				swap_add_low;
+	__be32				compare_high;
+	__be32				compare_low;
+	__be32				lkey;
+	__be64				local_va;
+	__u8				rsvd_expdb[8];
+};
+
+/* v1 bind mw wqe body */
+struct ionic_v1_bind_mw_bdy {
+	__be64				va;
+	__be64				length;
+	__be32				lkey;
+	__be16				flags;
+	__u8				rsvd[26];
+};
+
+/* v1 send/recv wqe */
+struct ionic_v1_wqe {
+	struct ionic_v1_base_hdr	base;
+	union {
+		struct ionic_v1_recv_bdy	recv;
+		struct ionic_v1_common_bdy	common;
+		struct ionic_v1_atomic_bdy	atomic;
+		struct ionic_v2_atomic_bdy	atomic_v2;
+		struct ionic_v1_bind_mw_bdy	bind_mw;
+	};
+};
+
+/* queue pair v1 send opcodes */
+enum ionic_v1_op {
+	IONIC_V1_OP_SEND,
+	IONIC_V1_OP_SEND_INV,
+	IONIC_V1_OP_SEND_IMM,
+	IONIC_V1_OP_RDMA_READ,
+	IONIC_V1_OP_RDMA_WRITE,
+	IONIC_V1_OP_RDMA_WRITE_IMM,
+	IONIC_V1_OP_ATOMIC_CS,
+	IONIC_V1_OP_ATOMIC_FA,
+	IONIC_V1_OP_REG_MR,
+	IONIC_V1_OP_LOCAL_INV,
+	IONIC_V1_OP_BIND_MW,
+
+	/* flags */
+	IONIC_V1_FLAG_FENCE		= BIT(0),
+	IONIC_V1_FLAG_SOL		= BIT(1),
+	IONIC_V1_FLAG_INL		= BIT(2),
+	IONIC_V1_FLAG_SIG		= BIT(3),
+	IONIC_V1_FLAG_COLOR		= BIT(4),
+
+	/* flags last four bits for sgl spec format */
+	IONIC_V1_FLAG_SPEC32		= (1u << 12),
+	IONIC_V1_FLAG_SPEC16		= (2u << 12),
+	IONIC_V1_SPEC_FIRST_SGE		= 2,
+};
+
+/* queue pair v2 send opcodes */
+enum ionic_v2_op {
+	IONIC_V2_OPSL_OUT               = 0x20,
+	IONIC_V2_OPSL_IMM               = 0x40,
+	IONIC_V2_OPSL_INV               = 0x80,
+
+	IONIC_V2_OP_SEND                = 0x0 | IONIC_V2_OPSL_OUT,
+	IONIC_V2_OP_SEND_IMM            = IONIC_V2_OP_SEND | IONIC_V2_OPSL_IMM,
+	IONIC_V2_OP_SEND_INV            = IONIC_V2_OP_SEND | IONIC_V2_OPSL_INV,
+
+	IONIC_V2_OP_RDMA_WRITE          = 0x1 | IONIC_V2_OPSL_OUT,
+	IONIC_V2_OP_RDMA_WRITE_IMM      = IONIC_V2_OP_RDMA_WRITE | IONIC_V2_OPSL_IMM,
+
+	IONIC_V2_OP_RDMA_READ           = 0x2,
+
+	IONIC_V2_OP_ATOMIC_CS           = 0x4,
+	IONIC_V2_OP_ATOMIC_FA           = 0x5,
+	IONIC_V2_OP_REG_MR              = 0x6,
+	IONIC_V2_OP_LOCAL_INV           = 0x7,
+	IONIC_V2_OP_BIND_MW             = 0x8,
+};
+
+#if !defined(__cplusplus)
+
+static inline size_t ionic_v1_send_wqe_min_size(int min_sge, int min_data,
+						int spec, bool expdb)
+{
+	size_t sz_wqe, sz_sgl, sz_data;
+
+	if (spec > IONIC_V1_SPEC_FIRST_SGE)
+		min_sge += IONIC_V1_SPEC_FIRST_SGE;
+
+	if (expdb) {
+		min_sge += 1;
+		min_data += IONIC_EXP_DBELL_SZ;
+	}
+
+	sz_wqe = sizeof(struct ionic_v1_wqe);
+	sz_sgl = offsetof(struct ionic_v1_wqe, common.pld.sgl[min_sge]);
+	sz_data = offsetof(struct ionic_v1_wqe, common.pld.data[min_data]);
+
+	if (sz_sgl > sz_wqe)
+		sz_wqe = sz_sgl;
+
+	if (sz_data > sz_wqe)
+		sz_wqe = sz_data;
+
+	return roundup_pow_of_two(sz_wqe);
+}
+
+static inline int ionic_v1_send_wqe_max_sge(uint8_t stride_log2, int spec, bool expdb)
+{
+	struct ionic_v1_wqe *wqe = (void *)0;
+	struct ionic_sge *sge = (void *)(1ull << stride_log2);
+	int num_sge = 0;
+
+	if (expdb)
+		sge -= 1;
+
+	if (spec > IONIC_V1_SPEC_FIRST_SGE)
+		num_sge = IONIC_V1_SPEC_FIRST_SGE;
+
+	num_sge = sge - &wqe->common.pld.sgl[num_sge];
+
+	if (spec && num_sge > spec)
+		num_sge = spec;
+
+	return num_sge;
+}
+
+static inline int ionic_v1_send_wqe_max_data(uint8_t stride_log2, bool expdb)
+{
+	struct ionic_v1_wqe *wqe = (void *)0;
+	__u8 *data = (void *)(1ull << stride_log2);
+
+	if (expdb)
+		data -= IONIC_EXP_DBELL_SZ;
+
+	return data - wqe->common.pld.data;
+}
+
+static inline size_t ionic_v1_recv_wqe_min_size(int min_sge, int spec, bool expdb)
+{
+	size_t sz_wqe, sz_sgl;
+
+	if (spec > IONIC_V1_SPEC_FIRST_SGE)
+		min_sge += IONIC_V1_SPEC_FIRST_SGE;
+
+	if (expdb)
+		min_sge += 1;
+
+	sz_wqe = sizeof(struct ionic_v1_wqe);
+	sz_sgl = offsetof(struct ionic_v1_wqe, recv.pld.sgl[min_sge]);
+
+	if (sz_sgl > sz_wqe)
+		sz_wqe = sz_sgl;
+
+	return sz_wqe;
+}
+
+static inline int ionic_v1_recv_wqe_max_sge(uint8_t stride_log2, int spec, bool expdb)
+{
+	struct ionic_v1_wqe *wqe = (void *)0;
+	struct ionic_sge *sge = (void *)(1ull << stride_log2);
+	int num_sge = 0;
+
+	if (expdb)
+		sge -= 1;
+
+	if (spec > IONIC_V1_SPEC_FIRST_SGE)
+		num_sge = IONIC_V1_SPEC_FIRST_SGE;
+
+	num_sge = sge - &wqe->recv.pld.sgl[num_sge];
+
+	if (spec && num_sge > spec)
+		num_sge = spec;
+
+	return num_sge;
+}
+
+static inline int ionic_v1_use_spec_sge(int min_sge, int spec)
+{
+	if (!spec || min_sge > spec)
+		return 0;
+
+	if (min_sge <= IONIC_V1_SPEC_FIRST_SGE)
+		return IONIC_V1_SPEC_FIRST_SGE;
+
+	return spec;
+}
+
+#define IONIC_RCQ_SIZE 4096
+#define IONIC_RCQ_DEPTH 128
+#define IONIC_RCQ_DEPTH_LOG2 7
+#define IONIC_RCQ_STRIDE_LOG2 4
+
+struct ionic_rcq_hdr {
+	uint8_t pad[60];
+	uint32_t seq_pad;
+};
+
+struct ionic_rcqe {
+	uint32_t status_length;
+	uint32_t imm_data;
+	uint32_t seq_flags;
+	uint32_t rsvd;
+};
+
+enum ionic_rcqe_flag {
+	IONIC_RCQE_C = BIT(7),
+	IONIC_RCQE_I = BIT(6),
+};
+
+struct ionic_rcq {
+	struct ionic_rcq_hdr hdr;
+	struct ionic_rcqe ring[IONIC_RCQ_DEPTH];
+};
+
+static inline uint32_t ionic_rcq_hdr_seq(struct ionic_rcq_hdr *hdr)
+{
+	return be32toh(hdr->seq_pad) >> 8;
+}
+
+static inline uint32_t ionic_rcqe_seq(struct ionic_rcqe *rcqe)
+{
+	return be32toh(rcqe->seq_flags) >> 8;
+}
+
+static inline bool ionic_rcqe_color(struct ionic_rcqe *rcqe)
+{
+	return !!(rcqe->seq_flags & htobe32(IONIC_RCQE_C));
+}
+
+static inline bool ionic_rcqe_imm(struct ionic_rcqe *rcqe)
+{
+	return !!(rcqe->seq_flags & htobe32(IONIC_RCQE_I));
+}
+
+#endif // !defined(__cplusplus)
+
+#endif /* IONIC_FW_H */
@@ -25,11 +25,19 @@
 #ifndef LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_
 #define LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_

-#ifdef GDA_IONIC
 extern "C" {
-#include <infiniband/ionic_dv.h>
-#include <infiniband/ionic_fw.h>
+#include "gda/ionic/ionic_dv.h"
+#include "gda/ionic/ionic_fw.h"
 }
-#endif
+
+struct ionicdv_funcs_t {
+  int (*get_ctx)(struct ionic_dv_ctx *dvctx, struct ibv_context *ibctx);
+  uint8_t (*qp_get_udma_idx)(struct ibv_qp *ibqp);
+  int (*get_cq)(struct ionic_dv_cq *dvcq, struct ibv_cq *ibcq, uint8_t udma_idx);
+  int (*get_qp)(struct ionic_dv_qp *dvqp, struct ibv_qp *ibqp);
+  int (*pd_set_sqcmb)(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
+  int (*pd_set_rqcmb)(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
+  int (*pd_set_udma_mask)(struct ibv_pd *ibpd, uint8_t udma_mask);
+};

 #endif  //LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_
@@ -0,0 +1,312 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "gda/queue_pair.hpp"
+#include "gda/endian.hpp"
+#include "util.hpp"
+#include "containers/free_list_impl.hpp"
+
+namespace rocshmem {
+
+__device__ uint64_t QueuePair::get_same_qp_lane_mask() {
+  uint64_t lane_mask = get_active_lane_mask();
+  uintptr_t this_val = reinterpret_cast<uintptr_t>(this);
+
+  // exclude threads operating on a different qp from this thread lane mask
+  #pragma unroll
+  for (int i = 0; i < 64; ++i) {
+    uint64_t bit_i = 1ull << i;
+    if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) {
+      lane_mask &= ~bit_i;
+    }
+  }
+
+  return lane_mask;
+}
+
+__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) {
+  uint32_t my_sq_prod = 0;
+
+  // reserve space for wqes in sq
+  if (is_first_active_lane(activemask)) {
+    my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask));
+
+  // wait for that space to be available
+  ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask);
+
+  return my_sq_prod;
+}
+
+__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) {
+  uint32_t dbprod = my_sq_prod + num_wqes;
+
+  spin_lock_acquire_shared(&sq_lock, activemask);
+
+  if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) {
+    sq_dbprod = dbprod;
+
+    ionic_ring_doorbell(dbprod);
+  }
+
+  spin_lock_release_shared(&sq_lock, activemask);
+
+  return dbprod;
+}
+
+__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
+  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
+  uint32_t my_cq_pos = cq_pos + my_logical_lane_id;
+
+  /* Look at the cqe at the current position in the cq buffer */
+  struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask];
+
+  /* Determine expected color based on cq wrap count */
+  uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
+  uint32_t qtf_color_exp = qtf_color_bit;
+  if (my_cq_pos & (cq_mask + 1)) {
+    qtf_color_exp = 0;
+  }
+
+  /* Check if my cqe color == expected color */
+  uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
+  if ((qtf_be & qtf_color_bit) != qtf_color_exp) {
+    return;
+  }
+
+  uint32_t msn = swap_endian_val<uint32_t>(cqe->send.msg_msn);
+
+  /* Report if the completion indicates an error. */
+  if (!!(qtf_be & swap_endian_val<uint32_t>(IONIC_V1_CQE_ERROR))) {
+#if defined(DEBUG)
+    uint32_t qtf = swap_endian_val<uint32_t>(qtf_be);
+    uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT;
+    uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
+    uint32_t flag = qtf & 0xf;
+    uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
+    uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK;
+
+    printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n",
+           dev_name, qid, type, flag, status, msn, npg);
+#endif
+    /* No other way to signal an error, so just crash. */
+    abort();
+  }
+
+  /* Only proceed with the furthest ahead cqe to update the sq state */
+  uint64_t my_lane_mask = 1ull << __lane_id();
+  uint64_t lesser_lane_mask = my_lane_mask - 1;
+  if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) {
+    return;
+  }
+
+  /* update position in the cq */
+  cq_pos = my_cq_pos + 1;
+
+  /*
+   * Ring cq doorbell frequently enough to avoid cq full.
+   *
+   * NB: IONIC_CQ_GRACE is 100
+   */
+  if (((cq_pos - cq_dbpos) & cq_mask) >= 100) {
+    cq_dbpos = cq_pos;
+    __atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed?
+  }
+
+  sq_msn = msn;
+}
+
+__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) {
+  uint32_t greed = 10;
+
+  /* wait for sq_msn to catch up or pass cons. */
+  /* 0x800000 - sign bit for 24-bit fields     */
+  while ((sq_msn - cons) & 0x800000) {
+    if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) {
+      continue;
+    }
+
+    /* with lock acquired, this wave polls cqes until caught up */
+    while ((sq_msn - cons) & 0x800000) {
+      uint32_t old_sq_msn = sq_msn;
+
+      poll_wave_cqes(activemask);
+
+      if (!((sq_msn - cons) & 0x800000)) {
+        if (sq_msn == old_sq_msn) {
+          break;
+        }
+        if (!greed) {
+          break;
+        }
+        --greed;
+      }
+    }
+
+    spin_lock_release_shared(&cq_lock, activemask);
+    break;
+  }
+}
+
+__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) {
+  // TODO When threads write at once to the same address, not all writes reach the bus.
+  for (int i = 0; i < 64; ++i) {
+    if (__lane_id() == i) {
+      __threadfence();
+      __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST);
+    }
+  }
+  __threadfence();
+}
+
+__device__ void QueuePair::ionic_quiet() {
+  ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod);
+}
+
+__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
+  uint64_t activemask = get_same_qp_lane_mask();
+  uint32_t num_wqes = get_active_lane_count(activemask);
+  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
+  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
+  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
+  struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
+  uint16_t wqe_flags = 0;
+
+  if (!(my_sq_pos & (sq_mask + 1))) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
+  }
+
+  if (is_last_active_lane(activemask)) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
+  }
+
+  // TODO why is this needed?
+  if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) {
+    size = 1;
+  }
+
+  wqe->base.wqe_idx = my_sq_pos;
+  wqe->base.op = opcode;
+  wqe->base.num_sge_key = size ? 1 : 0;
+  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
+
+  wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
+  wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
+  wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
+  wqe->common.length = swap_endian_val<uint32_t>(size);
+
+  if (size) {
+    if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
+      wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
+      wqe->base.num_sge_key = 0;
+      if (!laddr) {
+        // TODO why is this needed?
+        wqe->common.pld.data[0] = 1;
+      } else {
+        memcpy(wqe->common.pld.data, laddr, size);
+      }
+    } else {
+      wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(laddr));
+      wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
+      wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
+    }
+  }
+
+  __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
+
+  commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
+}
+
+__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
+                                                  int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
+  uint64_t activemask = get_same_qp_lane_mask();
+  uint32_t num_wqes = get_active_lane_count(activemask);
+  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
+  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
+  struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
+  uint16_t wqe_flags = 0;
+  uint32_t cons;
+
+  uint64_t* wave_fetch_atomic{nullptr};
+  if (fetching) {
+    if (is_leader) {
+      auto res = fetching_atomic_freelist->pop_front();
+      while (!res.success) {
+        res = fetching_atomic_freelist->pop_front();
+      }
+      wave_fetch_atomic = res.value;
+    }
+    wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
+  }
+
+  if (!(my_sq_pos & (sq_mask + 1))) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
+  }
+
+  if (is_last_active_lane(activemask)) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
+  }
+
+  wqe->base.wqe_idx = my_sq_pos;
+  wqe->base.op = opcode;
+  wqe->base.num_sge_key = 1;
+  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
+
+  wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
+  wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
+  wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
+  wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
+  wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
+  wqe->atomic_v2.compare_high = swap_endian_val<uint32_t>(atomic_cmp >> 32);
+  wqe->atomic_v2.compare_low = swap_endian_val<uint32_t>(atomic_cmp);
+
+  if (fetching) {
+    wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(wave_fetch_atomic + my_logical_lane_id));
+    wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(fetching_atomic_lkey);
+  } else {
+    wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(nonfetching_atomic));
+    wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
+  }
+
+  __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
+
+  cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
+
+  uint64_t ret{0};
+  if (fetching) {
+    ionic_quiet_internal(activemask, cons);
+    ret = wave_fetch_atomic[my_logical_lane_id];
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    if (is_leader) {
+      fetching_atomic_freelist->push_back(wave_fetch_atomic);
+    }
+  }
+  return ret;
+}
+
+}  // namespace rocshmem
@@ -0,0 +1,37 @@
+###############################################################################
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+###############################################################################
+target_sources(
+  ${PROJECT_NAME}
+  PRIVATE
+    backend_gda_mlx5.cpp
+)
+
+if(GDA_MLX5)
+target_sources(
+  ${PROJECT_NAME}
+  PRIVATE
+    queue_pair_mlx5.cpp
+    segment_builder.cpp
+)
+endif()
@@ -0,0 +1,122 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "gda/backend_gda.hpp"
+#include "util.hpp"
+
+namespace rocshmem {
+
+void* GDABackend::mlx5_dv_dlopen() {
+  void* dv_handle{nullptr};
+  dv_handle = dlopen("libmlx5.so", RTLD_NOW);
+  if (!dv_handle) {
+    DPRINTF("Could not open libmlx5.so. Returning\n");
+  }
+  return dv_handle;
+}
+
+int GDABackend::mlx5_dv_dl_init() {
+  mlx5dv_handle_ = mlx5_dv_dlopen();
+  if (!mlx5dv_handle_)
+    return ROCSHMEM_ERROR;
+
+  DLSYM_HELPER(mlx5dv, mlx5dv_, mlx5dv_handle_, init_obj);
+  return ROCSHMEM_SUCCESS;
+}
+
+void GDABackend::mlx5_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
+  mlx5dv_cq cq_out;
+  mlx5dv_obj mlx_obj;
+  mlx_obj.cq.in = cqs[conn_num];
+  mlx_obj.cq.out = &cq_out;
+  mlx5dv.init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
+  dump_mlx5dv_cq(&cq_out, conn_num);
+
+  /*
+   * struct mlx5dv_cq {
+   *   void                    *buf;
+   *   __be32                  *dbrec;
+   *   uint32_t                cqe_cnt;
+   *   uint32_t                cqe_size;
+   *   void                    *cq_uar;
+   *   uint32_t                cqn;
+   *   uint64_t                comp_mask;
+   * };
+   */
+
+  gpu_qp->cq_buf = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
+  gpu_qp->cq_cnt = cq_out.cqe_cnt;
+  gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt);
+  gpu_qp->cq_dbrec = cq_out.dbrec;
+
+  mlx5dv_qp qp_out;
+  mlx_obj.qp.in = qps[conn_num];
+  mlx_obj.qp.out = &qp_out;
+  mlx5dv.init_obj(&mlx_obj, MLX5DV_OBJ_QP);
+  dump_mlx5dv_qp(&qp_out, conn_num);
+
+  /*
+   * struct mlx5dv_qp {
+   *   __be32 *dbrec;
+   *   struct {
+   *     void *buf;
+   *     uint32_t wqe_cnt;
+   *     uint32_t stride;
+   *   } sq;
+   *   struct {
+   *     void *buf;
+   *     uint32_t wqe_cnt;
+   *     uint32_t stride;
+   *   } rq;
+   *   struct {
+   *     void *reg;
+   *     uint32_t size;
+   *   } bf;
+   *   uint64_t comp_mask;
+   *   off_t uar_mmap_offset;
+   *   uint32_t tirn;
+   *   uint32_t tisn;
+   *   uint32_t rqn;
+   *   uint32_t sqn;
+   *   uint64_t tir_icm_addr;
+   * };
+   */
+
+  gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR
+  gpu_qp->sq_buf = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
+  gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt;
+  gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
+  gpu_qp->lkey = htobe32(heap_mr->lkey);
+  gpu_qp->qp_num = qps[conn_num]->qp_num;
+  gpu_qp->inline_threshold = inline_threshold;
+  // The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
+
+  int hip_dev_id{-1};
+  CHECK_HIP(hipGetDevice(&hip_dev_id));
+  void* gpu_ptr{nullptr};
+  rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
+  gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
+}
+
+}  // namespace rocshmem
@@ -34,4 +34,8 @@ typedef union db_reg {
  uintptr_t uint;
 } db_reg_t;

+struct mlx5dv_funcs_t {
+  int (*init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type);
+};
+
 #endif  //LIBRARY_SRC_GDA_MLX5_GDA_PROVIDER_HPP_
@@ -0,0 +1,273 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#include "gda/queue_pair.hpp"
+#include "util.hpp"
+#include "containers/free_list_impl.hpp"
+#include "gda/endian.hpp"
+#include "segment_builder.hpp"
+
+namespace rocshmem {
+
+__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) {
+  swap_endian_store(const_cast<uint32_t*>(dbrec), (uint32_t)my_sq_counter);
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+  __hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
+  uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  db_uint ^= 0x100;
+  __hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__ void QueuePair::mlx5_quiet() {
+  constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE;
+  __shared__ uint64_t wqe_broadcast[BROADCAST_SIZE];
+  uint8_t wavefront_id = get_flat_block_id() / WF_SIZE;
+  wqe_broadcast[wavefront_id] = 0;
+
+  uint64_t activemask = get_active_lane_mask();
+  uint8_t num_active_lanes = get_active_lane_count(activemask);
+  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+
+  while (true) {
+    bool done{false};
+    uint64_t quiet_amount{0};
+    uint64_t wave_cq_consumer{0};
+    while (!done) {
+      uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      if (!(posted - completed)) {
+        return;
+      }
+      int64_t quiet_val = posted - active;
+      if (quiet_val <= 0) {
+        continue;
+      }
+      quiet_amount = min(num_active_lanes, quiet_val);
+      if (is_leader) {
+        done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+        if (done) {
+          wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+        }
+      }
+      done = __shfl(done, leader_phys_lane_id);
+    }
+    wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id);
+    uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id;
+    uint64_t my_cq_index = my_cq_consumer % cq_cnt;
+
+    if (my_logical_lane_id < quiet_amount) {
+      volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index];
+      uint16_t be_wqe_counter{0};
+      uint8_t op_own{0};
+      uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1;
+      bool vote_failed{true};
+
+      while (vote_failed) {
+        op_own = *((volatile uint8_t*)&cqe_entry->op_own);
+        bool my_ownership_vote = (op_own & 1) == owner_bit;
+        bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID;
+        uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote);
+        vote_failed = __popcll(votes) < quiet_amount;
+        if (!vote_failed) {
+          be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter);
+        }
+      }
+
+      uint16_t wqe_counter;
+      swap_endian_store(const_cast<uint16_t*>(&wqe_counter), reinterpret_cast<uint16_t>(be_wqe_counter));
+      uint64_t wqe_id =  outstanding_wqes[wqe_counter];
+      __hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+      uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit;
+      *((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits;
+      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    }
+    if (is_leader) {
+      uint64_t completed {0};
+      do {
+        completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      } while (completed != wave_cq_consumer);
+
+      swap_endian_store(const_cast<uint32_t*>(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount));
+      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+      uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id];
+      __hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      __hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    }
+  }
+}
+
+__device__ void QueuePair::mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
+  uint64_t activemask = get_active_lane_mask();
+  uint8_t num_active_lanes = get_active_lane_count(activemask);
+  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+  uint8_t num_wqes{num_active_lanes};
+  uint64_t wave_sq_counter{0};
+
+  if (is_leader) {
+    wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
+  uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
+  uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
+
+  while (true) {
+    uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    int64_t num_active_sq_entries = db_touched - sunk;
+    if (num_active_sq_entries < 0) {
+      continue;
+    }
+    uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
+    uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
+    if (num_free_entries > num_entries_until_wave_last_entry) {
+      break;
+    }
+    mlx5_quiet();
+  }
+
+  outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
+
+  SegmentBuilder seg_build(my_sq_index, sq_buf);
+  seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0);
+  seg_build.update_raddr_seg(raddr, rkey);
+
+  if (size <= inline_threshold && opcode == gda_op_rdma_write) {
+    seg_build.update_inl_data_seg(laddr, size);
+  } else {
+    seg_build.update_data_seg(laddr, size, lkey);
+  }
+
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+  if (is_leader) {
+    uint64_t db_touched {0};
+    do {
+      db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    } while (db_touched != wave_sq_counter);
+
+    uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
+    uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
+    mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
+
+    __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+}
+
+__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
+                                                 int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
+  uint64_t activemask = get_active_lane_mask();
+  uint8_t num_active_lanes = get_active_lane_count(activemask);
+  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
+  bool is_leader{my_logical_lane_id == 0};
+  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
+  uint8_t num_wqes{num_active_lanes};
+  uint64_t wave_sq_counter{0};
+
+  if (is_leader) {
+    wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+  wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
+  uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
+  uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
+
+  while (true) {
+    uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    int64_t num_active_sq_entries = db_touched - sunk;
+    if (num_active_sq_entries < 0) {
+      continue;
+    }
+    uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
+    uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
+    if (num_free_entries > num_entries_until_wave_last_entry) {
+      break;
+    }
+    mlx5_quiet();
+  }
+
+  uint64_t* wave_fetch_atomic{nullptr};
+  if (fetching) {
+    if (is_leader) {
+      uint64_t db_touched {0};
+      do {
+        db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      } while (db_touched != wave_sq_counter);
+
+      auto res = fetching_atomic_freelist->pop_front();
+      while (!res.success) {
+        res = fetching_atomic_freelist->pop_front();
+      }
+      wave_fetch_atomic = res.value;
+    }
+    wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
+  }
+
+  outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
+
+  SegmentBuilder seg_build(my_sq_index, sq_buf);
+  seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0);
+  seg_build.update_raddr_seg(raddr, rkey);
+  seg_build.update_atomic_seg(atomic_data, atomic_cmp);
+  if (fetching) {
+    seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey);
+  } else {
+    seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey);
+  }
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+  if (is_leader) {
+    uint64_t db_touched {0};
+    do {
+      db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    } while (db_touched != wave_sq_counter);
+
+    uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
+    uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
+    mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
+
+    __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  }
+
+  uint64_t ret{0};
+  if (fetching) {
+    mlx5_quiet();
+    ret = wave_fetch_atomic[my_logical_lane_id];
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    if (is_leader) {
+      fetching_atomic_freelist->push_back(wave_fetch_atomic);
+    }
+  }
+  return ret;
+}
+
+}  // namespace rocshmem
@@ -25,7 +25,7 @@
 #include "segment_builder.hpp"

 #include "util.hpp"
-#include "endian.hpp"
+#include "gda/endian.hpp"

 namespace rocshmem {

@@ -27,8 +27,6 @@
 #include <hip/hip_runtime.h>

 #include "backend_gda.hpp"
-#include "endian.hpp"
-#include "segment_builder.hpp"
 #include "constants.hpp"

 namespace rocshmem {
@@ -66,22 +64,33 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) {
  }

  /* Set Correct opcodes for each NIC */
+  switch (gda_vendor) {
 #if defined(GDA_IONIC)
-  gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE;
-  gda_op_rdma_read  = IONIC_V2_OP_RDMA_READ;
-  gda_op_atomic_fa  = IONIC_V2_OP_ATOMIC_FA;
-  gda_op_atomic_cs  = IONIC_V2_OP_ATOMIC_CS;
-#endif
-  if (gda_vendor == GDAVendor::BNXT) {
+  case GDAVendor::IONIC:
+    gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE;
+    gda_op_rdma_read  = IONIC_V2_OP_RDMA_READ;
+    gda_op_atomic_fa  = IONIC_V2_OP_ATOMIC_FA;
+    gda_op_atomic_cs  = IONIC_V2_OP_ATOMIC_CS;
+    break;
+#endif //defined(GDA_IONIC)
+#if defined(GDA_BNXT)
+  case GDAVendor::BNXT:
    gda_op_rdma_write = BNXT_RE_WR_OPCD_RDMA_WRITE;
    gda_op_rdma_read  = BNXT_RE_WR_OPCD_RDMA_READ;
    gda_op_atomic_fa  = BNXT_RE_WR_OPCD_ATOMIC_FA;
    gda_op_atomic_cs  = BNXT_RE_WR_OPCD_ATOMIC_CS;
-  } else if (gda_vendor == GDAVendor::MLX5) {
+    break;
+#endif //defined(GDA_BNXT)
+#if defined(GDA_MLX5)
+  case GDAVendor::MLX5:
    gda_op_rdma_write = MLX5_OPCODE_RDMA_WRITE;
    gda_op_rdma_read  = MLX5_OPCODE_RDMA_READ;
    gda_op_atomic_fa  = MLX5_OPCODE_ATOMIC_FA;
    gda_op_atomic_cs  = MLX5_OPCODE_ATOMIC_CS;
+    break;
+#endif //defined(GDA_MLX5)
+  default:
+    assert(false /* invalid nic provider */);
  }
  gda_vendor_ = gda_vendor;
 }
@@ -105,264 +114,6 @@ QueuePair::~QueuePair() {
 /******************************************************************************
 ************************ PROVIDER-SPECIFIC HELPERS ***************************
 *****************************************************************************/
-#if defined(GDA_IONIC)
-__device__ uint64_t QueuePair::get_same_qp_lane_mask() {
-  uint64_t lane_mask = get_active_lane_mask();
-  uintptr_t this_val = reinterpret_cast<uintptr_t>(this);
-
-  // exclude threads operating on a different qp from this thread lane mask
-  #pragma unroll
-  for (int i = 0; i < 64; ++i) {
-    uint64_t bit_i = 1ull << i;
-    if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) {
-      lane_mask &= ~bit_i;
-    }
-  }
-
-  return lane_mask;
-}
-
-__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) {
-  uint32_t my_sq_prod = 0;
-
-  // reserve space for wqes in sq
-  if (is_first_active_lane(activemask)) {
-    my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-  }
-  my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask));
-
-  // wait for that space to be available
-  ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask);
-
-  return my_sq_prod;
-}
-
-__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) {
-  uint32_t dbprod = my_sq_prod + num_wqes;
-
-  spin_lock_acquire_shared(&sq_lock, activemask);
-
-  if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) {
-    sq_dbprod = dbprod;
-
-    ionic_ring_doorbell(dbprod);
-  }
-
-  spin_lock_release_shared(&sq_lock, activemask);
-
-  return dbprod;
-}
-
-__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
-  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
-  uint32_t my_cq_pos = cq_pos + my_logical_lane_id;
-
-  /* Look at the cqe at the current position in the cq buffer */
-  struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask];
-
-  /* Determine expected color based on cq wrap count */
-  uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
-  uint32_t qtf_color_exp = qtf_color_bit;
-  if (my_cq_pos & (cq_mask + 1)) {
-    qtf_color_exp = 0;
-  }
-
-  /* Check if my cqe color == expected color */
-  uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
-  if ((qtf_be & qtf_color_bit) != qtf_color_exp) {
-    return;
-  }
-
-  uint32_t msn = swap_endian_val<uint32_t>(cqe->send.msg_msn);
-
-  /* Report if the completion indicates an error. */
-  if (!!(qtf_be & swap_endian_val<uint32_t>(IONIC_V1_CQE_ERROR))) {
-#if defined(DEBUG)
-    uint32_t qtf = swap_endian_val<uint32_t>(qtf_be);
-    uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT;
-    uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
-    uint32_t flag = qtf & 0xf;
-    uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
-    uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK;
-
-    printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n",
-        dev_name, qid, type, flag, status, msn, npg);
-#endif
-    /* No other way to signal an error, so just crash. */
-    abort();
-  }
-
-  /* Only proceed with the furthest ahead cqe to update the sq state */
-  uint64_t my_lane_mask = 1ull << __lane_id();
-  uint64_t lesser_lane_mask = my_lane_mask - 1;
-  if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) {
-    return;
-  }
-
-  /* update position in the cq */
-  cq_pos = my_cq_pos + 1;
-
-  /*
-   * Ring cq doorbell frequently enough to avoid cq full.
-   *
-   * NB: IONIC_CQ_GRACE is 100
-   */
-  if (((cq_pos - cq_dbpos) & cq_mask) >= 100) {
-    cq_dbpos = cq_pos;
-    __atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed?
-  }
-
-  sq_msn = msn;
-}
-
-__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) {
-  uint32_t greed = 10;
-
-  /* wait for sq_msn to catch up or pass cons. */
-  /* 0x800000 - sign bit for 24-bit fields     */
-  while ((sq_msn - cons) & 0x800000) {
-    if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) {
-      continue;
-    }
-
-    /* with lock acquired, this wave polls cqes until caught up */
-    while ((sq_msn - cons) & 0x800000) {
-      uint32_t old_sq_msn = sq_msn;
-
-      poll_wave_cqes(activemask);
-
-      if (!((sq_msn - cons) & 0x800000)) {
-        if (sq_msn == old_sq_msn) {
-          break;
-        }
-        if (!greed) {
-          break;
-        }
-        --greed;
-      }
-    }
-
-    spin_lock_release_shared(&cq_lock, activemask);
-    break;
-  }
-}
-#endif // GDA_IONIC
-
-#if defined(GDA_IONIC)
-__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) {
-  // TODO When threads write at once to the same address, not all writes reach the bus.
-  for (int i = 0; i < 64; ++i) {
-    if (__lane_id() == i) {
-      __threadfence();
-      __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST);
-    }
-  }
-  __threadfence();
-}
-#endif
-
-#if defined(GDA_MLX5)
-__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) {
-  swap_endian_store(const_cast<uint32_t*>(dbrec), (uint32_t)my_sq_counter);
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-
-  __hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
-  uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-  db_uint ^= 0x100;
-  __hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-}
-#endif // GDA_MLX5
-
-#if defined(GDA_IONIC)
-__device__ void QueuePair::ionic_quiet() {
-  ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod);
-}
-#endif
-
-#if defined(GDA_MLX5)
-__device__ void QueuePair::mlx5_quiet() {
-  constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE;
-  __shared__ uint64_t wqe_broadcast[BROADCAST_SIZE];
-  uint8_t wavefront_id = get_flat_block_id() / WF_SIZE;
-  wqe_broadcast[wavefront_id] = 0;
-
-  uint64_t activemask = get_active_lane_mask();
-  uint8_t num_active_lanes = get_active_lane_count(activemask);
-  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
-  bool is_leader{my_logical_lane_id == 0};
-  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
-
-  while (true) {
-    bool done{false};
-    uint64_t quiet_amount{0};
-    uint64_t wave_cq_consumer{0};
-    while (!done) {
-      uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-      uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-      uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-      if (!(posted - completed)) {
-        return;
-      }
-      int64_t quiet_val = posted - active;
-      if (quiet_val <= 0) {
-        continue;
-      }
-      quiet_amount = min(num_active_lanes, quiet_val);
-      if (is_leader) {
-        done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-        if (done) {
-          wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-        }
-      }
-      done = __shfl(done, leader_phys_lane_id);
-    }
-    wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id);
-    uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id;
-    uint64_t my_cq_index = my_cq_consumer % cq_cnt;
-
-    if (my_logical_lane_id < quiet_amount) {
-      volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index];
-      uint16_t be_wqe_counter{0};
-      uint8_t op_own{0};
-      uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1;
-      bool vote_failed{true};
-
-      while (vote_failed) {
-        op_own = *((volatile uint8_t*)&cqe_entry->op_own);
-        bool my_ownership_vote = (op_own & 1) == owner_bit;
-        bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID;
-        uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote);
-        vote_failed = __popcll(votes) < quiet_amount;
-        if (!vote_failed) {
-          be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter);
-        }
-      }
-
-      uint16_t wqe_counter;
-      swap_endian_store(const_cast<uint16_t*>(&wqe_counter), reinterpret_cast<uint16_t>(be_wqe_counter));
-      uint64_t wqe_id =  outstanding_wqes[wqe_counter];
-      __hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
-      uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit;
-      *((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits;
-      __atomic_signal_fence(__ATOMIC_SEQ_CST);
-    }
-    if (is_leader) {
-      uint64_t completed {0};
-      do {
-        completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-      } while (completed != wave_cq_consumer);
-
-      swap_endian_store(const_cast<uint32_t*>(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount));
-      __atomic_signal_fence(__ATOMIC_SEQ_CST);
-
-      uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id];
-      __hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-      __hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    }
-  }
-}
-#endif // GDA_MLX5
-
 __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
  switch (gda_vendor_) {
 #if defined(GDA_MLX5)
@@ -428,286 +179,6 @@ __device__ void QueuePair::quiet() {
  }
 }

-#if defined(GDA_IONIC)
-__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
-  uint64_t activemask = get_same_qp_lane_mask();
-  uint32_t num_wqes = get_active_lane_count(activemask);
-  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
-  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
-  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
-  struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
-  uint16_t wqe_flags = 0;
-
-  if (!(my_sq_pos & (sq_mask + 1))) {
-    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
-  }
-
-  if (is_last_active_lane(activemask)) {
-    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
-  }
-
-  // TODO why is this needed?
-  if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) {
-    size = 1;
-  }
-
-  wqe->base.wqe_idx = my_sq_pos;
-  wqe->base.op = opcode;
-  wqe->base.num_sge_key = size ? 1 : 0;
-  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
-
-  wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
-  wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
-  wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
-  wqe->common.length = swap_endian_val<uint32_t>(size);
-
-  if (size) {
-    if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
-      wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
-      wqe->base.num_sge_key = 0;
-      if (!laddr) {
-        // TODO why is this needed?
-        wqe->common.pld.data[0] = 1;
-      } else {
-        memcpy(wqe->common.pld.data, laddr, size);
-      }
-    } else {
-      wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(laddr));
-      wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
-      wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
-    }
-  }
-
-  __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
-
-  commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
-}
-#endif
-
-#if defined (GDA_MLX5)
-__device__ void QueuePair::mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
-  uint64_t activemask = get_active_lane_mask();
-  uint8_t num_active_lanes = get_active_lane_count(activemask);
-  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
-  bool is_leader{my_logical_lane_id == 0};
-  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
-  uint8_t num_wqes{num_active_lanes};
-  uint64_t wave_sq_counter{0};
-
-  if (is_leader) {
-    wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
-  }
-  wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
-  uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
-  uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
-
-  while (true) {
-    uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    int64_t num_active_sq_entries = db_touched - sunk;
-    if (num_active_sq_entries < 0) {
-      continue;
-    }
-    uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
-    uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
-    if (num_free_entries > num_entries_until_wave_last_entry) {
-      break;
-    }
-    mlx5_quiet();
-  }
-
-  outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
-
-  SegmentBuilder seg_build(my_sq_index, sq_buf);
-  seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0);
-  seg_build.update_raddr_seg(raddr, rkey);
-
-  if (size <= inline_threshold && opcode == gda_op_rdma_write) {
-    seg_build.update_inl_data_seg(laddr, size);
-  } else {
-    seg_build.update_data_seg(laddr, size, lkey);
-  }
-
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-
-  if (is_leader) {
-    uint64_t db_touched {0};
-    do {
-      db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    } while (db_touched != wave_sq_counter);
-
-    uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
-    uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
-    mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
-
-    __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-  }
-}
-#endif // GDA_MLX5
-
-#if defined(GDA_IONIC)
-__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
-                                                  int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
-  uint64_t activemask = get_same_qp_lane_mask();
-  uint32_t num_wqes = get_active_lane_count(activemask);
-  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
-  bool is_leader{my_logical_lane_id == 0};
-  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
-  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
-  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
-  struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
-  uint16_t wqe_flags = 0;
-  uint32_t cons;
-
-  uint64_t* wave_fetch_atomic{nullptr};
-  if (fetching) {
-    if (is_leader) {
-      auto res = fetching_atomic_freelist->pop_front();
-      while (!res.success) {
-        res = fetching_atomic_freelist->pop_front();
-      }
-      wave_fetch_atomic = res.value;
-    }
-    wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
-  }
-
-  if (!(my_sq_pos & (sq_mask + 1))) {
-    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
-  }
-
-  if (is_last_active_lane(activemask)) {
-    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
-  }
-
-  wqe->base.wqe_idx = my_sq_pos;
-  wqe->base.op = opcode;
-  wqe->base.num_sge_key = 1;
-  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
-
-  wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
-  wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
-  wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
-  wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
-  wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
-  wqe->atomic_v2.compare_high = swap_endian_val<uint32_t>(atomic_cmp >> 32);
-  wqe->atomic_v2.compare_low = swap_endian_val<uint32_t>(atomic_cmp);
-
-  if (fetching) {
-    wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(wave_fetch_atomic + my_logical_lane_id));
-    wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(fetching_atomic_lkey);
-  } else {
-    wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(nonfetching_atomic));
-    wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
-  }
-
-  __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
-
-  cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
-
-  uint64_t ret{0};
-  if (fetching) {
-    ionic_quiet_internal(activemask, cons);
-    ret = wave_fetch_atomic[my_logical_lane_id];
-    __atomic_signal_fence(__ATOMIC_SEQ_CST);
-    if (is_leader) {
-      fetching_atomic_freelist->push_back(wave_fetch_atomic);
-    }
-  }
-  return ret;
-}
-#endif
-
-#if defined(GDA_MLX5)
-__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
-                                                 int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
-  uint64_t activemask = get_active_lane_mask();
-  uint8_t num_active_lanes = get_active_lane_count(activemask);
-  uint8_t my_logical_lane_id = get_active_lane_num(activemask);
-  bool is_leader{my_logical_lane_id == 0};
-  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
-  uint8_t num_wqes{num_active_lanes};
-  uint64_t wave_sq_counter{0};
-
-  if (is_leader) {
-    wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-  }
-  wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
-  uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
-  uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
-
-  while (true) {
-    uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    int64_t num_active_sq_entries = db_touched - sunk;
-    if (num_active_sq_entries < 0) {
-      continue;
-    }
-    uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
-    uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
-    if (num_free_entries > num_entries_until_wave_last_entry) {
-      break;
-    }
-    mlx5_quiet();
-  }
-
-  uint64_t* wave_fetch_atomic{nullptr};
-  if (fetching) {
-    if (is_leader) {
-      uint64_t db_touched {0};
-      do {
-        db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-      } while (db_touched != wave_sq_counter);
-
-      auto res = fetching_atomic_freelist->pop_front();
-      while (!res.success) {
-        res = fetching_atomic_freelist->pop_front();
-      }
-      wave_fetch_atomic = res.value;
-    }
-    wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
-  }
-
-  outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
-
-  SegmentBuilder seg_build(my_sq_index, sq_buf);
-  seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0);
-  seg_build.update_raddr_seg(raddr, rkey);
-  seg_build.update_atomic_seg(atomic_data, atomic_cmp);
-  if (fetching) {
-    seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey);
-  } else {
-    seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey);
-  }
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-
-  if (is_leader) {
-    uint64_t db_touched {0};
-    do {
-      db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    } while (db_touched != wave_sq_counter);
-
-    uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
-    uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
-    mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
-
-    __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-    __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-  }
-
-  uint64_t ret{0};
-  if (fetching) {
-    mlx5_quiet();
-    ret = wave_fetch_atomic[my_logical_lane_id];
-    __atomic_signal_fence(__ATOMIC_SEQ_CST);
-    if (is_leader) {
-      fetching_atomic_freelist->push_back(wave_fetch_atomic);
-    }
-  }
-  return ret;
-}
-#endif // GDA_MLX5
-
 /******************************************************************************
 ****************************** SHMEM INTERFACE *******************************
 *****************************************************************************/
@@ -163,14 +163,17 @@ class QueuePair {
 #if defined(GDA_MLX5)
  __device__ uint64_t mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
  __device__ void mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
+  __device__ void mlx5_quiet();
 #endif
 #if defined(GDA_BNXT)
  __device__ uint64_t bnxt_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
  __device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
+  __device__ void bnxt_quiet();
 #endif
 #if defined(GDA_IONIC)
  __device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
  __device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
+  __device__ void ionic_quiet();
 #endif

  /**
@@ -189,65 +192,6 @@ class QueuePair {
  __device__ void ionic_ring_doorbell(uint32_t pos);
 #endif

-#ifdef GDA_IONIC
-  __device__ uint64_t get_same_qp_lane_mask();
-
-  /**
-   * @brief Reserve space in the sq to post this many wqes.
-   * @param my_tid my logical thread id.
-   * @param num_wqes number of sq wqes to reserve for this wave.
-   * @return position of my_tid=0's wqe.
-   */
-  __device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes);
-
-  /**
-   * @brief Ring the sq doorbell maintaining order between waves.
-   * @param last this is the last wqe posted in this wave.
-   * @param my_sq_prod position of my_tid=0's wqe.
-   * @param num_wqes number of sq wqes posted in this wave.
-   * @param wqe this thread's wqe.
-   * @return doorbell producer index.
-   */
-  __device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes);
-
-  /**
-   * @brief Helper method to poll the next completion queue entry.
-   */
-  __device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask);
-
-  /**
-   * @brief Helper method to drain completion queue entries.
-   * @param cons wait for sq_msn to catch up to this position.
-   */
-  __device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons);
-
-  uint64_t *cq_dbreg{nullptr};
-  uint64_t cq_dbval{0};
-  uint64_t cq_mask{0};
-  struct ionic_v1_cqe *ionic_cq_buf{nullptr};
-  uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
-  uint32_t cq_pos{0};
-  uint32_t cq_dbpos{0};
-
-  uint64_t *sq_dbreg{nullptr};
-  uint64_t sq_dbval{0};
-  uint64_t sq_mask{0};
-  struct ionic_v1_wqe *ionic_sq_buf{nullptr};
-  uint32_t sq_lock{SPIN_LOCK_UNLOCKED};
-  uint32_t sq_dbprod{0};
-  uint32_t sq_prod{0};
-  uint32_t sq_msn{0};
-#endif
-
-#if defined(GDA_MLX5)
-  __device__ void mlx5_quiet();
-#endif
-#if defined(GDA_BNXT)
-  __device__ void bnxt_quiet();
-#endif
-#if defined(GDA_IONIC)
-  __device__ void ionic_quiet();
-#endif
  int gda_vendor_{0};

  /* GDAVendor::BNXT START */
@@ -322,6 +266,58 @@ class QueuePair {

  /* GDAVendor::MLX5 END */

+  /* GDAVendor::IONIC START */
+
+  uint64_t *cq_dbreg{nullptr};
+  uint64_t cq_dbval{0};
+  uint64_t cq_mask{0};
+  struct ionic_v1_cqe *ionic_cq_buf{nullptr};
+  uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
+  uint32_t cq_pos{0};
+  uint32_t cq_dbpos{0};
+
+  uint64_t *sq_dbreg{nullptr};
+  uint64_t sq_dbval{0};
+  uint64_t sq_mask{0};
+  struct ionic_v1_wqe *ionic_sq_buf{nullptr};
+  uint32_t sq_lock{SPIN_LOCK_UNLOCKED};
+  uint32_t sq_dbprod{0};
+  uint32_t sq_prod{0};
+  uint32_t sq_msn{0};
+
+  __device__ uint64_t get_same_qp_lane_mask();
+
+  /**
+   * @brief Reserve space in the sq to post this many wqes.
+   * @param my_tid my logical thread id.
+   * @param num_wqes number of sq wqes to reserve for this wave.
+   * @return position of my_tid=0's wqe.
+   */
+  __device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes);
+
+  /**
+   * @brief Ring the sq doorbell maintaining order between waves.
+   * @param last this is the last wqe posted in this wave.
+   * @param my_sq_prod position of my_tid=0's wqe.
+   * @param num_wqes number of sq wqes posted in this wave.
+   * @param wqe this thread's wqe.
+   * @return doorbell producer index.
+   */
+  __device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes);
+
+  /**
+   * @brief Helper method to poll the next completion queue entry.
+   */
+  __device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask);
+
+  /**
+   * @brief Helper method to drain completion queue entries.
+   * @param cons wait for sq_msn to catch up to this position.
+   */
+  __device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons);
+
+  /* GDAVendor::IONIC END */
+
  uint32_t inline_threshold{0};

  char dev_name[24];