gda ionic: restore functionality of ionic gda in rocshmem (#269)

* Revamp findibverbs to find ionic again * gda ionic: rename ionic_sq_buf ionic_cq_buf Avoid duplicating member names used by mlx5 gda. Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> * gda: move spin lock to util.hpp Move spin lock out of ionic gda to util.hpp. Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> * gda ionic: assume latest fwabi changes There is no firmware abi compatibility in this ionic gda code yet, so assume we are using the latest firmware abi as of now. Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> * gda ionic: allow doorbell with incomplete wqes Use spin lock to ensure doorbell is only written with an increasing producer index. Ring the doorbell after this wave has initialized its wqes. Wqes of other waves might not be fully initialized, but firmware will not process them until the phase/color flag is updated in the respecitve wqes. Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> * gda ionic: poll cq for additional completions Keep polling the cq for more than just the minimum number of completions for this wave of threads to make progress, as long as the cq is not empty. A part of wave-optimized cq polling, at the expense of one wave polling additional completions, it was observed that nearly all other waves avoid taking the cq lock at all. Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> * gda: max_rd_atomic in rts transition In modify_qp(RTS), specify max_rd_atomic, not max_dest_rd_atomic. By not speicfying max_rd_atomic (rather, max_rd_atomic=zero), the local nic may get stuck transmitting the first read or atomic request. One read or atomic request is greater than the initiator depth of zero. Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> * gda ionic: allow specifying traffic class Allow specifying a traffic class. The network might have a specific traffic class configured as no-drop, for example. Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com> Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> * gda ionic: tweak uxdma assignment The ideal arrangement will have an equal number of QPs active on each uxdma pipeline. Pre-rebase, the better arrangement for rocshmem funcitonal test benchmarks was [0, 1], [1, 0], [0, 1], [1, 0], ... Now, following changes that add 'ROCSHMEM_GDA_ALTERNATE_QP_PORTS=1' by default, the better arrangement is [0, 1], [0, 1], [0, 1], [0, 1], ... Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> --------- Signed-off-by: Allen Hubbe <allen.hubbe@amd.com> Co-authored-by: Aurelien Bouteiller <abouteil@amd.com> Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>
2025-10-07 10:08:19 -04:00
commit c84bbc250b
@@ -43,29 +43,31 @@ find_library(IBVerbs_LIBRARY
 )

 if (GDA_IONIC)
-find_library(IBVerbs_PROVIDER_LIBRARY
+list(APPEND provider_vars IBVerbs_IONIC_LIBRARY IBVerbs_IONIC_INCLUDE_DIR)
+find_path(IBVerbs_IONIC_INCLUDE_DIR infiniband/ionic_dv.h
+  HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS}
+  PATH_SUFFIXES include
+)
+
+find_library(IBVerbs_IONIC_LIBRARY
  NAMES ionic libionic
  HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
  PATH_SUFFIXES lib lib64
 )

-find_package_handle_standard_args(IBVerbs DEFAULT_MSG
-  IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY
+add_library(IBVerbs::verbs_ionic UNKNOWN IMPORTED)
+set_target_properties(IBVerbs::verbs_ionic PROPERTIES
+  IMPORTED_LOCATION "${IBVerbs_IONIC_LIBRARY}"
+  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_IONIC_INCLUDE_DIR}"
 )
-mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY)
-
-add_library(IBVerbs::verbs_provider UNKNOWN IMPORTED)
-set_target_properties(IBVerbs::verbs_provider PROPERTIES
-  IMPORTED_LOCATION "${IBVerbs_PROVIDER_LIBRARY}"
-  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_PROVIDER_INCLUDE_DIR}"
-)
-target_link_libraries(IBVerbs::verbs IBVerbs::verbs_provider)
 endif()

 find_package_handle_standard_args(IBVerbs DEFAULT_MSG
-  IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR
+  IBVerbs_LIBRARY
+  IBVerbs_INCLUDE_DIR
+  ${provider_vars}
 )
-mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR)
+mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR ${provider_vars})

 if (IBVerbs_FOUND)
 add_library(IBVerbs::verbs UNKNOWN IMPORTED)
@@ -75,6 +77,8 @@ set_target_properties(IBVerbs::verbs PROPERTIES
  INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}"
 )

-target_link_libraries(IBVerbs::verbs INTERFACE)
+target_link_libraries(IBVerbs::verbs INTERFACE
+  $<TARGET_NAME_IF_EXISTS:IBVerbs::verbs_ionic>
+)

 endif()
@@ -63,6 +63,7 @@ namespace envvar {

  namespace gda {
    const var<bool> alternate_qp_ports("ALTERNATE_QP_PORTS", "", true);
+    const var<uint8_t> traffic_class("TRAFFIC_CLASS", "", 0);
  }  // namespace gda

  namespace _detail {
@@ -106,9 +106,12 @@ namespace envvar {
  };

  using var_types = unique_type_sequence_t<bool,
+                                           uint8_t,
+                                           int32_t,
+                                           uint32_t,
                                           size_t,
                                           int64_t,
-                                           uint32_t,
+                                           uint64_t,
                                           useconds_t,
                                           std::string,
                                           types::socket_family,
@@ -443,6 +446,7 @@ namespace envvar {
  namespace gda {
    template <typename T> using var = var<T, category::tag::GDA>;
    extern const var<bool> alternate_qp_ports;
+    extern const var<uint8_t> traffic_class;
  }  // namespace gda
 }  // namespace envvar
 }  // namespace rocshmem
@@ -841,6 +841,7 @@ void GDABackend::modify_qps_init_to_rtr() {
    attr.ah_attr.is_global      = 1;
    attr.ah_attr.grh.hop_limit  = 1;
    attr.ah_attr.sl             = 1;
+    attr.ah_attr.grh.traffic_class = envvar::gda::traffic_class;
  }

  attr_mask = IBV_QP_STATE
@@ -882,9 +883,9 @@ void GDABackend::modify_qps_rtr_to_rts() {
  attr.rnr_retry     = 7;

  if (gda_vendor == GDAVendor::IONIC) {
-    attr.max_dest_rd_atomic = 15;
+    attr.max_rd_atomic = 15;
  } else {
-    attr.max_dest_rd_atomic = 1;
+    attr.max_rd_atomic = 1;
  }

  attr_mask = IBV_QP_STATE
@@ -1053,7 +1054,7 @@ void GDABackend::create_cqs(int cqe) {

  for (int i = 0; i < qps.size(); i++) {
    if (gda_vendor == GDAVendor::IONIC) {
-      cq_attr.parent_domain = pd_uxdma[((i + 1) / 2) & 1];
+      cq_attr.parent_domain = pd_uxdma[i & 1];
    }

    cq_ex = ibv_create_cq_ex(context, &cq_attr);
@@ -1093,7 +1094,7 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
  gpu_qp->cq_dbval = dvcq.q.db_val;
  gpu_qp->cq_mask = dvcq.q.mask;

-  gpu_qp->cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
+  gpu_qp->ionic_cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);

  ionic_dv_qp dvqp;
  ionic_dv_get_qp(&dvqp, qps[conn_num]);
@@ -1101,7 +1102,12 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
  gpu_qp->sq_dbreg = gpu_db_sq;
  gpu_qp->sq_dbval = dvqp.sq.db_val;
  gpu_qp->sq_mask = dvqp.sq.mask;
-  gpu_qp->sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
+  gpu_qp->ionic_sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
+
+  strncpy(gpu_qp->dev_name,
+          qps[conn_num]->context->device->name,
+          sizeof(gpu_qp->dev_name));
+  gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0;

  gpu_qp->qp_num = qps[conn_num]->qp_num;
  gpu_qp->lkey = heap_mr->lkey;
@@ -1198,7 +1204,7 @@ void GDABackend::create_qps(int sq_length) {

  for (int i = 0; i < qps.size(); i++) {
    if (gda_vendor == GDAVendor::IONIC) {
-      attr.pd      = pd_uxdma[((i + 1) / 2) & 1];
+      attr.pd      = pd_uxdma[i & 1];
    }
    attr.send_cq = cqs[i];
    attr.recv_cq = cqs[i];
@@ -97,7 +97,7 @@ class GDABackend : public Backend {
  struct ibv_port_attr portinfo;
  union ibv_gid gid;
  int port = 1;
-  int gid_index;
+  int gid_index = 0;

  uint32_t *heap_rkey = nullptr;
  struct ibv_mr *heap_mr = nullptr;
@@ -32,8 +32,4 @@ extern "C" {
 }
 #endif

-#define SPIN_LOCK_INVALID  0xdead
-#define SPIN_LOCK_UNLOCKED 0x1234
-#define SPIN_LOCK_LOCKED   0xabcd
-
 #endif  //LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_
@@ -29,7 +29,6 @@
 #include "backend_gda.hpp"
 #include "endian.hpp"
 #include "segment_builder.hpp"
-#include "util.hpp"
 #include "constants.hpp"

 namespace rocshmem {
@@ -69,6 +68,7 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) {
  /* Set Correct opcodes for each NIC */
 #if defined(GDA_IONIC)
  gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE;
+  gda_op_rdma_read  = IONIC_V2_OP_RDMA_READ;
  gda_op_atomic_fa  = IONIC_V2_OP_ATOMIC_FA;
  gda_op_atomic_cs  = IONIC_V2_OP_ATOMIC_CS;
 #endif
@@ -122,25 +122,6 @@ __device__ uint64_t QueuePair::get_same_qp_lane_mask() {
  return lane_mask;
 }

-__device__ bool QueuePair::cq_lock_try_acquire(uint64_t activemask) {
-  uint32_t cq_lock_val = SPIN_LOCK_INVALID;
-
-  if (is_first_active_lane(activemask)) {
-    cq_lock_val = SPIN_LOCK_UNLOCKED;
-    __hip_atomic_compare_exchange_strong(&cq_lock, &cq_lock_val, SPIN_LOCK_LOCKED,
-                                         __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT);
-  }
-  cq_lock_val = __shfl(cq_lock_val, get_first_active_lane_id(activemask));
-
-  return (cq_lock_val == SPIN_LOCK_UNLOCKED);
-}
-
-__device__ void QueuePair::cq_lock_release(uint64_t activemask) {
-  if (is_first_active_lane(activemask)) {
-    __hip_atomic_store(&cq_lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
-  }
-}
-
 __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) {
  uint32_t my_sq_prod = 0;

@@ -156,22 +137,19 @@ __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes
  return my_sq_prod;
 }

-__device__ uint32_t QueuePair::commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe) {
+__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) {
  uint32_t dbprod = my_sq_prod + num_wqes;

-  if (last) {
-    // signal last wqe before the doorbell
-    wqe->base.flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
+  spin_lock_acquire_shared(&sq_lock, activemask);

-    while (__hip_atomic_load(&sq_dbprod, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT) != my_sq_prod) {
-      // spin
-    }
+  if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) {
+    sq_dbprod = dbprod;

    ionic_ring_doorbell(dbprod);
-
-    __hip_atomic_exchange(&sq_dbprod, dbprod, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
  }

+  spin_lock_release_shared(&sq_lock, activemask);
+
  return dbprod;
 }

@@ -180,7 +158,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
  uint32_t my_cq_pos = cq_pos + my_logical_lane_id;

  /* Look at the cqe at the current position in the cq buffer */
-  struct ionic_v1_cqe *cqe = &cq_buf[my_cq_pos & cq_mask];
+  struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask];

  /* Determine expected color based on cq wrap count */
  uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
@@ -189,18 +167,9 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
    qtf_color_exp = 0;
  }

-  /* Wait for at least one thread cqe color == expected color */
-  uint32_t qtf_be;
-  bool ready;
-  uint64_t ballot_ready;
-  do {
-    qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
-    ready = (qtf_be & qtf_color_bit) == qtf_color_exp;
-    ballot_ready = __ballot(ready);
-  } while (!ballot_ready);
-
-  /* Other threads saw a ready cqe, but not this thread */
-  if (!ready) {
+  /* Check if my cqe color == expected color */
+  uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
+  if ((qtf_be & qtf_color_bit) != qtf_color_exp) {
    return;
  }

@@ -214,10 +183,10 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
    uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
    uint32_t flag = qtf & 0xf;
    uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
-    uint64_t npg = swap_endian_val<uint64_t>(cqe->send.npg_wqe_id);
+    uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK;

-    printf("QUIET ERROR: qid %u type %u flag %#x status %u msn %u npg %lu\n",
-        qid, type, flag, status, msn, npg);
+    printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n",
+        dev_name, qid, type, flag, status, msn, npg);
 #endif
    /* No other way to signal an error, so just crash. */
    abort();
@@ -226,7 +195,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
  /* Only proceed with the furthest ahead cqe to update the sq state */
  uint64_t my_lane_mask = 1ull << __lane_id();
  uint64_t lesser_lane_mask = my_lane_mask - 1;
-  if (my_lane_mask != (ballot_ready & ~lesser_lane_mask)) {
+  if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) {
    return;
  }

@@ -247,19 +216,33 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
 }

 __device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) {
+  uint32_t greed = 10;
+
  /* wait for sq_msn to catch up or pass cons. */
  /* 0x800000 - sign bit for 24-bit fields     */
  while ((sq_msn - cons) & 0x800000) {
-    if (!cq_lock_try_acquire(activemask)) {
+    if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) {
      continue;
    }

    /* with lock acquired, this wave polls cqes until caught up */
    while ((sq_msn - cons) & 0x800000) {
+      uint32_t old_sq_msn = sq_msn;
+
      poll_wave_cqes(activemask);
+
+      if (!((sq_msn - cons) & 0x800000)) {
+        if (sq_msn == old_sq_msn) {
+          break;
+        }
+        if (!greed) {
+          break;
+        }
+        --greed;
+      }
    }

-    cq_lock_release(activemask);
+    spin_lock_release_shared(&cq_lock, activemask);
    break;
  }
 }
@@ -452,17 +435,25 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
  uint32_t my_logical_lane_id = get_active_lane_num(activemask);
  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
-  struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask];
+  struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
+  uint16_t wqe_flags = 0;
+
+  if (!(my_sq_pos & (sq_mask + 1))) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
+  }
+
+  if (is_last_active_lane(activemask)) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
+  }

  // TODO why is this needed?
  if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) {
    size = 1;
  }

-  wqe->base.wqe_id = my_sq_pos;
+  wqe->base.wqe_idx = my_sq_pos;
  wqe->base.op = opcode;
  wqe->base.num_sge_key = size ? 1 : 0;
-  wqe->base.flags = swap_endian_val<uint16_t>(0);
  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);

  wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
@@ -472,7 +463,7 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l

  if (size) {
    if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
-      wqe->base.flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
+      wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
      wqe->base.num_sge_key = 0;
      if (!laddr) {
        // TODO why is this needed?
@@ -487,7 +478,9 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
    }
  }

-  commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe);
+  __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
+
+  commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
 }
 #endif

@@ -563,7 +556,8 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
  const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
  uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
  uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
-  struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask];
+  struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
+  uint16_t wqe_flags = 0;
  uint32_t cons;

  uint64_t* wave_fetch_atomic{nullptr};
@@ -578,10 +572,17 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
    wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
  }

-  wqe->base.wqe_id = my_sq_pos;
+  if (!(my_sq_pos & (sq_mask + 1))) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
+  }
+
+  if (is_last_active_lane(activemask)) {
+    wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
+  }
+
+  wqe->base.wqe_idx = my_sq_pos;
  wqe->base.op = opcode;
  wqe->base.num_sge_key = 1;
-  wqe->base.flags = swap_endian_val<uint16_t>(0);
  wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);

  wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
@@ -600,7 +601,9 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
    wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
  }

-  cons = commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe);
+  __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
+
+  cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);

  uint64_t ret{0};
  if (fetching) {
@@ -37,6 +37,7 @@
 #include "rocshmem_config.h"
 #include "endian.h"
 #include "constants.hpp"
+#include "util.hpp"

 #include "gda/ionic/provider_gda_ionic.hpp"
 #include "gda/mlx5/provider_gda_mlx5.hpp"
@@ -191,9 +192,6 @@ class QueuePair {
 #ifdef GDA_IONIC
  __device__ uint64_t get_same_qp_lane_mask();

-  __device__ bool cq_lock_try_acquire(uint64_t active_lane_mask);
-  __device__ void cq_lock_release(uint64_t active_lane_mask);
-
  /**
   * @brief Reserve space in the sq to post this many wqes.
   * @param my_tid my logical thread id.
@@ -210,7 +208,7 @@ class QueuePair {
   * @param wqe this thread's wqe.
   * @return doorbell producer index.
   */
-  __device__ uint32_t commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe);
+  __device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes);

  /**
   * @brief Helper method to poll the next completion queue entry.
@@ -226,7 +224,7 @@ class QueuePair {
  uint64_t *cq_dbreg{nullptr};
  uint64_t cq_dbval{0};
  uint64_t cq_mask{0};
-  struct ionic_v1_cqe *cq_buf{nullptr};
+  struct ionic_v1_cqe *ionic_cq_buf{nullptr};
  uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
  uint32_t cq_pos{0};
  uint32_t cq_dbpos{0};
@@ -234,7 +232,8 @@ class QueuePair {
  uint64_t *sq_dbreg{nullptr};
  uint64_t sq_dbval{0};
  uint64_t sq_mask{0};
-  struct ionic_v1_wqe *sq_buf{nullptr};
+  struct ionic_v1_wqe *ionic_sq_buf{nullptr};
+  uint32_t sq_lock{SPIN_LOCK_UNLOCKED};
  uint32_t sq_dbprod{0};
  uint32_t sq_prod{0};
  uint32_t sq_msn{0};
@@ -325,6 +324,7 @@ class QueuePair {

  uint32_t inline_threshold{0};

+  char dev_name[24];
  uint32_t qp_num{0};
  uint32_t rkey{0};
  uint32_t lkey{0};
@@ -265,6 +265,76 @@ __device__ __forceinline__ bool is_last_active_lane() {
  return is_last_active_lane(get_active_lane_mask());
 }

+#define SPIN_LOCK_INVALID  0xdead
+#define SPIN_LOCK_UNLOCKED 0x1234
+#define SPIN_LOCK_LOCKED   0xabcd
+
+/*
+ * Each thread in wave tries to acquire a different lock.
+ */
+__device__ __forceinline__ bool spin_lock_try_acquire_unique(uint32_t *lock) {
+  uint32_t lock_val = SPIN_LOCK_UNLOCKED;
+
+  __hip_atomic_compare_exchange_strong(lock, &lock_val, SPIN_LOCK_LOCKED,
+                                       __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+                                       __HIP_MEMORY_SCOPE_AGENT);
+
+  return lock_val == SPIN_LOCK_UNLOCKED;
+}
+
+/*
+ * Each thread in wave acquires a different lock.
+ * (deadlock if locks are not different)
+ */
+__device__ __forceinline__ void spin_lock_acquire_unique(uint32_t *lock) {
+  while (!spin_lock_try_acquire_unique(lock)) {
+    // spin
+  }
+}
+
+/*
+ * Each thread in wave releases a different lock.
+ */
+__device__ __forceinline__ void spin_lock_release_unique(uint32_t *lock) {
+  __hip_atomic_store(lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE,
+                     __HIP_MEMORY_SCOPE_AGENT);
+}
+
+/*
+ * Threads in activemask together try to acquire the same lock.
+ */
+__device__ __forceinline__ bool spin_lock_try_acquire_shared(uint32_t *lock, uint64_t activemask) {
+  uint32_t lock_val = SPIN_LOCK_INVALID;
+
+  if (is_first_active_lane(activemask)) {
+    lock_val = SPIN_LOCK_UNLOCKED;
+    __hip_atomic_compare_exchange_strong(lock, &lock_val, SPIN_LOCK_LOCKED,
+                                         __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+                                         __HIP_MEMORY_SCOPE_AGENT);
+  }
+  lock_val = __shfl(lock_val, get_first_active_lane_id(activemask));
+
+  return lock_val == SPIN_LOCK_UNLOCKED;
+}
+
+/*
+ * Threads in activemask together acquire the same lock.
+ */
+__device__ __forceinline__ void spin_lock_acquire_shared(uint32_t *lock, uint64_t activemask) {
+  while (!spin_lock_try_acquire_shared(lock, activemask)) {
+    // spin
+  }
+}
+
+/*
+ * Threads in activemask together release the same lock.
+ */
+__device__ __forceinline__ void spin_lock_release_shared(uint32_t *lock, uint64_t activemask) {
+  if (is_first_active_lane(activemask)) {
+    __hip_atomic_store(lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE,
+                       __HIP_MEMORY_SCOPE_AGENT);
+  }
+}

 extern __constant__ int* print_lock;