From 16a4f102032e8f1fe56bda27d54834eecfd032c0 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 26 Sep 2025 00:27:41 -0400 Subject: [PATCH] Select device NIC vendor code at runtime (#263) * Runtime selection of device implementation for post_wqe, quiet, ring_doorbell * Normalize function naming --- src/gda/bnxt/queue_pair_bnxt.cpp | 8 +-- src/gda/queue_pair.cpp | 118 ++++++++++++++++++++++++------- src/gda/queue_pair.hpp | 38 ++++++++-- 3 files changed, 128 insertions(+), 36 deletions(-) diff --git a/src/gda/bnxt/queue_pair_bnxt.cpp b/src/gda/bnxt/queue_pair_bnxt.cpp index 44b8de616a..67690e5213 100644 --- a/src/gda/bnxt/queue_pair_bnxt.cpp +++ b/src/gda/bnxt/queue_pair_bnxt.cpp @@ -222,7 +222,7 @@ __device__ int QueuePair::poll_cq() { return 0; } -__device__ void QueuePair::quiet() { +__device__ void QueuePair::bnxt_quiet() { uint64_t active_lane_mask; uint8_t active_lane_id; @@ -238,7 +238,7 @@ __device__ void QueuePair::quiet() { } } -__device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { +__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { uint64_t active_lane_mask; uint8_t active_lane_count; uint8_t active_lane_id; @@ -331,8 +331,8 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr } } -__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t length, uintptr_t *raddr, uint8_t opcode, - int64_t atomic_data, int64_t atomic_cmp, bool fetching) { +__device__ uint64_t QueuePair::bnxt_post_wqe_amo(int pe, int32_t length, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint64_t active_lane_mask; uint8_t active_lane_count; uint8_t active_lane_id; diff --git a/src/gda/queue_pair.cpp b/src/gda/queue_pair.cpp index 9b10d22b61..0d60b3c544 100644 --- a/src/gda/queue_pair.cpp +++ b/src/gda/queue_pair.cpp @@ -67,7 +67,7 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) { } /* Set Correct opcodes for each NIC */ -#ifdef GDA_IONIC +#if defined(GDA_IONIC) gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE; gda_op_atomic_fa = IONIC_V2_OP_ATOMIC_FA; gda_op_atomic_cs = IONIC_V2_OP_ATOMIC_CS; @@ -83,6 +83,7 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) { gda_op_atomic_fa = MLX5_OPCODE_ATOMIC_FA; gda_op_atomic_cs = MLX5_OPCODE_ATOMIC_CS; } + gda_vendor_ = gda_vendor; } QueuePair::~QueuePair() { @@ -104,7 +105,7 @@ QueuePair::~QueuePair() { /****************************************************************************** ************************ PROVIDER-SPECIFIC HELPERS *************************** *****************************************************************************/ -#ifdef GDA_IONIC +#if defined(GDA_IONIC) __device__ uint64_t QueuePair::get_same_qp_lane_mask() { uint64_t lane_mask = get_active_lane_mask(); uintptr_t this_val = reinterpret_cast(this); @@ -150,7 +151,7 @@ __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask)); // wait for that space to be available - quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask); + ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask); return my_sq_prod; } @@ -166,7 +167,7 @@ __device__ uint32_t QueuePair::commit_sq(bool last, uint32_t my_sq_prod, uint32_ // spin } - ring_doorbell(dbprod); + ionic_ring_doorbell(dbprod); __hip_atomic_exchange(&sq_dbprod, dbprod, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); } @@ -207,7 +208,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { /* Report if the completion indicates an error. */ if (!!(qtf_be & swap_endian_val(IONIC_V1_CQE_ERROR))) { -#ifdef DEBUG +#if defined(DEBUG) uint32_t qtf = swap_endian_val(qtf_be); uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT; uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; @@ -245,7 +246,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { sq_msn = msn; } -__device__ void QueuePair::quiet_internal(uint64_t activemask, uint32_t cons) { +__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) { /* wait for sq_msn to catch up or pass cons. */ /* 0x800000 - sign bit for 24-bit fields */ while ((sq_msn - cons) & 0x800000) { @@ -264,8 +265,8 @@ __device__ void QueuePair::quiet_internal(uint64_t activemask, uint32_t cons) { } #endif // GDA_IONIC -#ifdef GDA_IONIC -__device__ void QueuePair::ring_doorbell(uint32_t pos) { +#if defined(GDA_IONIC) +__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) { // TODO When threads write at once to the same address, not all writes reach the bus. for (int i = 0; i < 64; ++i) { if (__lane_id() == i) { @@ -278,7 +279,7 @@ __device__ void QueuePair::ring_doorbell(uint32_t pos) { #endif #if defined(GDA_MLX5) -__device__ void QueuePair::ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) { +__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) { swap_endian_store(const_cast(dbrec), (uint32_t)my_sq_counter); __atomic_signal_fence(__ATOMIC_SEQ_CST); @@ -289,14 +290,14 @@ __device__ void QueuePair::ring_doorbell(uint64_t db_val, uint64_t my_sq_counter } #endif // GDA_MLX5 -#ifdef GDA_IONIC -__device__ void QueuePair::quiet() { - quiet_internal(get_same_qp_lane_mask(), sq_prod); +#if defined(GDA_IONIC) +__device__ void QueuePair::ionic_quiet() { + ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod); } #endif #if defined(GDA_MLX5) -__device__ void QueuePair::quiet() { +__device__ void QueuePair::mlx5_quiet() { constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE; __shared__ uint64_t wqe_broadcast[BROADCAST_SIZE]; uint8_t wavefront_id = get_flat_block_id() / WF_SIZE; @@ -379,8 +380,73 @@ __device__ void QueuePair::quiet() { } #endif // GDA_MLX5 -#ifdef GDA_IONIC __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { + switch (gda_vendor_) { +#if defined(GDA_MLX5) + case GDAVendor::MLX5: + mlx5_post_wqe_rma(pe, size, laddr, raddr, opcode); + return; +#endif +#if defined(GDA_BNXT) + case GDAVendor::BNXT: + bnxt_post_wqe_rma(pe, size, laddr, raddr, opcode); + return; +#endif +#if defined(GDA_IONIC) + case GDAVendor::IONIC: + ionic_post_wqe_rma(pe, size, laddr, raddr, opcode); + return; +#endif + default: + assert(false /* invalid nic provider */); + } +} + +__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { + switch (gda_vendor_) { +#if defined(GDA_MLX5) + case GDAVendor::MLX5: + return mlx5_post_wqe_amo(pe, size, raddr, opcode, atomic_data, atomic_cmp, fetching); +#endif +#if defined(GDA_BNXT) + case GDAVendor::BNXT: + return bnxt_post_wqe_amo(pe, size, raddr, opcode, atomic_data, atomic_cmp, fetching); +#endif +#if defined(GDA_IONIC) + case GDAVendor::IONIC: + return ionic_post_wqe_amo(pe, size, raddr, opcode, atomic_data, atomic_cmp, fetching); +#endif + default: + assert(false /* invalid nic provider */); + return 0; + } +} + +__device__ void QueuePair::quiet() { + switch (gda_vendor_) { +#if defined(GDA_MLX5) + case GDAVendor::MLX5: + mlx5_quiet(); + return; +#endif +#if defined(GDA_BNXT) + case GDAVendor::BNXT: + bnxt_quiet(); + return; +#endif +#if defined(GDA_IONIC) + case GDAVendor::IONIC: + ionic_quiet(); + return; +#endif + default: + assert(false /* invalid nic provider */); + } +} + +#if defined(GDA_IONIC) +__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { uint64_t activemask = get_same_qp_lane_mask(); uint32_t num_wqes = get_active_lane_count(activemask); uint32_t my_logical_lane_id = get_active_lane_num(activemask); @@ -426,7 +492,7 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, #endif #if defined (GDA_MLX5) -__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { +__device__ void QueuePair::mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { uint64_t activemask = get_active_lane_mask(); uint8_t num_active_lanes = get_active_lane_count(activemask); uint8_t my_logical_lane_id = get_active_lane_num(activemask); @@ -454,7 +520,7 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, if (num_free_entries > num_entries_until_wave_last_entry) { break; } - quiet(); + mlx5_quiet(); } outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; @@ -479,7 +545,7 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uint8_t *base_ptr = reinterpret_cast(sq_buf); uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); - ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); + mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -487,9 +553,9 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, } #endif // GDA_MLX5 -#ifdef GDA_IONIC -__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, - int64_t atomic_data, int64_t atomic_cmp, bool fetching) { +#if defined(GDA_IONIC) +__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint64_t activemask = get_same_qp_lane_mask(); uint32_t num_wqes = get_active_lane_count(activemask); uint32_t my_logical_lane_id = get_active_lane_num(activemask); @@ -538,7 +604,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad uint64_t ret{0}; if (fetching) { - quiet_internal(activemask, cons); + ionic_quiet_internal(activemask, cons); ret = wave_fetch_atomic[my_logical_lane_id]; __atomic_signal_fence(__ATOMIC_SEQ_CST); if (is_leader) { @@ -550,8 +616,8 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad #endif #if defined(GDA_MLX5) -__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, - int64_t atomic_data, int64_t atomic_cmp, bool fetching) { +__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint64_t activemask = get_active_lane_mask(); uint8_t num_active_lanes = get_active_lane_count(activemask); uint8_t my_logical_lane_id = get_active_lane_num(activemask); @@ -579,7 +645,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad if (num_free_entries > num_entries_until_wave_last_entry) { break; } - quiet(); + mlx5_quiet(); } uint64_t* wave_fetch_atomic{nullptr}; @@ -620,7 +686,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad uint8_t *base_ptr = reinterpret_cast(sq_buf); uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); - ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); + mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -628,7 +694,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad uint64_t ret{0}; if (fetching) { - quiet(); + mlx5_quiet(); ret = wave_fetch_atomic[my_logical_lane_id]; __atomic_signal_fence(__ATOMIC_SEQ_CST); if (is_leader) { diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp index da9ccd33e4..e0987ea133 100644 --- a/src/gda/queue_pair.hpp +++ b/src/gda/queue_pair.hpp @@ -159,18 +159,33 @@ class QueuePair { */ __device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); +#if defined(GDA_MLX5) + __device__ uint64_t mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); + __device__ void mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); +#endif +#if defined(GDA_BNXT) + __device__ uint64_t bnxt_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); + __device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); +#endif +#if defined(GDA_IONIC) + __device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); + __device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); +#endif + /** * @brief Helper method to ring the doorbell * * @param[in] db_val Doorbell value is written by method. */ -#if defined(GDA_IONIC) - __device__ void ring_doorbell(uint32_t pos); -#elif defined(GDA_BNXT) +#if defined(GDA_MLX5) + __device__ void mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter); +#endif +#if defined(GDA_BNXT) __device__ void ring_sq_doorbell(uint32_t slot_idx); __device__ void ring_cq_doorbell(uint32_t slot_idx); -#else - __device__ void ring_doorbell(uint64_t db_val, uint64_t my_sq_counter); +#endif +#if defined(GDA_IONIC) + __device__ void ionic_ring_doorbell(uint32_t pos); #endif #ifdef GDA_IONIC @@ -206,7 +221,7 @@ class QueuePair { * @brief Helper method to drain completion queue entries. * @param cons wait for sq_msn to catch up to this position. */ - __device__ __attribute__((noinline)) void quiet_internal(uint64_t active_lane_mask, uint32_t cons); + __device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons); uint64_t *cq_dbreg{nullptr}; uint64_t cq_dbval{0}; @@ -225,6 +240,17 @@ class QueuePair { uint32_t sq_msn{0}; #endif +#if defined(GDA_MLX5) + __device__ void mlx5_quiet(); +#endif +#if defined(GDA_BNXT) + __device__ void bnxt_quiet(); +#endif +#if defined(GDA_IONIC) + __device__ void ionic_quiet(); +#endif + int gda_vendor_{0}; + /* GDAVendor::BNXT START */ uint64_t *dbr; struct bnxt_device_cq cq;