From e8fc5e67c4907f35bb36b82ba74612fe05119778 Mon Sep 17 00:00:00 2001 From: Omri Mor Date: Fri, 19 Dec 2025 16:18:49 -0800 Subject: [PATCH] gda: fix incorrect casts from void* to uintptr_t (#369) --- src/gda/bnxt/queue_pair_bnxt.cpp | 22 ++++++++--------- src/gda/ionic/queue_pair_ionic.cpp | 16 ++++++------- src/gda/mlx5/queue_pair_mlx5.cpp | 18 +++++++------- src/gda/mlx5/segment_builder.cpp | 12 +++++----- src/gda/mlx5/segment_builder.hpp | 6 ++--- src/gda/queue_pair.cpp | 35 +++++++++++++-------------- src/gda/queue_pair.hpp | 38 +++++++++++++++--------------- 7 files changed, 73 insertions(+), 74 deletions(-) diff --git a/src/gda/bnxt/queue_pair_bnxt.cpp b/src/gda/bnxt/queue_pair_bnxt.cpp index 26ad09cfab..43f0dae8df 100644 --- a/src/gda/bnxt/queue_pair_bnxt.cpp +++ b/src/gda/bnxt/queue_pair_bnxt.cpp @@ -213,7 +213,7 @@ __device__ void QueuePair::bnxt_quiet_single() { poll_cq_until(sq.depth); } -__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr, int32_t length, uint8_t opcode) { +__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode) { struct bnxt_re_bsqe hdr; struct bnxt_re_rdma rdma; struct bnxt_re_sge sge; @@ -251,12 +251,12 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr hdr.lhdr.qkey_len = length; /* Populate RDMA Segment */ - rdma.rva = (uint64_t) raddr; + rdma.rva = raddr; rdma.rkey = rkey; if (!inline_msg) { /* Populate SG Segment */ - sge.pa = (uint64_t) laddr; + sge.pa = laddr; sge.lkey = lkey; sge.length = length; } @@ -266,7 +266,7 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma)); if (inline_msg) { - memcpy(sge_ptr, laddr, length); + memcpy(sge_ptr, reinterpret_cast(laddr), length); } else { memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge)); } @@ -278,7 +278,7 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT); } -__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { +__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) { uint64_t active_lane_mask; uint8_t active_lane_count; uint8_t active_lane_id; @@ -306,8 +306,8 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t * } } -__device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t *laddr, - uintptr_t *raddr, uint8_t opcode, +__device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t laddr, + uintptr_t raddr, uint8_t opcode, bool ring_db) { aquire_lock(&sq.lock); @@ -322,7 +322,7 @@ __device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t *l release_lock(&sq.lock); } -__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opcode, +__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { struct bnxt_re_bsqe hdr; @@ -354,7 +354,7 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opco | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT) | wqe_type; hdr.key_immd = rkey; - hdr.lhdr.rva = (uint64_t) raddr; + hdr.lhdr.rva = raddr; /* Populate AMO Segment */ amo.swp_dt = atomic_data; @@ -385,7 +385,7 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opco return atomic_idx; } -__device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcode, +__device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint64_t active_lane_mask; @@ -422,7 +422,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcod return 0; } -__device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode, +__device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint32_t atomic_idx = 0; diff --git a/src/gda/ionic/queue_pair_ionic.cpp b/src/gda/ionic/queue_pair_ionic.cpp index 9533ab21be..c827bdf42f 100644 --- a/src/gda/ionic/queue_pair_ionic.cpp +++ b/src/gda/ionic/queue_pair_ionic.cpp @@ -186,7 +186,7 @@ __device__ void QueuePair::ionic_quiet() { ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod); } -__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) { +__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) { uint64_t activemask = get_same_qp_lane_mask(); uint32_t my_logical_lane_id = get_active_lane_num(activemask); uint32_t num_wqes = 1; @@ -224,8 +224,8 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l wqe->base.num_sge_key = size ? 1 : 0; wqe->base.imm_data_key = swap_endian_val(0); - wqe->common.rdma.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); - wqe->common.rdma.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); + wqe->common.rdma.remote_va_high = swap_endian_val(raddr >> 32); + wqe->common.rdma.remote_va_low = swap_endian_val(raddr); wqe->common.rdma.remote_rkey = swap_endian_val(rkey); wqe->common.length = swap_endian_val(size); @@ -237,10 +237,10 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l // TODO why is this needed? wqe->common.pld.data[0] = 1; } else { - memcpy(wqe->common.pld.data, laddr, size); + memcpy(wqe->common.pld.data, reinterpret_cast(laddr), size); } } else { - wqe->common.pld.sgl[0].va = swap_endian_val(reinterpret_cast(laddr)); + wqe->common.pld.sgl[0].va = swap_endian_val(laddr); wqe->common.pld.sgl[0].len = swap_endian_val(size); wqe->common.pld.sgl[0].lkey = swap_endian_val(lkey); } @@ -251,7 +251,7 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); } -__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, +__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint64_t activemask = get_same_qp_lane_mask(); uint32_t num_wqes = get_active_lane_count(activemask); @@ -289,8 +289,8 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_ wqe->base.num_sge_key = 1; wqe->base.imm_data_key = swap_endian_val(0); - wqe->atomic_v2.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); - wqe->atomic_v2.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); + wqe->atomic_v2.remote_va_high = swap_endian_val(raddr >> 32); + wqe->atomic_v2.remote_va_low = swap_endian_val(raddr); wqe->atomic_v2.remote_rkey = swap_endian_val(rkey); wqe->atomic_v2.swap_add_high = swap_endian_val(atomic_data >> 32); wqe->atomic_v2.swap_add_low = swap_endian_val(atomic_data); diff --git a/src/gda/mlx5/queue_pair_mlx5.cpp b/src/gda/mlx5/queue_pair_mlx5.cpp index 54a34424b2..9a081f36e8 100644 --- a/src/gda/mlx5/queue_pair_mlx5.cpp +++ b/src/gda/mlx5/queue_pair_mlx5.cpp @@ -155,8 +155,8 @@ __device__ __forceinline__ void QueuePair::mlx5_wait_for_free_sq_slots( } __device__ __forceinline__ void QueuePair::mlx5_build_rma_wqe( - uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t *laddr, - uintptr_t *raddr, int32_t size, uint8_t opcode) { + uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t laddr, + uintptr_t raddr, int32_t size, uint8_t opcode) { outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; SegmentBuilder seg_build(my_sq_index, sq_buf); @@ -166,7 +166,7 @@ __device__ __forceinline__ void QueuePair::mlx5_build_rma_wqe( seg_build.update_raddr_seg(raddr, rkey); if (size <= inline_threshold && opcode == gda_op_rdma_write) { - seg_build.update_inl_data_seg(laddr, size); + seg_build.update_inl_data_seg(reinterpret_cast(laddr), size); } else { seg_build.update_data_seg(laddr, size, lkey); } @@ -199,8 +199,8 @@ __device__ __forceinline__ void QueuePair::mlx5_ring_doorbell( __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } -__device__ void QueuePair::mlx5_post_wqe_rma(int32_t size, uintptr_t *laddr, - uintptr_t *raddr, uint8_t opcode) { +__device__ void QueuePair::mlx5_post_wqe_rma(int32_t size, uintptr_t laddr, + uintptr_t raddr, uint8_t opcode) { uint64_t activemask = get_active_lane_mask(); uint8_t num_active_lanes = get_active_lane_count(activemask); uint8_t my_logical_lane_id = get_active_lane_num(activemask); @@ -255,7 +255,7 @@ QueuePair::mlx5_allocate_wave_fetching_atomic_buffer( } __device__ __forceinline__ void QueuePair::mlx5_build_amo_wqe( - uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t *raddr, + uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching, uint64_t *wave_fetch_atomic) { outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; @@ -267,14 +267,14 @@ __device__ __forceinline__ void QueuePair::mlx5_build_amo_wqe( seg_build.update_atomic_seg(atomic_data, atomic_cmp); if (fetching) { - seg_build.update_data_seg(wave_fetch_atomic, 8, fetching_atomic_lkey); + seg_build.update_data_seg(reinterpret_cast(wave_fetch_atomic), 8, fetching_atomic_lkey); } else { - seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey); + seg_build.update_data_seg(reinterpret_cast(nonfetching_atomic), 8, nonfetching_atomic_lkey); } } __device__ uint64_t QueuePair::mlx5_post_wqe_amo(int32_t size, - uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, + uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint64_t activemask = get_active_lane_mask(); uint8_t num_active_lanes = get_active_lane_count(activemask); diff --git a/src/gda/mlx5/segment_builder.cpp b/src/gda/mlx5/segment_builder.cpp index 497474f7a6..deb0963bb4 100644 --- a/src/gda/mlx5/segment_builder.cpp +++ b/src/gda/mlx5/segment_builder.cpp @@ -80,9 +80,9 @@ __device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uin segp++; } -__device__ void SegmentBuilder::update_raddr_seg(uintptr_t *raddr, uint32_t rkey) { +__device__ void SegmentBuilder::update_raddr_seg(uint64_t raddr, uint32_t rkey) { segp->raddr_seg = {0}; - swap_endian_store(reinterpret_cast(&segp->raddr_seg.raddr), reinterpret_cast(raddr)); + swap_endian_store(reinterpret_cast(&segp->raddr_seg.raddr), raddr); segp->raddr_seg.rkey = rkey; segp++; } @@ -100,15 +100,15 @@ __device__ void SegmentBuilder::update_raddr_seg(uintptr_t *raddr, uint32_t rkey * seg->addr = htobe64(address); * } */ -__device__ void SegmentBuilder::update_data_seg(uintptr_t *address, uint32_t length, uint32_t lkey) { +__device__ void SegmentBuilder::update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey) { segp->data_seg = {0}; - swap_endian_store(&segp->data_seg.byte_count, length); + swap_endian_store(&segp->data_seg.byte_count, size); segp->data_seg.lkey = lkey; - swap_endian_store(reinterpret_cast(&segp->data_seg.addr), reinterpret_cast(address)); + swap_endian_store(reinterpret_cast(&segp->data_seg.addr), laddr); segp++; } -__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr, int32_t size) { +__device__ void SegmentBuilder::update_inl_data_seg(const void* laddr, int32_t size) { // size is masked with 0x3FF because only the first 10 bits of byte_count are valid swap_endian_store(&segp->inl_data_seg.byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG)); // + 1 because we start packing the segment with data after the byte_count parameter diff --git a/src/gda/mlx5/segment_builder.hpp b/src/gda/mlx5/segment_builder.hpp index 51861cd0f2..5565118792 100644 --- a/src/gda/mlx5/segment_builder.hpp +++ b/src/gda/mlx5/segment_builder.hpp @@ -38,11 +38,11 @@ class SegmentBuilder { __device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm); - __device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey); + __device__ void update_raddr_seg(uint64_t raddr, uint32_t rkey); - __device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey); + __device__ void update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey); - __device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size); + __device__ void update_inl_data_seg(const void* laddr, int32_t size); __device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp); diff --git a/src/gda/queue_pair.cpp b/src/gda/queue_pair.cpp index 2e6d2b9e0c..7adf1ff41c 100644 --- a/src/gda/queue_pair.cpp +++ b/src/gda/queue_pair.cpp @@ -121,7 +121,7 @@ QueuePair::~QueuePair() { /****************************************************************************** ************************ PROVIDER-SPECIFIC HELPERS *************************** *****************************************************************************/ -__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) { +__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) { switch (gda_provider_) { #if defined(GDA_IONIC) case GDAProvider::IONIC: @@ -133,7 +133,7 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, } } -__device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) { +__device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) { if (cy == THREAD) { bool need_turn {true}; uint64_t turns = __ballot(need_turn); @@ -153,7 +153,7 @@ __device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t *la } } -__device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { +__device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) { switch (gda_provider_) { #if defined(GDA_MLX5) case GDAProvider::MLX5: @@ -170,7 +170,7 @@ __device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t *ladd } } -__device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db) { +__device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db) { switch (gda_provider_) { #if defined(GDA_BNXT) case GDAProvider::BNXT: @@ -183,7 +183,7 @@ __device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t *laddr, u } } -__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, +__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { switch (gda_provider_) { #if defined(GDA_MLX5) @@ -204,7 +204,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad } } -__device__ uint64_t QueuePair::post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode, +__device__ uint64_t QueuePair::post_wqe_amo_single(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { switch (gda_provider_) { @@ -264,46 +264,45 @@ __device__ void QueuePair::quiet_single() { ****************************** SHMEM INTERFACE ******************************* *****************************************************************************/ __device__ void QueuePair::put_nbi(void *dest, const void *source, size_t nelems, int pe, Collectivity cy) { - uintptr_t *src = reinterpret_cast(const_cast(source)); - uintptr_t *dst = reinterpret_cast(dest); + uintptr_t src = reinterpret_cast(source); + uintptr_t dst = reinterpret_cast(dest); post_wqe_rma(pe, nelems, src, dst, gda_op_rdma_write, cy); } __device__ void QueuePair::put_nbi_single(void *dest, const void *source, size_t nelems, bool ring_db) { - uintptr_t *src = reinterpret_cast(const_cast(source)); - uintptr_t *dst = reinterpret_cast(dest); + uintptr_t src = reinterpret_cast(source); + uintptr_t dst = reinterpret_cast(dest); post_wqe_rma_single(nelems, src, dst, gda_op_rdma_write, ring_db); } __device__ void QueuePair::get_nbi(void *dest, const void *source, size_t nelems, int pe, Collectivity cy) { - uintptr_t *src = reinterpret_cast(const_cast(source)); - uintptr_t *dst = reinterpret_cast(dest); + uintptr_t src = reinterpret_cast(source); + uintptr_t dst = reinterpret_cast(dest); post_wqe_rma(pe, nelems, dst, src, gda_op_rdma_read, cy); } __device__ int64_t QueuePair::atomic_cas(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) { - uintptr_t *dst = reinterpret_cast(dest); + uintptr_t dst = reinterpret_cast(dest); return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_cs, atomic_data, atomic_cmp, true); } __device__ int64_t QueuePair::atomic_cas_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) { - uintptr_t *dst = reinterpret_cast(dest); + uintptr_t dst = reinterpret_cast(dest); return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_cs, atomic_data, atomic_cmp, false); } __device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) { - uintptr_t *dst = reinterpret_cast(dest); + uintptr_t dst = reinterpret_cast(dest); return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_fa, atomic_data, atomic_cmp, true); } __device__ void QueuePair::atomic_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) { - uintptr_t *dst = reinterpret_cast(dest); + uintptr_t dst = reinterpret_cast(dest); post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_fa, atomic_data, atomic_cmp, false); } __device__ void QueuePair::atomic_nofetch_single(void *dest, int64_t value) { - const bool fetching = false; - uintptr_t *dst = static_cast(dest); + uintptr_t dst = reinterpret_cast(dest); post_wqe_amo_single(dst, gda_op_atomic_fa, value, 0, false); } diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp index 94496ce261..36b217c1f0 100644 --- a/src/gda/queue_pair.hpp +++ b/src/gda/queue_pair.hpp @@ -156,9 +156,9 @@ class QueuePair { * @param[in] atomic_cmp An atomic comparison operation to be performed. * @param[in] fetching True if the operation returns a value. */ - __device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); + __device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); - __device__ __attribute__((noinline)) uint64_t post_wqe_amo_single(uintptr_t *raddr, + __device__ __attribute__((noinline)) uint64_t post_wqe_amo_single(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, @@ -173,11 +173,11 @@ class QueuePair { * @param[in] raddr Remote address. * @param[in] opcode Operation to be performed. */ - __device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy); - __device__ __attribute__((noinline)) void post_wqe_rma_turn(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy); + __device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy); + __device__ __attribute__((noinline)) void post_wqe_rma_turn(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy); - __device__ __attribute__((noinline)) void post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db); - __device__ __attribute__((noinline)) void post_wqe_rma_mt(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); + __device__ __attribute__((noinline)) void post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db); + __device__ __attribute__((noinline)) void post_wqe_rma_mt(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode); #if defined(GDA_MLX5) __device__ __forceinline__ void @@ -189,11 +189,11 @@ class QueuePair { __device__ __forceinline__ void mlx5_build_rma_wqe(uint64_t my_sq_counter, uint64_t my_sq_index, - uintptr_t *laddr, uintptr_t *raddr, int32_t size, uint8_t opcode); + uintptr_t laddr, uintptr_t raddr, int32_t size, uint8_t opcode); __device__ __forceinline__ void mlx5_build_amo_wqe(uint64_t my_sq_counter, uint64_t my_sq_index, - uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, + uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching, uint64_t *wave_fetch_atomic); __device__ __forceinline__ uint64_t* @@ -204,12 +204,12 @@ class QueuePair { mlx5_ring_doorbell(uint64_t wave_sq_counter, uint8_t num_wqes); __device__ uint64_t - mlx5_post_wqe_amo(int32_t size, uintptr_t *raddr, uint8_t opcode, + mlx5_post_wqe_amo(int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); __device__ void - mlx5_post_wqe_rma(int32_t size, uintptr_t *laddr, - uintptr_t *raddr, uint8_t opcode); + mlx5_post_wqe_rma(int32_t size, uintptr_t laddr, + uintptr_t raddr, uint8_t opcode); __device__ void mlx5_quiet(); @@ -217,21 +217,21 @@ class QueuePair { #endif #if defined(GDA_BNXT) - __device__ void bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr, int32_t length, uint8_t opcode); - __device__ uint32_t bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching); + __device__ void bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode); + __device__ uint32_t bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching); - __device__ uint64_t bnxt_post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching); - __device__ uint64_t bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching); + __device__ uint64_t bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching); + __device__ uint64_t bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching); - __device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); + __device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode); - __device__ void bnxt_post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db); + __device__ void bnxt_post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db); __device__ void bnxt_quiet(); __device__ void bnxt_quiet_single(); #endif #if defined(GDA_IONIC) - __device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); - __device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy); + __device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); + __device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy); __device__ void ionic_quiet(); #endif