gda: fix incorrect casts from void* to uintptr_t (#369)

Этот коммит содержится в:
Omri Mor
2025-12-19 16:18:49 -08:00
коммит произвёл GitHub
родитель cf6a53e81c
Коммит e8fc5e67c4
7 изменённых файлов: 73 добавлений и 74 удалений
+11 -11
Просмотреть файл
@@ -213,7 +213,7 @@ __device__ void QueuePair::bnxt_quiet_single() {
poll_cq_until(sq.depth);
}
__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr, int32_t length, uint8_t opcode) {
__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode) {
struct bnxt_re_bsqe hdr;
struct bnxt_re_rdma rdma;
struct bnxt_re_sge sge;
@@ -251,12 +251,12 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr
hdr.lhdr.qkey_len = length;
/* Populate RDMA Segment */
rdma.rva = (uint64_t) raddr;
rdma.rva = raddr;
rdma.rkey = rkey;
if (!inline_msg) {
/* Populate SG Segment */
sge.pa = (uint64_t) laddr;
sge.pa = laddr;
sge.lkey = lkey;
sge.length = length;
}
@@ -266,7 +266,7 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr
memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma));
if (inline_msg) {
memcpy(sge_ptr, laddr, length);
memcpy(sge_ptr, reinterpret_cast<const void*>(laddr), length);
} else {
memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge));
}
@@ -278,7 +278,7 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
}
__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) {
uint64_t active_lane_mask;
uint8_t active_lane_count;
uint8_t active_lane_id;
@@ -306,8 +306,8 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *
}
}
__device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t *laddr,
uintptr_t *raddr, uint8_t opcode,
__device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t laddr,
uintptr_t raddr, uint8_t opcode,
bool ring_db) {
aquire_lock(&sq.lock);
@@ -322,7 +322,7 @@ __device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t *l
release_lock(&sq.lock);
}
__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opcode,
__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp,
bool fetching) {
struct bnxt_re_bsqe hdr;
@@ -354,7 +354,7 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opco
| (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT)
| wqe_type;
hdr.key_immd = rkey;
hdr.lhdr.rva = (uint64_t) raddr;
hdr.lhdr.rva = raddr;
/* Populate AMO Segment */
amo.swp_dt = atomic_data;
@@ -385,7 +385,7 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opco
return atomic_idx;
}
__device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcode,
__device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp,
bool fetching) {
uint64_t active_lane_mask;
@@ -422,7 +422,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcod
return 0;
}
__device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode,
__device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp,
bool fetching) {
uint32_t atomic_idx = 0;
+8 -8
Просмотреть файл
@@ -186,7 +186,7 @@ __device__ void QueuePair::ionic_quiet() {
ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod);
}
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) {
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) {
uint64_t activemask = get_same_qp_lane_mask();
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
uint32_t num_wqes = 1;
@@ -224,8 +224,8 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
wqe->base.num_sge_key = size ? 1 : 0;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(raddr >> 32);
wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(raddr);
wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->common.length = swap_endian_val<uint32_t>(size);
@@ -237,10 +237,10 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
// TODO why is this needed?
wqe->common.pld.data[0] = 1;
} else {
memcpy(wqe->common.pld.data, laddr, size);
memcpy(wqe->common.pld.data, reinterpret_cast<const void*>(laddr), size);
}
} else {
wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(laddr));
wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(laddr);
wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
}
@@ -251,7 +251,7 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
}
__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
uint64_t activemask = get_same_qp_lane_mask();
uint32_t num_wqes = get_active_lane_count(activemask);
@@ -289,8 +289,8 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
wqe->base.num_sge_key = 1;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(raddr >> 32);
wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(raddr);
wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
+9 -9
Просмотреть файл
@@ -155,8 +155,8 @@ __device__ __forceinline__ void QueuePair::mlx5_wait_for_free_sq_slots(
}
__device__ __forceinline__ void QueuePair::mlx5_build_rma_wqe(
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t *laddr,
uintptr_t *raddr, int32_t size, uint8_t opcode) {
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t laddr,
uintptr_t raddr, int32_t size, uint8_t opcode) {
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
SegmentBuilder seg_build(my_sq_index, sq_buf);
@@ -166,7 +166,7 @@ __device__ __forceinline__ void QueuePair::mlx5_build_rma_wqe(
seg_build.update_raddr_seg(raddr, rkey);
if (size <= inline_threshold && opcode == gda_op_rdma_write) {
seg_build.update_inl_data_seg(laddr, size);
seg_build.update_inl_data_seg(reinterpret_cast<const void*>(laddr), size);
} else {
seg_build.update_data_seg(laddr, size, lkey);
}
@@ -199,8 +199,8 @@ __device__ __forceinline__ void QueuePair::mlx5_ring_doorbell(
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ void QueuePair::mlx5_post_wqe_rma(int32_t size, uintptr_t *laddr,
uintptr_t *raddr, uint8_t opcode) {
__device__ void QueuePair::mlx5_post_wqe_rma(int32_t size, uintptr_t laddr,
uintptr_t raddr, uint8_t opcode) {
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
@@ -255,7 +255,7 @@ QueuePair::mlx5_allocate_wave_fetching_atomic_buffer(
}
__device__ __forceinline__ void QueuePair::mlx5_build_amo_wqe(
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t *raddr,
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t raddr,
uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching,
uint64_t *wave_fetch_atomic) {
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
@@ -267,14 +267,14 @@ __device__ __forceinline__ void QueuePair::mlx5_build_amo_wqe(
seg_build.update_atomic_seg(atomic_data, atomic_cmp);
if (fetching) {
seg_build.update_data_seg(wave_fetch_atomic, 8, fetching_atomic_lkey);
seg_build.update_data_seg(reinterpret_cast<uintptr_t>(wave_fetch_atomic), 8, fetching_atomic_lkey);
} else {
seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey);
seg_build.update_data_seg(reinterpret_cast<uintptr_t>(nonfetching_atomic), 8, nonfetching_atomic_lkey);
}
}
__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int32_t size,
uintptr_t *raddr, uint8_t opcode, int64_t atomic_data,
uintptr_t raddr, uint8_t opcode, int64_t atomic_data,
int64_t atomic_cmp, bool fetching) {
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
+6 -6
Просмотреть файл
@@ -80,9 +80,9 @@ __device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uin
segp++;
}
__device__ void SegmentBuilder::update_raddr_seg(uintptr_t *raddr, uint32_t rkey) {
__device__ void SegmentBuilder::update_raddr_seg(uint64_t raddr, uint32_t rkey) {
segp->raddr_seg = {0};
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->raddr_seg.raddr), reinterpret_cast<uint64_t>(raddr));
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->raddr_seg.raddr), raddr);
segp->raddr_seg.rkey = rkey;
segp++;
}
@@ -100,15 +100,15 @@ __device__ void SegmentBuilder::update_raddr_seg(uintptr_t *raddr, uint32_t rkey
* seg->addr = htobe64(address);
* }
*/
__device__ void SegmentBuilder::update_data_seg(uintptr_t *address, uint32_t length, uint32_t lkey) {
__device__ void SegmentBuilder::update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey) {
segp->data_seg = {0};
swap_endian_store(&segp->data_seg.byte_count, length);
swap_endian_store(&segp->data_seg.byte_count, size);
segp->data_seg.lkey = lkey;
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->data_seg.addr), reinterpret_cast<uint64_t>(address));
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->data_seg.addr), laddr);
segp++;
}
__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr, int32_t size) {
__device__ void SegmentBuilder::update_inl_data_seg(const void* laddr, int32_t size) {
// size is masked with 0x3FF because only the first 10 bits of byte_count are valid
swap_endian_store(&segp->inl_data_seg.byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG));
// + 1 because we start packing the segment with data after the byte_count parameter
+3 -3
Просмотреть файл
@@ -38,11 +38,11 @@ class SegmentBuilder {
__device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num,
uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm);
__device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey);
__device__ void update_raddr_seg(uint64_t raddr, uint32_t rkey);
__device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey);
__device__ void update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey);
__device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size);
__device__ void update_inl_data_seg(const void* laddr, int32_t size);
__device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp);
+17 -18
Просмотреть файл
@@ -121,7 +121,7 @@ QueuePair::~QueuePair() {
/******************************************************************************
************************ PROVIDER-SPECIFIC HELPERS ***************************
*****************************************************************************/
__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) {
__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) {
switch (gda_provider_) {
#if defined(GDA_IONIC)
case GDAProvider::IONIC:
@@ -133,7 +133,7 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr,
}
}
__device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) {
__device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) {
if (cy == THREAD) {
bool need_turn {true};
uint64_t turns = __ballot(need_turn);
@@ -153,7 +153,7 @@ __device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t *la
}
}
__device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
__device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) {
switch (gda_provider_) {
#if defined(GDA_MLX5)
case GDAProvider::MLX5:
@@ -170,7 +170,7 @@ __device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t *ladd
}
}
__device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db) {
__device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db) {
switch (gda_provider_) {
#if defined(GDA_BNXT)
case GDAProvider::BNXT:
@@ -183,7 +183,7 @@ __device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t *laddr, u
}
}
__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
switch (gda_provider_) {
#if defined(GDA_MLX5)
@@ -204,7 +204,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad
}
}
__device__ uint64_t QueuePair::post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode,
__device__ uint64_t QueuePair::post_wqe_amo_single(uintptr_t raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp,
bool fetching) {
switch (gda_provider_) {
@@ -264,46 +264,45 @@ __device__ void QueuePair::quiet_single() {
****************************** SHMEM INTERFACE *******************************
*****************************************************************************/
__device__ void QueuePair::put_nbi(void *dest, const void *source, size_t nelems, int pe, Collectivity cy) {
uintptr_t *src = reinterpret_cast<uintptr_t*>(const_cast<void*>(source));
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
uintptr_t src = reinterpret_cast<uintptr_t>(source);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
post_wqe_rma(pe, nelems, src, dst, gda_op_rdma_write, cy);
}
__device__ void QueuePair::put_nbi_single(void *dest, const void *source, size_t nelems, bool ring_db) {
uintptr_t *src = reinterpret_cast<uintptr_t*>(const_cast<void*>(source));
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
uintptr_t src = reinterpret_cast<uintptr_t>(source);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
post_wqe_rma_single(nelems, src, dst, gda_op_rdma_write, ring_db);
}
__device__ void QueuePair::get_nbi(void *dest, const void *source, size_t nelems, int pe, Collectivity cy) {
uintptr_t *src = reinterpret_cast<uintptr_t*>(const_cast<void*>(source));
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
uintptr_t src = reinterpret_cast<uintptr_t>(source);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
post_wqe_rma(pe, nelems, dst, src, gda_op_rdma_read, cy);
}
__device__ int64_t QueuePair::atomic_cas(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_cs, atomic_data, atomic_cmp, true);
}
__device__ int64_t QueuePair::atomic_cas_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_cs, atomic_data, atomic_cmp, false);
}
__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_fa, atomic_data, atomic_cmp, true);
}
__device__ void QueuePair::atomic_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_fa, atomic_data, atomic_cmp, false);
}
__device__ void QueuePair::atomic_nofetch_single(void *dest, int64_t value) {
const bool fetching = false;
uintptr_t *dst = static_cast<uintptr_t*>(dest);
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
post_wqe_amo_single(dst, gda_op_atomic_fa, value, 0, false);
}
+19 -19
Просмотреть файл
@@ -156,9 +156,9 @@ class QueuePair {
* @param[in] atomic_cmp An atomic comparison operation to be performed.
* @param[in] fetching True if the operation returns a value.
*/
__device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ __attribute__((noinline)) uint64_t post_wqe_amo_single(uintptr_t *raddr,
__device__ __attribute__((noinline)) uint64_t post_wqe_amo_single(uintptr_t raddr,
uint8_t opcode,
int64_t atomic_data,
int64_t atomic_cmp,
@@ -173,11 +173,11 @@ class QueuePair {
* @param[in] raddr Remote address.
* @param[in] opcode Operation to be performed.
*/
__device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy);
__device__ __attribute__((noinline)) void post_wqe_rma_turn(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy);
__device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy);
__device__ __attribute__((noinline)) void post_wqe_rma_turn(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy);
__device__ __attribute__((noinline)) void post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db);
__device__ __attribute__((noinline)) void post_wqe_rma_mt(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
__device__ __attribute__((noinline)) void post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db);
__device__ __attribute__((noinline)) void post_wqe_rma_mt(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode);
#if defined(GDA_MLX5)
__device__ __forceinline__ void
@@ -189,11 +189,11 @@ class QueuePair {
__device__ __forceinline__ void
mlx5_build_rma_wqe(uint64_t my_sq_counter, uint64_t my_sq_index,
uintptr_t *laddr, uintptr_t *raddr, int32_t size, uint8_t opcode);
uintptr_t laddr, uintptr_t raddr, int32_t size, uint8_t opcode);
__device__ __forceinline__ void
mlx5_build_amo_wqe(uint64_t my_sq_counter, uint64_t my_sq_index,
uintptr_t *raddr, uint8_t opcode, int64_t atomic_data,
uintptr_t raddr, uint8_t opcode, int64_t atomic_data,
int64_t atomic_cmp, bool fetching, uint64_t *wave_fetch_atomic);
__device__ __forceinline__ uint64_t*
@@ -204,12 +204,12 @@ class QueuePair {
mlx5_ring_doorbell(uint64_t wave_sq_counter, uint8_t num_wqes);
__device__ uint64_t
mlx5_post_wqe_amo(int32_t size, uintptr_t *raddr, uint8_t opcode,
mlx5_post_wqe_amo(int32_t size, uintptr_t raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ void
mlx5_post_wqe_rma(int32_t size, uintptr_t *laddr,
uintptr_t *raddr, uint8_t opcode);
mlx5_post_wqe_rma(int32_t size, uintptr_t laddr,
uintptr_t raddr, uint8_t opcode);
__device__ void
mlx5_quiet();
@@ -217,21 +217,21 @@ class QueuePair {
#endif
#if defined(GDA_BNXT)
__device__ void bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr, int32_t length, uint8_t opcode);
__device__ uint32_t bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
__device__ void bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode);
__device__ uint32_t bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
__device__ uint64_t bnxt_post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
__device__ uint64_t bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
__device__ uint64_t bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
__device__ uint64_t bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
__device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
__device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode);
__device__ void bnxt_post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db);
__device__ void bnxt_post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db);
__device__ void bnxt_quiet();
__device__ void bnxt_quiet_single();
#endif
#if defined(GDA_IONIC)
__device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy);
__device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy);
__device__ void ionic_quiet();
#endif