gda: fix incorrect casts from void* to uintptr_t (#369)
Этот коммит содержится в:
@@ -213,7 +213,7 @@ __device__ void QueuePair::bnxt_quiet_single() {
|
||||
poll_cq_until(sq.depth);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr, int32_t length, uint8_t opcode) {
|
||||
__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode) {
|
||||
struct bnxt_re_bsqe hdr;
|
||||
struct bnxt_re_rdma rdma;
|
||||
struct bnxt_re_sge sge;
|
||||
@@ -251,12 +251,12 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr
|
||||
hdr.lhdr.qkey_len = length;
|
||||
|
||||
/* Populate RDMA Segment */
|
||||
rdma.rva = (uint64_t) raddr;
|
||||
rdma.rva = raddr;
|
||||
rdma.rkey = rkey;
|
||||
|
||||
if (!inline_msg) {
|
||||
/* Populate SG Segment */
|
||||
sge.pa = (uint64_t) laddr;
|
||||
sge.pa = laddr;
|
||||
sge.lkey = lkey;
|
||||
sge.length = length;
|
||||
}
|
||||
@@ -266,7 +266,7 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr
|
||||
memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma));
|
||||
|
||||
if (inline_msg) {
|
||||
memcpy(sge_ptr, laddr, length);
|
||||
memcpy(sge_ptr, reinterpret_cast<const void*>(laddr), length);
|
||||
} else {
|
||||
memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge));
|
||||
}
|
||||
@@ -278,7 +278,7 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr
|
||||
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
|
||||
__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) {
|
||||
uint64_t active_lane_mask;
|
||||
uint8_t active_lane_count;
|
||||
uint8_t active_lane_id;
|
||||
@@ -306,8 +306,8 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t *laddr,
|
||||
uintptr_t *raddr, uint8_t opcode,
|
||||
__device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t laddr,
|
||||
uintptr_t raddr, uint8_t opcode,
|
||||
bool ring_db) {
|
||||
|
||||
aquire_lock(&sq.lock);
|
||||
@@ -322,7 +322,7 @@ __device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t *l
|
||||
release_lock(&sq.lock);
|
||||
}
|
||||
|
||||
__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opcode,
|
||||
__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp,
|
||||
bool fetching) {
|
||||
struct bnxt_re_bsqe hdr;
|
||||
@@ -354,7 +354,7 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opco
|
||||
| (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT)
|
||||
| wqe_type;
|
||||
hdr.key_immd = rkey;
|
||||
hdr.lhdr.rva = (uint64_t) raddr;
|
||||
hdr.lhdr.rva = raddr;
|
||||
|
||||
/* Populate AMO Segment */
|
||||
amo.swp_dt = atomic_data;
|
||||
@@ -385,7 +385,7 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opco
|
||||
return atomic_idx;
|
||||
}
|
||||
|
||||
__device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcode,
|
||||
__device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp,
|
||||
bool fetching) {
|
||||
uint64_t active_lane_mask;
|
||||
@@ -422,7 +422,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcod
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode,
|
||||
__device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp,
|
||||
bool fetching) {
|
||||
uint32_t atomic_idx = 0;
|
||||
|
||||
@@ -186,7 +186,7 @@ __device__ void QueuePair::ionic_quiet() {
|
||||
ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) {
|
||||
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) {
|
||||
uint64_t activemask = get_same_qp_lane_mask();
|
||||
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
|
||||
uint32_t num_wqes = 1;
|
||||
@@ -224,8 +224,8 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
|
||||
wqe->base.num_sge_key = size ? 1 : 0;
|
||||
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
|
||||
|
||||
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
|
||||
wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
|
||||
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(raddr >> 32);
|
||||
wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(raddr);
|
||||
wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
|
||||
wqe->common.length = swap_endian_val<uint32_t>(size);
|
||||
|
||||
@@ -237,10 +237,10 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
|
||||
// TODO why is this needed?
|
||||
wqe->common.pld.data[0] = 1;
|
||||
} else {
|
||||
memcpy(wqe->common.pld.data, laddr, size);
|
||||
memcpy(wqe->common.pld.data, reinterpret_cast<const void*>(laddr), size);
|
||||
}
|
||||
} else {
|
||||
wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(laddr));
|
||||
wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(laddr);
|
||||
wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
|
||||
wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
|
||||
}
|
||||
@@ -251,7 +251,7 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
|
||||
commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
|
||||
}
|
||||
|
||||
__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
|
||||
__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
|
||||
uint64_t activemask = get_same_qp_lane_mask();
|
||||
uint32_t num_wqes = get_active_lane_count(activemask);
|
||||
@@ -289,8 +289,8 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
|
||||
wqe->base.num_sge_key = 1;
|
||||
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
|
||||
|
||||
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
|
||||
wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
|
||||
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(raddr >> 32);
|
||||
wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(raddr);
|
||||
wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
|
||||
wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
|
||||
wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
|
||||
|
||||
@@ -155,8 +155,8 @@ __device__ __forceinline__ void QueuePair::mlx5_wait_for_free_sq_slots(
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void QueuePair::mlx5_build_rma_wqe(
|
||||
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t *laddr,
|
||||
uintptr_t *raddr, int32_t size, uint8_t opcode) {
|
||||
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t laddr,
|
||||
uintptr_t raddr, int32_t size, uint8_t opcode) {
|
||||
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
|
||||
|
||||
SegmentBuilder seg_build(my_sq_index, sq_buf);
|
||||
@@ -166,7 +166,7 @@ __device__ __forceinline__ void QueuePair::mlx5_build_rma_wqe(
|
||||
seg_build.update_raddr_seg(raddr, rkey);
|
||||
|
||||
if (size <= inline_threshold && opcode == gda_op_rdma_write) {
|
||||
seg_build.update_inl_data_seg(laddr, size);
|
||||
seg_build.update_inl_data_seg(reinterpret_cast<const void*>(laddr), size);
|
||||
} else {
|
||||
seg_build.update_data_seg(laddr, size, lkey);
|
||||
}
|
||||
@@ -199,8 +199,8 @@ __device__ __forceinline__ void QueuePair::mlx5_ring_doorbell(
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::mlx5_post_wqe_rma(int32_t size, uintptr_t *laddr,
|
||||
uintptr_t *raddr, uint8_t opcode) {
|
||||
__device__ void QueuePair::mlx5_post_wqe_rma(int32_t size, uintptr_t laddr,
|
||||
uintptr_t raddr, uint8_t opcode) {
|
||||
uint64_t activemask = get_active_lane_mask();
|
||||
uint8_t num_active_lanes = get_active_lane_count(activemask);
|
||||
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
|
||||
@@ -255,7 +255,7 @@ QueuePair::mlx5_allocate_wave_fetching_atomic_buffer(
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void QueuePair::mlx5_build_amo_wqe(
|
||||
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t *raddr,
|
||||
uint64_t my_sq_counter, uint64_t my_sq_index, uintptr_t raddr,
|
||||
uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching,
|
||||
uint64_t *wave_fetch_atomic) {
|
||||
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
|
||||
@@ -267,14 +267,14 @@ __device__ __forceinline__ void QueuePair::mlx5_build_amo_wqe(
|
||||
seg_build.update_atomic_seg(atomic_data, atomic_cmp);
|
||||
|
||||
if (fetching) {
|
||||
seg_build.update_data_seg(wave_fetch_atomic, 8, fetching_atomic_lkey);
|
||||
seg_build.update_data_seg(reinterpret_cast<uintptr_t>(wave_fetch_atomic), 8, fetching_atomic_lkey);
|
||||
} else {
|
||||
seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey);
|
||||
seg_build.update_data_seg(reinterpret_cast<uintptr_t>(nonfetching_atomic), 8, nonfetching_atomic_lkey);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int32_t size,
|
||||
uintptr_t *raddr, uint8_t opcode, int64_t atomic_data,
|
||||
uintptr_t raddr, uint8_t opcode, int64_t atomic_data,
|
||||
int64_t atomic_cmp, bool fetching) {
|
||||
uint64_t activemask = get_active_lane_mask();
|
||||
uint8_t num_active_lanes = get_active_lane_count(activemask);
|
||||
|
||||
@@ -80,9 +80,9 @@ __device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uin
|
||||
segp++;
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_raddr_seg(uintptr_t *raddr, uint32_t rkey) {
|
||||
__device__ void SegmentBuilder::update_raddr_seg(uint64_t raddr, uint32_t rkey) {
|
||||
segp->raddr_seg = {0};
|
||||
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->raddr_seg.raddr), reinterpret_cast<uint64_t>(raddr));
|
||||
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->raddr_seg.raddr), raddr);
|
||||
segp->raddr_seg.rkey = rkey;
|
||||
segp++;
|
||||
}
|
||||
@@ -100,15 +100,15 @@ __device__ void SegmentBuilder::update_raddr_seg(uintptr_t *raddr, uint32_t rkey
|
||||
* seg->addr = htobe64(address);
|
||||
* }
|
||||
*/
|
||||
__device__ void SegmentBuilder::update_data_seg(uintptr_t *address, uint32_t length, uint32_t lkey) {
|
||||
__device__ void SegmentBuilder::update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey) {
|
||||
segp->data_seg = {0};
|
||||
swap_endian_store(&segp->data_seg.byte_count, length);
|
||||
swap_endian_store(&segp->data_seg.byte_count, size);
|
||||
segp->data_seg.lkey = lkey;
|
||||
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->data_seg.addr), reinterpret_cast<uint64_t>(address));
|
||||
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->data_seg.addr), laddr);
|
||||
segp++;
|
||||
}
|
||||
|
||||
__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr, int32_t size) {
|
||||
__device__ void SegmentBuilder::update_inl_data_seg(const void* laddr, int32_t size) {
|
||||
// size is masked with 0x3FF because only the first 10 bits of byte_count are valid
|
||||
swap_endian_store(&segp->inl_data_seg.byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG));
|
||||
// + 1 because we start packing the segment with data after the byte_count parameter
|
||||
|
||||
@@ -38,11 +38,11 @@ class SegmentBuilder {
|
||||
__device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num,
|
||||
uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm);
|
||||
|
||||
__device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey);
|
||||
__device__ void update_raddr_seg(uint64_t raddr, uint32_t rkey);
|
||||
|
||||
__device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey);
|
||||
__device__ void update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey);
|
||||
|
||||
__device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size);
|
||||
__device__ void update_inl_data_seg(const void* laddr, int32_t size);
|
||||
|
||||
__device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp);
|
||||
|
||||
|
||||
@@ -121,7 +121,7 @@ QueuePair::~QueuePair() {
|
||||
/******************************************************************************
|
||||
************************ PROVIDER-SPECIFIC HELPERS ***************************
|
||||
*****************************************************************************/
|
||||
__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) {
|
||||
__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) {
|
||||
switch (gda_provider_) {
|
||||
#if defined(GDA_IONIC)
|
||||
case GDAProvider::IONIC:
|
||||
@@ -133,7 +133,7 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr,
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) {
|
||||
__device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) {
|
||||
if (cy == THREAD) {
|
||||
bool need_turn {true};
|
||||
uint64_t turns = __ballot(need_turn);
|
||||
@@ -153,7 +153,7 @@ __device__ void QueuePair::post_wqe_rma_turn(int pe, int32_t size, uintptr_t *la
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
|
||||
__device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) {
|
||||
switch (gda_provider_) {
|
||||
#if defined(GDA_MLX5)
|
||||
case GDAProvider::MLX5:
|
||||
@@ -170,7 +170,7 @@ __device__ void QueuePair::post_wqe_rma_mt(int pe, int32_t size, uintptr_t *ladd
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db) {
|
||||
__device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db) {
|
||||
switch (gda_provider_) {
|
||||
#if defined(GDA_BNXT)
|
||||
case GDAProvider::BNXT:
|
||||
@@ -183,7 +183,7 @@ __device__ void QueuePair::post_wqe_rma_single(int32_t size, uintptr_t *laddr, u
|
||||
}
|
||||
}
|
||||
|
||||
__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
|
||||
__device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
|
||||
switch (gda_provider_) {
|
||||
#if defined(GDA_MLX5)
|
||||
@@ -204,7 +204,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t size, uintptr_t *rad
|
||||
}
|
||||
}
|
||||
|
||||
__device__ uint64_t QueuePair::post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode,
|
||||
__device__ uint64_t QueuePair::post_wqe_amo_single(uintptr_t raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp,
|
||||
bool fetching) {
|
||||
switch (gda_provider_) {
|
||||
@@ -264,46 +264,45 @@ __device__ void QueuePair::quiet_single() {
|
||||
****************************** SHMEM INTERFACE *******************************
|
||||
*****************************************************************************/
|
||||
__device__ void QueuePair::put_nbi(void *dest, const void *source, size_t nelems, int pe, Collectivity cy) {
|
||||
uintptr_t *src = reinterpret_cast<uintptr_t*>(const_cast<void*>(source));
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
|
||||
uintptr_t src = reinterpret_cast<uintptr_t>(source);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
post_wqe_rma(pe, nelems, src, dst, gda_op_rdma_write, cy);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::put_nbi_single(void *dest, const void *source, size_t nelems, bool ring_db) {
|
||||
uintptr_t *src = reinterpret_cast<uintptr_t*>(const_cast<void*>(source));
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
|
||||
uintptr_t src = reinterpret_cast<uintptr_t>(source);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
post_wqe_rma_single(nelems, src, dst, gda_op_rdma_write, ring_db);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::get_nbi(void *dest, const void *source, size_t nelems, int pe, Collectivity cy) {
|
||||
uintptr_t *src = reinterpret_cast<uintptr_t*>(const_cast<void*>(source));
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
|
||||
uintptr_t src = reinterpret_cast<uintptr_t>(source);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
post_wqe_rma(pe, nelems, dst, src, gda_op_rdma_read, cy);
|
||||
}
|
||||
|
||||
__device__ int64_t QueuePair::atomic_cas(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_cs, atomic_data, atomic_cmp, true);
|
||||
}
|
||||
|
||||
__device__ int64_t QueuePair::atomic_cas_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_cs, atomic_data, atomic_cmp, false);
|
||||
}
|
||||
|
||||
__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
return post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_fa, atomic_data, atomic_cmp, true);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::atomic_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe) {
|
||||
uintptr_t *dst = reinterpret_cast<uintptr_t*>(dest);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
post_wqe_amo(pe, sizeof(int64_t), dst, gda_op_atomic_fa, atomic_data, atomic_cmp, false);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::atomic_nofetch_single(void *dest, int64_t value) {
|
||||
const bool fetching = false;
|
||||
uintptr_t *dst = static_cast<uintptr_t*>(dest);
|
||||
uintptr_t dst = reinterpret_cast<uintptr_t>(dest);
|
||||
post_wqe_amo_single(dst, gda_op_atomic_fa, value, 0, false);
|
||||
}
|
||||
|
||||
|
||||
@@ -156,9 +156,9 @@ class QueuePair {
|
||||
* @param[in] atomic_cmp An atomic comparison operation to be performed.
|
||||
* @param[in] fetching True if the operation returns a value.
|
||||
*/
|
||||
__device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
|
||||
__device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
|
||||
|
||||
__device__ __attribute__((noinline)) uint64_t post_wqe_amo_single(uintptr_t *raddr,
|
||||
__device__ __attribute__((noinline)) uint64_t post_wqe_amo_single(uintptr_t raddr,
|
||||
uint8_t opcode,
|
||||
int64_t atomic_data,
|
||||
int64_t atomic_cmp,
|
||||
@@ -173,11 +173,11 @@ class QueuePair {
|
||||
* @param[in] raddr Remote address.
|
||||
* @param[in] opcode Operation to be performed.
|
||||
*/
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy);
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma_turn(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy);
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy);
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma_turn(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy);
|
||||
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db);
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma_mt(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db);
|
||||
__device__ __attribute__((noinline)) void post_wqe_rma_mt(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode);
|
||||
|
||||
#if defined(GDA_MLX5)
|
||||
__device__ __forceinline__ void
|
||||
@@ -189,11 +189,11 @@ class QueuePair {
|
||||
|
||||
__device__ __forceinline__ void
|
||||
mlx5_build_rma_wqe(uint64_t my_sq_counter, uint64_t my_sq_index,
|
||||
uintptr_t *laddr, uintptr_t *raddr, int32_t size, uint8_t opcode);
|
||||
uintptr_t laddr, uintptr_t raddr, int32_t size, uint8_t opcode);
|
||||
|
||||
__device__ __forceinline__ void
|
||||
mlx5_build_amo_wqe(uint64_t my_sq_counter, uint64_t my_sq_index,
|
||||
uintptr_t *raddr, uint8_t opcode, int64_t atomic_data,
|
||||
uintptr_t raddr, uint8_t opcode, int64_t atomic_data,
|
||||
int64_t atomic_cmp, bool fetching, uint64_t *wave_fetch_atomic);
|
||||
|
||||
__device__ __forceinline__ uint64_t*
|
||||
@@ -204,12 +204,12 @@ class QueuePair {
|
||||
mlx5_ring_doorbell(uint64_t wave_sq_counter, uint8_t num_wqes);
|
||||
|
||||
__device__ uint64_t
|
||||
mlx5_post_wqe_amo(int32_t size, uintptr_t *raddr, uint8_t opcode,
|
||||
mlx5_post_wqe_amo(int32_t size, uintptr_t raddr, uint8_t opcode,
|
||||
int64_t atomic_data, int64_t atomic_cmp, bool fetch);
|
||||
|
||||
__device__ void
|
||||
mlx5_post_wqe_rma(int32_t size, uintptr_t *laddr,
|
||||
uintptr_t *raddr, uint8_t opcode);
|
||||
mlx5_post_wqe_rma(int32_t size, uintptr_t laddr,
|
||||
uintptr_t raddr, uint8_t opcode);
|
||||
|
||||
__device__ void
|
||||
mlx5_quiet();
|
||||
@@ -217,21 +217,21 @@ class QueuePair {
|
||||
#endif
|
||||
#if defined(GDA_BNXT)
|
||||
|
||||
__device__ void bnxt_write_rma_wqe(uintptr_t *raddr, uintptr_t *laddr, int32_t length, uint8_t opcode);
|
||||
__device__ uint32_t bnxt_write_amo_wqe(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
|
||||
__device__ void bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode);
|
||||
__device__ uint32_t bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
|
||||
|
||||
__device__ uint64_t bnxt_post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
|
||||
__device__ uint64_t bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
|
||||
__device__ uint64_t bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
|
||||
__device__ uint64_t bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching);
|
||||
|
||||
__device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
|
||||
__device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode);
|
||||
|
||||
__device__ void bnxt_post_wqe_rma_single(int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, bool ring_db);
|
||||
__device__ void bnxt_post_wqe_rma_single(int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, bool ring_db);
|
||||
__device__ void bnxt_quiet();
|
||||
__device__ void bnxt_quiet_single();
|
||||
#endif
|
||||
#if defined(GDA_IONIC)
|
||||
__device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
|
||||
__device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy);
|
||||
__device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
|
||||
__device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy);
|
||||
__device__ void ionic_quiet();
|
||||
#endif
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user