diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh index b1ff5c7cd0..9eaaa9abe7 100755 --- a/scripts/functional_tests/driver.sh +++ b/scripts/functional_tests/driver.sh @@ -488,11 +488,10 @@ TestGDA() { # ExecTest "g" 2 8 1 32 # ExecTest "g" 2 16 128 4 -#Implemented but known incorrect -# ExecTest "p" 2 1 1 128 -# ExecTest "p" 2 1 1024 2 -# ExecTest "p" 2 8 1 32 -# ExecTest "p" 2 16 128 4 + ExecTest "p" 2 1 1 128 + ExecTest "p" 2 1 1024 2 + ExecTest "p" 2 8 1 32 + ExecTest "p" 2 16 128 4 ################################ Non-Blocking ################################ @@ -608,9 +607,9 @@ TestGDA() { ############################################################################## ExecTest "init" 2 1 1 -# ExecTest "pingpong" 2 1 1 -# ExecTest "pingpong" 2 8 1 -# ExecTest "pingpong" 2 32 1 + ExecTest "pingpong" 2 1 1 + ExecTest "pingpong" 2 8 1 + ExecTest "pingpong" 2 32 1 # This test requires more contexts than workgroups export ROCSHMEM_MAX_NUM_CONTEXTS=1024 diff --git a/src/gda/backend_gda.cpp b/src/gda/backend_gda.cpp index 242e8005c3..8bff6aaf1c 100644 --- a/src/gda/backend_gda.cpp +++ b/src/gda/backend_gda.cpp @@ -1043,6 +1043,7 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]); gpu_qp->lkey = htobe32(heap_mr->lkey); gpu_qp->qp_num = qps[conn_num]->qp_num; + gpu_qp->inline_threshold = inline_threshold; // The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers void* gpu_ptr{nullptr}; rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id); @@ -1056,7 +1057,7 @@ void GDABackend::create_qps(int sq_length) { memset(&attr, 0, sizeof(struct ibv_qp_init_attr_ex)); attr.cap.max_send_wr = sq_length; attr.cap.max_send_sge = 1; - attr.cap.max_inline_data = 0; + attr.cap.max_inline_data = inline_threshold; #ifdef GDA_IONIC attr.cap.max_recv_sge = 1; // TODO allow zero sges in the driver #endif diff --git a/src/gda/backend_gda.hpp b/src/gda/backend_gda.hpp index db055c1a3c..c4492bbe1c 100644 --- a/src/gda/backend_gda.hpp +++ b/src/gda/backend_gda.hpp @@ -64,6 +64,7 @@ class GDABackend : public Backend { struct ibv_mr *heap_mr = nullptr; uint32_t sq_size = 1024; + uint32_t inline_threshold = 8; QueuePair *gpu_qps = nullptr; std::vector qps; std::vector cqs; diff --git a/src/gda/bnxt/backend_gda_bnxt.cpp b/src/gda/bnxt/backend_gda_bnxt.cpp index 10eeb0dd3b..4743ccb3fb 100644 --- a/src/gda/bnxt/backend_gda_bnxt.cpp +++ b/src/gda/bnxt/backend_gda_bnxt.cpp @@ -85,6 +85,9 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { /* Export Memory Keys */ gpu_qp->lkey = heap_mr->lkey; gpu_qp->rkey = heap_rkey[conn_num % num_pes]; + + /* Export Inline Threshold */ + gpu_qp->inline_threshold = inline_threshold; } void GDABackend::create_cqs(int cqe) { @@ -143,7 +146,7 @@ void GDABackend::create_qps(int sq_length) { ib_qp_attr.cap.max_recv_wr = 0; ib_qp_attr.cap.max_send_sge = 1; ib_qp_attr.cap.max_recv_sge = 0; - ib_qp_attr.cap.max_inline_data = 0; + ib_qp_attr.cap.max_inline_data = inline_threshold; ib_qp_attr.qp_type = IBV_QPT_RC; ib_qp_attr.sq_sig_all = 0; diff --git a/src/gda/bnxt/queue_pair_bnxt.cpp b/src/gda/bnxt/queue_pair_bnxt.cpp index a141e3340e..2eea3e45af 100644 --- a/src/gda/bnxt/queue_pair_bnxt.cpp +++ b/src/gda/bnxt/queue_pair_bnxt.cpp @@ -247,17 +247,26 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr uint32_t wqe_size; uint32_t wqe_type; uint32_t hdr_flags; - uint32_t rma_slots = 3; // (Three slots: hdr, rdma) + uint32_t inline_msg; + + uint32_t rma_slots = 3; // (Three slots: hdr, rdma, sge) + + inline_msg = length <= inline_threshold && + opcode == GDA_OP_RDMA_WRITE; hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0); rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1); sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2); /* Populate Header Segment */ - wqe_size = BNXT_RE_HDR_WS_MASK & rma_slots; + wqe_type = BNXT_RE_HDR_WT_MASK & opcode; + wqe_size = BNXT_RE_HDR_WS_MASK & rma_slots; hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK) & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED); - wqe_type = BNXT_RE_HDR_WT_MASK & opcode; + + if (inline_msg) { + hdr_flags |= ((uint32_t) BNXT_RE_WR_FLAGS_INLINE); + } hdr.rsv_ws_fl_wt = (wqe_size << BNXT_RE_HDR_WS_SHIFT) | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT) @@ -269,15 +278,22 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr rdma.rva = (uint64_t) raddr; rdma.rkey = rkey; - /* Populate SG Segment */ - sge.pa = (uint64_t) laddr; - sge.lkey = lkey; - sge.length = length; + if (!inline_msg) { + /* Populate SG Segment */ + sge.pa = (uint64_t) laddr; + sge.lkey = lkey; + sge.length = length; + } /* Write WQE to SQ */ memcpy(hdr_ptr, &hdr, sizeof(struct bnxt_re_bsqe)); memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma)); - memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge)); + + if (inline_msg) { + memcpy(sge_ptr, laddr, length); + } else { + memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge)); + } /* Populate MSN Table */ bnxt_re_fill_psns_for_msntbl(&sq, length); diff --git a/src/gda/context_gda_tmpl_device.hpp b/src/gda/context_gda_tmpl_device.hpp index 265afb232d..e28c5e0aa8 100644 --- a/src/gda/context_gda_tmpl_device.hpp +++ b/src/gda/context_gda_tmpl_device.hpp @@ -42,10 +42,7 @@ namespace rocshmem { *****************************************************************************/ template __device__ void GDAContext::p(T *dest, T value, int pe) { - printf("rocshmem::gda:p not implemented\n"); - abort(); - //TODO the following is incorrect because value is not ibv registered memory - //putmem_nbi(dest, &value, sizeof(T), pe); + putmem_nbi(dest, &value, sizeof(T), pe); } template diff --git a/src/gda/queue_pair.cpp b/src/gda/queue_pair.cpp index 397df577b9..e4ec342854 100644 --- a/src/gda/queue_pair.cpp +++ b/src/gda/queue_pair.cpp @@ -446,7 +446,13 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, SegmentBuilder seg_build(my_sq_index, sq_buf); seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0); seg_build.update_raddr_seg(raddr, rkey); - seg_build.update_data_seg(laddr, size, lkey); + + if (size <= inline_threshold && opcode == GDA_OP_RDMA_WRITE) { + seg_build.update_inl_data_seg(laddr, size); + } else { + seg_build.update_data_seg(laddr, size, lkey); + } + __atomic_signal_fence(__ATOMIC_SEQ_CST); if (is_leader) { diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp index df7e83ba5f..4356116341 100644 --- a/src/gda/queue_pair.hpp +++ b/src/gda/queue_pair.hpp @@ -234,8 +234,6 @@ class QueuePair { uint32_t sq_prod{0}; uint32_t sq_msn{0}; - uint32_t inline_threshold{0}; - #elif defined(GDA_BNXT) uint64_t *dbr; struct bnxt_device_cq cq; @@ -305,6 +303,8 @@ class QueuePair { #endif // GDA_IONIC + uint32_t inline_threshold{0}; + uint32_t qp_num{0}; uint32_t rkey{0}; uint32_t lkey{0}; diff --git a/src/gda/segment_builder.cpp b/src/gda/segment_builder.cpp index ca3033d103..f9f6c9046a 100644 --- a/src/gda/segment_builder.cpp +++ b/src/gda/segment_builder.cpp @@ -108,6 +108,14 @@ __device__ void SegmentBuilder::update_data_seg(uintptr_t *address, uint32_t len segp++; } +__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr, int32_t size) { + // size is masked with 0x3FF because only the first 10 bits of byte_count are valid + swap_endian_store(&segp->inl_data_seg.byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG)); + // + 1 because we start packing the segment with data after the byte_count parameter + memcpy(&segp->inl_data_seg + 1, laddr, size); + segp++; +} + __device__ void SegmentBuilder::update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp) { segp->atomic_seg = {0}; swap_endian_store(reinterpret_cast(&segp->atomic_seg.swap_add), atomic_data); diff --git a/src/gda/segment_builder.hpp b/src/gda/segment_builder.hpp index 130a9604f3..c5ce93aca6 100644 --- a/src/gda/segment_builder.hpp +++ b/src/gda/segment_builder.hpp @@ -32,58 +32,32 @@ namespace rocshmem { class SegmentBuilder { - public: - __device__ SegmentBuilder(uint64_t wqe_idx, void *base); + public: + __device__ SegmentBuilder(uint64_t wqe_idx, void *base); - /* - * struct mlx5_wqe_ctrl_seg { - * __be32 opmod_idx_opcode; - * __be32 qpn_ds; - * uint8_t signature; - * __be16 dci_stream_channel_id; - * uint8_t fm_ce_se; - * __be32 imm; - * } __attribute__((__packed__)) __attribute__((__aligned__(4))); - */ - __device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm); + __device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, + uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm); - /* - * struct mlx5_wqe_raddr_seg { - * __be64 raddr; - * __be32 rkey; - * __be32 reserved; - * }; - */ - __device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey); + __device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey); - /* - * struct mlx5_wqe_data_seg { - * __be32 byte_count; - * __be32 lkey; - * __be64 addr; - * }; - */ - __device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey); + __device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey); - /* - * struct mlx5_wqe_atomic_seg { - * __be64 swap_add; - * __be64 compare; - * }; - */ - __device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp); + __device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size); - private: - const int SEGMENTS_PER_WQE = 4; + __device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp); - union mlx5_segment { - mlx5_wqe_ctrl_seg ctrl_seg; - mlx5_wqe_raddr_seg raddr_seg; - mlx5_wqe_data_seg data_seg; - mlx5_wqe_atomic_seg atomic_seg; - }; + private: + const int SEGMENTS_PER_WQE = 4; - mlx5_segment *segp; + union mlx5_segment { + mlx5_wqe_ctrl_seg ctrl_seg; + mlx5_wqe_raddr_seg raddr_seg; + mlx5_wqe_data_seg data_seg; + mlx5_wqe_inl_data_seg inl_data_seg; + mlx5_wqe_atomic_seg atomic_seg; + }; + + mlx5_segment *segp; }; } // namespace rocshmem