Этот коммит содержится в:
Yiltan
2025-09-11 09:24:43 -04:00
коммит произвёл GitHub
родитель f677e5eb59
Коммит 2abeebbb6d
10 изменённых файлов: 75 добавлений и 70 удалений
+7 -8
Просмотреть файл
@@ -488,11 +488,10 @@ TestGDA() {
# ExecTest "g" 2 8 1 32
# ExecTest "g" 2 16 128 4
#Implemented but known incorrect
# ExecTest "p" 2 1 1 128
# ExecTest "p" 2 1 1024 2
# ExecTest "p" 2 8 1 32
# ExecTest "p" 2 16 128 4
ExecTest "p" 2 1 1 128
ExecTest "p" 2 1 1024 2
ExecTest "p" 2 8 1 32
ExecTest "p" 2 16 128 4
################################ Non-Blocking ################################
@@ -608,9 +607,9 @@ TestGDA() {
##############################################################################
ExecTest "init" 2 1 1
# ExecTest "pingpong" 2 1 1
# ExecTest "pingpong" 2 8 1
# ExecTest "pingpong" 2 32 1
ExecTest "pingpong" 2 1 1
ExecTest "pingpong" 2 8 1
ExecTest "pingpong" 2 32 1
# This test requires more contexts than workgroups
export ROCSHMEM_MAX_NUM_CONTEXTS=1024
+2 -1
Просмотреть файл
@@ -1043,6 +1043,7 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
gpu_qp->lkey = htobe32(heap_mr->lkey);
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->inline_threshold = inline_threshold;
// The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
void* gpu_ptr{nullptr};
rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
@@ -1056,7 +1057,7 @@ void GDABackend::create_qps(int sq_length) {
memset(&attr, 0, sizeof(struct ibv_qp_init_attr_ex));
attr.cap.max_send_wr = sq_length;
attr.cap.max_send_sge = 1;
attr.cap.max_inline_data = 0;
attr.cap.max_inline_data = inline_threshold;
#ifdef GDA_IONIC
attr.cap.max_recv_sge = 1; // TODO allow zero sges in the driver
#endif
+1
Просмотреть файл
@@ -64,6 +64,7 @@ class GDABackend : public Backend {
struct ibv_mr *heap_mr = nullptr;
uint32_t sq_size = 1024;
uint32_t inline_threshold = 8;
QueuePair *gpu_qps = nullptr;
std::vector<ibv_qp*> qps;
std::vector<ibv_cq*> cqs;
+4 -1
Просмотреть файл
@@ -85,6 +85,9 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
/* Export Memory Keys */
gpu_qp->lkey = heap_mr->lkey;
gpu_qp->rkey = heap_rkey[conn_num % num_pes];
/* Export Inline Threshold */
gpu_qp->inline_threshold = inline_threshold;
}
void GDABackend::create_cqs(int cqe) {
@@ -143,7 +146,7 @@ void GDABackend::create_qps(int sq_length) {
ib_qp_attr.cap.max_recv_wr = 0;
ib_qp_attr.cap.max_send_sge = 1;
ib_qp_attr.cap.max_recv_sge = 0;
ib_qp_attr.cap.max_inline_data = 0;
ib_qp_attr.cap.max_inline_data = inline_threshold;
ib_qp_attr.qp_type = IBV_QPT_RC;
ib_qp_attr.sq_sig_all = 0;
+24 -8
Просмотреть файл
@@ -247,17 +247,26 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr
uint32_t wqe_size;
uint32_t wqe_type;
uint32_t hdr_flags;
uint32_t rma_slots = 3; // (Three slots: hdr, rdma)
uint32_t inline_msg;
uint32_t rma_slots = 3; // (Three slots: hdr, rdma, sge)
inline_msg = length <= inline_threshold &&
opcode == GDA_OP_RDMA_WRITE;
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1);
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2);
/* Populate Header Segment */
wqe_size = BNXT_RE_HDR_WS_MASK & rma_slots;
wqe_type = BNXT_RE_HDR_WT_MASK & opcode;
wqe_size = BNXT_RE_HDR_WS_MASK & rma_slots;
hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK)
& ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED);
wqe_type = BNXT_RE_HDR_WT_MASK & opcode;
if (inline_msg) {
hdr_flags |= ((uint32_t) BNXT_RE_WR_FLAGS_INLINE);
}
hdr.rsv_ws_fl_wt = (wqe_size << BNXT_RE_HDR_WS_SHIFT)
| (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT)
@@ -269,15 +278,22 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr
rdma.rva = (uint64_t) raddr;
rdma.rkey = rkey;
/* Populate SG Segment */
sge.pa = (uint64_t) laddr;
sge.lkey = lkey;
sge.length = length;
if (!inline_msg) {
/* Populate SG Segment */
sge.pa = (uint64_t) laddr;
sge.lkey = lkey;
sge.length = length;
}
/* Write WQE to SQ */
memcpy(hdr_ptr, &hdr, sizeof(struct bnxt_re_bsqe));
memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma));
memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge));
if (inline_msg) {
memcpy(sge_ptr, laddr, length);
} else {
memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge));
}
/* Populate MSN Table */
bnxt_re_fill_psns_for_msntbl(&sq, length);
+1 -4
Просмотреть файл
@@ -42,10 +42,7 @@ namespace rocshmem {
*****************************************************************************/
template <typename T>
__device__ void GDAContext::p(T *dest, T value, int pe) {
printf("rocshmem::gda:p not implemented\n");
abort();
//TODO the following is incorrect because value is not ibv registered memory
//putmem_nbi(dest, &value, sizeof(T), pe);
putmem_nbi(dest, &value, sizeof(T), pe);
}
template <typename T>
+7 -1
Просмотреть файл
@@ -446,7 +446,13 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr,
SegmentBuilder seg_build(my_sq_index, sq_buf);
seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0);
seg_build.update_raddr_seg(raddr, rkey);
seg_build.update_data_seg(laddr, size, lkey);
if (size <= inline_threshold && opcode == GDA_OP_RDMA_WRITE) {
seg_build.update_inl_data_seg(laddr, size);
} else {
seg_build.update_data_seg(laddr, size, lkey);
}
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
+2 -2
Просмотреть файл
@@ -234,8 +234,6 @@ class QueuePair {
uint32_t sq_prod{0};
uint32_t sq_msn{0};
uint32_t inline_threshold{0};
#elif defined(GDA_BNXT)
uint64_t *dbr;
struct bnxt_device_cq cq;
@@ -305,6 +303,8 @@ class QueuePair {
#endif // GDA_IONIC
uint32_t inline_threshold{0};
uint32_t qp_num{0};
uint32_t rkey{0};
uint32_t lkey{0};
+8
Просмотреть файл
@@ -108,6 +108,14 @@ __device__ void SegmentBuilder::update_data_seg(uintptr_t *address, uint32_t len
segp++;
}
/*
 * Fill the current segment as an inline-data segment: write the byte_count
 * header, copy `size` bytes from `laddr` directly behind that header, and
 * advance the builder to the next segment.
 *
 * laddr: source of the payload bytes that are copied into the WQE.
 * size:  number of payload bytes to copy.
 *
 * NOTE(review): the builder advances by exactly one segment regardless of
 * `size`, so this presumably relies on callers keeping `size` small enough
 * for header + payload to fit in a single segment -- confirm against callers.
 */
__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr, int32_t size) {
  auto *inl_hdr = &segp->inl_data_seg;

  // Only the low 10 bits of byte_count hold the length, hence the 0x3FF mask;
  // MLX5_INLINE_SEG is OR-ed into the same field before the endian-swapped store.
  swap_endian_store(&inl_hdr->byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG));

  // Stepping the header pointer by one lands just past the header struct,
  // which is where the inline payload is packed.
  memcpy(inl_hdr + 1, laddr, size);

  ++segp;
}
__device__ void SegmentBuilder::update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp) {
segp->atomic_seg = {0};
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->atomic_seg.swap_add), atomic_data);
+19 -45
Просмотреть файл
@@ -32,58 +32,32 @@
namespace rocshmem {
class SegmentBuilder {
public:
__device__ SegmentBuilder(uint64_t wqe_idx, void *base);
public:
__device__ SegmentBuilder(uint64_t wqe_idx, void *base);
/*
* struct mlx5_wqe_ctrl_seg {
* __be32 opmod_idx_opcode;
* __be32 qpn_ds;
* uint8_t signature;
* __be16 dci_stream_channel_id;
* uint8_t fm_ce_se;
* __be32 imm;
* } __attribute__((__packed__)) __attribute__((__aligned__(4)));
*/
__device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm);
__device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num,
uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm);
/*
* struct mlx5_wqe_raddr_seg {
* __be64 raddr;
* __be32 rkey;
* __be32 reserved;
* };
*/
__device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey);
__device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey);
/*
* struct mlx5_wqe_data_seg {
* __be32 byte_count;
* __be32 lkey;
* __be64 addr;
* };
*/
__device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey);
__device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey);
/*
* struct mlx5_wqe_atomic_seg {
* __be64 swap_add;
* __be64 compare;
* };
*/
__device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp);
__device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size);
private:
const int SEGMENTS_PER_WQE = 4;
__device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp);
union mlx5_segment {
mlx5_wqe_ctrl_seg ctrl_seg;
mlx5_wqe_raddr_seg raddr_seg;
mlx5_wqe_data_seg data_seg;
mlx5_wqe_atomic_seg atomic_seg;
};
private:
const int SEGMENTS_PER_WQE = 4;
mlx5_segment *segp;
union mlx5_segment {
mlx5_wqe_ctrl_seg ctrl_seg;
mlx5_wqe_raddr_seg raddr_seg;
mlx5_wqe_data_seg data_seg;
mlx5_wqe_inl_data_seg inl_data_seg;
mlx5_wqe_atomic_seg atomic_seg;
};
mlx5_segment *segp;
};
} // namespace rocshmem