QueuePair: prefix bnxt functions and variables (#373)
[ROCm/rocshmem commit: f5940f6b9a]
Этот коммит содержится в:
@@ -45,10 +45,10 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
|
||||
err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)");
|
||||
|
||||
memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq));
|
||||
gpu_qp->cq.buf = bnxt_scqs[conn_num].buf;
|
||||
gpu_qp->cq.depth = bnxt_scqs[conn_num].depth;
|
||||
gpu_qp->cq.id = dv_cq.cqn;
|
||||
memset(&gpu_qp->bnxt_cq, 0, sizeof(bnxt_device_cq));
|
||||
gpu_qp->bnxt_cq.buf = bnxt_scqs[conn_num].buf;
|
||||
gpu_qp->bnxt_cq.depth = bnxt_scqs[conn_num].depth;
|
||||
gpu_qp->bnxt_cq.id = dv_cq.cqn;
|
||||
|
||||
/* Export QP */
|
||||
memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj));
|
||||
@@ -58,25 +58,25 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
|
||||
err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");
|
||||
|
||||
memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq));
|
||||
gpu_qp->sq.buf = bnxt_qps[conn_num].sq_buf;
|
||||
gpu_qp->sq.depth = bnxt_qps[conn_num].mem_info.sq_slots;
|
||||
memset(&gpu_qp->bnxt_sq, 0, sizeof(bnxt_device_sq));
|
||||
gpu_qp->bnxt_sq.buf = bnxt_qps[conn_num].sq_buf;
|
||||
gpu_qp->bnxt_sq.depth = bnxt_qps[conn_num].mem_info.sq_slots;
|
||||
|
||||
if ((gpu_qp->sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) {
|
||||
if ((gpu_qp->bnxt_sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) {
|
||||
fprintf(stderr,
|
||||
"[WARNING] SQ depth not divisible by BNXT_RE_STATIC_WQE_BB. "
|
||||
"There may be runtime errors.\n");
|
||||
}
|
||||
|
||||
gpu_qp->sq.id = ib_qp->qp_num;
|
||||
gpu_qp->sq.msntbl = bnxt_qps[conn_num].msntbl;
|
||||
gpu_qp->sq.msn_tbl_sz = bnxt_qps[conn_num].msn_tbl_sz;
|
||||
gpu_qp->sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz);
|
||||
gpu_qp->sq.mtu = ibv_mtu_to_int(portinfo.active_mtu);
|
||||
gpu_qp->bnxt_sq.id = ib_qp->qp_num;
|
||||
gpu_qp->bnxt_sq.msntbl = bnxt_qps[conn_num].msntbl;
|
||||
gpu_qp->bnxt_sq.msn_tbl_sz = bnxt_qps[conn_num].msn_tbl_sz;
|
||||
gpu_qp->bnxt_sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz);
|
||||
gpu_qp->bnxt_sq.mtu = ibv_mtu_to_int(portinfo.active_mtu);
|
||||
|
||||
/* Export DB */
|
||||
CHECK_HIP(hipHostRegister(bnxt_qps[conn_num].db_region_attr->dbr, getpagesize(), hipHostRegisterDefault));
|
||||
CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->dbr, bnxt_qps[conn_num].db_region_attr->dbr, 0));
|
||||
CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->bnxt_dbr, bnxt_qps[conn_num].db_region_attr->dbr, 0));
|
||||
|
||||
/* Export Memory Keys */
|
||||
gpu_qp->lkey = heap_mr->lkey;
|
||||
|
||||
@@ -96,7 +96,7 @@ __device__ static inline void* bnxt_re_get_hwqe(struct bnxt_device_sq *sq, uint3
|
||||
return (void *)((char*)sq->buf + (idx << 4));
|
||||
}
|
||||
|
||||
__device__ static inline void aquire_lock(uint32_t *lock) {
|
||||
__device__ static inline void acquire_lock(uint32_t *lock) {
|
||||
uint32_t expected;
|
||||
|
||||
do {
|
||||
@@ -117,18 +117,18 @@ __device__ void QueuePair::bnxt_ring_doorbell(uint32_t slot_idx) {
|
||||
uint64_t key_lo;
|
||||
uint64_t key_hi;
|
||||
|
||||
epoch = (sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT;
|
||||
epoch = (bnxt_sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT;
|
||||
|
||||
key_lo = (slot_idx | epoch);
|
||||
|
||||
key_hi = (sq.id & BNXT_RE_DB_QID_MASK)
|
||||
key_hi = (bnxt_sq.id & BNXT_RE_DB_QID_MASK)
|
||||
| (((uint64_t) BNXT_RE_QUE_TYPE_SQ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT)
|
||||
| (0x1UL << BNXT_RE_DB_VALID_SHIFT);
|
||||
|
||||
hdr.typ_qid_indx = (key_lo | (key_hi << 32));
|
||||
|
||||
__threadfence_system();
|
||||
__hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
__hip_atomic_store(bnxt_dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) {
|
||||
@@ -167,7 +167,7 @@ __device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) {
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) {
|
||||
__device__ void QueuePair::bnxt_poll_cq_until(uint32_t requested_available_slots) {
|
||||
struct bnxt_re_req_cqe *cqe;
|
||||
uint32_t sq_tail;
|
||||
uint32_t sq_head;
|
||||
@@ -175,10 +175,10 @@ __device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) {
|
||||
uint32_t consumed_slots;
|
||||
uint32_t available_slots;
|
||||
|
||||
sq_depth = sq.depth;
|
||||
sq_depth = bnxt_sq.depth;
|
||||
|
||||
do {
|
||||
cqe = (struct bnxt_re_req_cqe *) cq.buf;
|
||||
cqe = (struct bnxt_re_req_cqe *) bnxt_cq.buf;
|
||||
|
||||
#ifdef DEBUG
|
||||
bnxt_check_cqe_error(cqe);
|
||||
@@ -188,9 +188,9 @@ __device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) {
|
||||
* This param provides us the wqe_idx but we need to convert to the slot idx.
|
||||
* We assume a static slots size of GDA_BNXT_WQE_SLOT_COUNT thus can multipy by this value */
|
||||
sq_head = (((cqe->con_indx & 0xFFFF) * GDA_BNXT_WQE_SLOT_COUNT) % sq_depth);
|
||||
sq.head = sq_head;
|
||||
bnxt_sq.head = sq_head;
|
||||
|
||||
sq_tail = __hip_atomic_load(&sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
|
||||
sq_tail = __hip_atomic_load(&bnxt_sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
|
||||
|
||||
consumed_slots = (sq_tail - sq_head + sq_depth) % sq_depth;
|
||||
available_slots = sq_depth - consumed_slots;
|
||||
@@ -205,12 +205,12 @@ __device__ void QueuePair::bnxt_quiet() {
|
||||
active_lane_id = get_active_lane_num(active_lane_mask);
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
poll_cq_until(sq.depth);
|
||||
bnxt_poll_cq_until(bnxt_sq.depth);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_quiet_single() {
|
||||
poll_cq_until(sq.depth);
|
||||
bnxt_poll_cq_until(bnxt_sq.depth);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode) {
|
||||
@@ -228,11 +228,11 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr,
|
||||
inline_msg = length <= inline_threshold &&
|
||||
opcode == gda_op_rdma_write;
|
||||
|
||||
poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
|
||||
bnxt_poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
|
||||
|
||||
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
|
||||
rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1);
|
||||
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2);
|
||||
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&bnxt_sq, 0);
|
||||
rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&bnxt_sq, 1);
|
||||
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&bnxt_sq, 2);
|
||||
|
||||
/* Populate Header Segment */
|
||||
wqe_type = BNXT_RE_HDR_WT_MASK & opcode;
|
||||
@@ -272,10 +272,10 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr,
|
||||
}
|
||||
|
||||
/* Populate MSN Table */
|
||||
bnxt_re_fill_psns_for_msntbl(&sq, length);
|
||||
bnxt_re_fill_psns_for_msntbl(&bnxt_sq, length);
|
||||
|
||||
/* Update SQ Pointer */
|
||||
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
|
||||
bnxt_re_incr_tail(&bnxt_sq, GDA_BNXT_WQE_SLOT_COUNT);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) {
|
||||
@@ -288,7 +288,7 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t l
|
||||
active_lane_id = get_active_lane_num(active_lane_mask);
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
aquire_lock(&sq.lock);
|
||||
acquire_lock(&bnxt_sq.lock);
|
||||
}
|
||||
|
||||
for (int i = 0; i < active_lane_count; i++) {
|
||||
@@ -297,12 +297,12 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t l
|
||||
bnxt_write_rma_wqe(raddr, laddr, length, opcode);
|
||||
|
||||
/* Ring Doorbell */
|
||||
bnxt_ring_doorbell(sq.tail);
|
||||
bnxt_ring_doorbell(bnxt_sq.tail);
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
release_lock(&sq.lock);
|
||||
release_lock(&bnxt_sq.lock);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -310,16 +310,16 @@ __device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t la
|
||||
uintptr_t raddr, uint8_t opcode,
|
||||
bool ring_db) {
|
||||
|
||||
aquire_lock(&sq.lock);
|
||||
acquire_lock(&bnxt_sq.lock);
|
||||
|
||||
/* Write WQE to SQ */
|
||||
bnxt_write_rma_wqe(raddr, laddr, length, opcode);
|
||||
|
||||
if (ring_db) {
|
||||
bnxt_ring_doorbell(sq.tail);
|
||||
bnxt_ring_doorbell(bnxt_sq.tail);
|
||||
}
|
||||
|
||||
release_lock(&sq.lock);
|
||||
release_lock(&bnxt_sq.lock);
|
||||
}
|
||||
|
||||
__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode,
|
||||
@@ -338,11 +338,11 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcod
|
||||
uint32_t atomic_idx = 0;
|
||||
uint32_t length = sizeof(uint64_t);
|
||||
|
||||
poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
|
||||
bnxt_poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
|
||||
|
||||
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
|
||||
amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&sq, 1);
|
||||
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2);
|
||||
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&bnxt_sq, 0);
|
||||
amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&bnxt_sq, 1);
|
||||
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&bnxt_sq, 2);
|
||||
|
||||
/* Populate Header Segment */
|
||||
wqe_size = BNXT_RE_HDR_WS_MASK & GDA_BNXT_WQE_SLOT_COUNT;
|
||||
@@ -377,10 +377,10 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcod
|
||||
memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge));
|
||||
|
||||
/* Populate MSN Table */
|
||||
bnxt_re_fill_psns_for_msntbl(&sq, length);
|
||||
bnxt_re_fill_psns_for_msntbl(&bnxt_sq, length);
|
||||
|
||||
/* Update SQ Pointer */
|
||||
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
|
||||
bnxt_re_incr_tail(&bnxt_sq, GDA_BNXT_WQE_SLOT_COUNT);
|
||||
|
||||
return atomic_idx;
|
||||
}
|
||||
@@ -398,7 +398,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode
|
||||
active_lane_id = get_active_lane_num(active_lane_mask);
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
aquire_lock(&sq.lock);
|
||||
acquire_lock(&bnxt_sq.lock);
|
||||
}
|
||||
|
||||
for (int i = 0; i < active_lane_count; i++) {
|
||||
@@ -406,12 +406,12 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode
|
||||
atomic_idx = bnxt_write_amo_wqe(raddr, opcode, atomic_data, atomic_cmp, fetching);
|
||||
|
||||
/* Ring Doorbell */
|
||||
bnxt_ring_doorbell(sq.tail);
|
||||
bnxt_ring_doorbell(bnxt_sq.tail);
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
release_lock(&sq.lock);
|
||||
release_lock(&bnxt_sq.lock);
|
||||
}
|
||||
|
||||
if (fetching) {
|
||||
@@ -427,14 +427,14 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t
|
||||
bool fetching) {
|
||||
uint32_t atomic_idx = 0;
|
||||
|
||||
aquire_lock(&sq.lock);
|
||||
acquire_lock(&bnxt_sq.lock);
|
||||
|
||||
/* Write WQE to SQ */
|
||||
atomic_idx = bnxt_write_amo_wqe(raddr, opcode, atomic_data, atomic_cmp, fetching);
|
||||
|
||||
bnxt_ring_doorbell(sq.tail);
|
||||
bnxt_ring_doorbell(bnxt_sq.tail);
|
||||
|
||||
release_lock(&sq.lock);
|
||||
release_lock(&bnxt_sq.lock);
|
||||
|
||||
if (fetching) {
|
||||
quiet();
|
||||
|
||||
@@ -253,11 +253,11 @@ class QueuePair {
|
||||
int gda_provider_{0};
|
||||
|
||||
/* GDAProvider::BNXT START */
|
||||
uint64_t *dbr;
|
||||
struct bnxt_device_cq cq;
|
||||
struct bnxt_device_sq sq;
|
||||
uint64_t *bnxt_dbr;
|
||||
struct bnxt_device_cq bnxt_cq;
|
||||
struct bnxt_device_sq bnxt_sq;
|
||||
|
||||
__device__ void poll_cq_until(uint32_t requested_available_slots);
|
||||
__device__ void bnxt_poll_cq_until(uint32_t requested_available_slots);
|
||||
__device__ void bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe);
|
||||
|
||||
/* GDAProvider::BNXT END */
|
||||
@@ -393,7 +393,7 @@ class QueuePair {
|
||||
uint32_t fetching_atomic_idx{0};
|
||||
struct ibv_mr *mr_fetching_atomic;
|
||||
|
||||
static const uint32_t FETCHING_ATOMIC_CNT{1024};
|
||||
static constexpr uint32_t FETCHING_ATOMIC_CNT{1024};
|
||||
static_assert(FETCHING_ATOMIC_CNT % WF_SIZE == 0);
|
||||
using FreeListT = FreeList<uint64_t*, HIPAllocator>;
|
||||
FreeListT* fetching_atomic_freelist{nullptr};
|
||||
|
||||
Ссылка в новой задаче
Block a user