diff --git a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp index 0b0bbc3f34..80c67c2143 100644 --- a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp +++ b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp @@ -45,10 +45,10 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ); CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)"); - memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq)); - gpu_qp->cq.buf = bnxt_scqs[conn_num].buf; - gpu_qp->cq.depth = bnxt_scqs[conn_num].depth; - gpu_qp->cq.id = dv_cq.cqn; + memset(&gpu_qp->bnxt_cq, 0, sizeof(bnxt_device_cq)); + gpu_qp->bnxt_cq.buf = bnxt_scqs[conn_num].buf; + gpu_qp->bnxt_cq.depth = bnxt_scqs[conn_num].depth; + gpu_qp->bnxt_cq.id = dv_cq.cqn; /* Export QP */ memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj)); @@ -58,25 +58,25 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP); CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)"); - memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq)); - gpu_qp->sq.buf = bnxt_qps[conn_num].sq_buf; - gpu_qp->sq.depth = bnxt_qps[conn_num].mem_info.sq_slots; + memset(&gpu_qp->bnxt_sq, 0, sizeof(bnxt_device_sq)); + gpu_qp->bnxt_sq.buf = bnxt_qps[conn_num].sq_buf; + gpu_qp->bnxt_sq.depth = bnxt_qps[conn_num].mem_info.sq_slots; - if ((gpu_qp->sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) { + if ((gpu_qp->bnxt_sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) { fprintf(stderr, "[WARNING] SQ depth not divisible by BNXT_RE_STATIC_WQE_BB. " "There may be runtime errors.\n"); } - gpu_qp->sq.id = ib_qp->qp_num; - gpu_qp->sq.msntbl = bnxt_qps[conn_num].msntbl; - gpu_qp->sq.msn_tbl_sz = bnxt_qps[conn_num].msn_tbl_sz; - gpu_qp->sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz); - gpu_qp->sq.mtu = ibv_mtu_to_int(portinfo.active_mtu); + gpu_qp->bnxt_sq.id = ib_qp->qp_num; + gpu_qp->bnxt_sq.msntbl = bnxt_qps[conn_num].msntbl; + gpu_qp->bnxt_sq.msn_tbl_sz = bnxt_qps[conn_num].msn_tbl_sz; + gpu_qp->bnxt_sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz); + gpu_qp->bnxt_sq.mtu = ibv_mtu_to_int(portinfo.active_mtu); /* Export DB */ CHECK_HIP(hipHostRegister(bnxt_qps[conn_num].db_region_attr->dbr, getpagesize(), hipHostRegisterDefault)); - CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->dbr, bnxt_qps[conn_num].db_region_attr->dbr, 0)); + CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->bnxt_dbr, bnxt_qps[conn_num].db_region_attr->dbr, 0)); /* Export Memory Keys */ gpu_qp->lkey = heap_mr->lkey; diff --git a/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp b/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp index ba63b29c3a..f3e5887a2b 100644 --- a/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp +++ b/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp @@ -96,7 +96,7 @@ __device__ static inline void* bnxt_re_get_hwqe(struct bnxt_device_sq *sq, uint3 return (void *)((char*)sq->buf + (idx << 4)); } -__device__ static inline void aquire_lock(uint32_t *lock) { +__device__ static inline void acquire_lock(uint32_t *lock) { uint32_t expected; do { @@ -117,18 +117,18 @@ __device__ void QueuePair::bnxt_ring_doorbell(uint32_t slot_idx) { uint64_t key_lo; uint64_t key_hi; - epoch = (sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT; + epoch = (bnxt_sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT; key_lo = (slot_idx | epoch); - key_hi = (sq.id & BNXT_RE_DB_QID_MASK) + key_hi = (bnxt_sq.id & BNXT_RE_DB_QID_MASK) | (((uint64_t) BNXT_RE_QUE_TYPE_SQ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT) | (0x1UL << BNXT_RE_DB_VALID_SHIFT); hdr.typ_qid_indx = (key_lo | (key_hi << 32)); __threadfence_system(); - __hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); + __hip_atomic_store(bnxt_dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); } __device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) { @@ -167,7 +167,7 @@ __device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) { } } -__device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) { +__device__ void QueuePair::bnxt_poll_cq_until(uint32_t requested_available_slots) { struct bnxt_re_req_cqe *cqe; uint32_t sq_tail; uint32_t sq_head; @@ -175,10 +175,10 @@ __device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) { uint32_t consumed_slots; uint32_t available_slots; - sq_depth = sq.depth; + sq_depth = bnxt_sq.depth; do { - cqe = (struct bnxt_re_req_cqe *) cq.buf; + cqe = (struct bnxt_re_req_cqe *) bnxt_cq.buf; #ifdef DEBUG bnxt_check_cqe_error(cqe); @@ -188,9 +188,9 @@ __device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) { * This param provides us the wqe_idx but we need to convert to the slot idx. * We assume a static slots size of GDA_BNXT_WQE_SLOT_COUNT thus can multipy by this value */ sq_head = (((cqe->con_indx & 0xFFFF) * GDA_BNXT_WQE_SLOT_COUNT) % sq_depth); - sq.head = sq_head; + bnxt_sq.head = sq_head; - sq_tail = __hip_atomic_load(&sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); + sq_tail = __hip_atomic_load(&bnxt_sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); consumed_slots = (sq_tail - sq_head + sq_depth) % sq_depth; available_slots = sq_depth - consumed_slots; @@ -205,12 +205,12 @@ __device__ void QueuePair::bnxt_quiet() { active_lane_id = get_active_lane_num(active_lane_mask); if (0 == active_lane_id) { - poll_cq_until(sq.depth); + bnxt_poll_cq_until(bnxt_sq.depth); } } __device__ void QueuePair::bnxt_quiet_single() { - poll_cq_until(sq.depth); + bnxt_poll_cq_until(bnxt_sq.depth); } __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode) { @@ -228,11 +228,11 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, inline_msg = length <= inline_threshold && opcode == gda_op_rdma_write; - poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT); + bnxt_poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT); - hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0); - rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1); - sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2); + hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&bnxt_sq, 0); + rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&bnxt_sq, 1); + sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&bnxt_sq, 2); /* Populate Header Segment */ wqe_type = BNXT_RE_HDR_WT_MASK & opcode; @@ -272,10 +272,10 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, } /* Populate MSN Table */ - bnxt_re_fill_psns_for_msntbl(&sq, length); + bnxt_re_fill_psns_for_msntbl(&bnxt_sq, length); /* Update SQ Pointer */ - bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT); + bnxt_re_incr_tail(&bnxt_sq, GDA_BNXT_WQE_SLOT_COUNT); } __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) { @@ -288,7 +288,7 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t l active_lane_id = get_active_lane_num(active_lane_mask); if (0 == active_lane_id) { - aquire_lock(&sq.lock); + acquire_lock(&bnxt_sq.lock); } for (int i = 0; i < active_lane_count; i++) { @@ -297,12 +297,12 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t l bnxt_write_rma_wqe(raddr, laddr, length, opcode); /* Ring Doorbell */ - bnxt_ring_doorbell(sq.tail); + bnxt_ring_doorbell(bnxt_sq.tail); } } if (0 == active_lane_id) { - release_lock(&sq.lock); + release_lock(&bnxt_sq.lock); } } @@ -310,16 +310,16 @@ __device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t la uintptr_t raddr, uint8_t opcode, bool ring_db) { - aquire_lock(&sq.lock); + acquire_lock(&bnxt_sq.lock); /* Write WQE to SQ */ bnxt_write_rma_wqe(raddr, laddr, length, opcode); if (ring_db) { - bnxt_ring_doorbell(sq.tail); + bnxt_ring_doorbell(bnxt_sq.tail); } - release_lock(&sq.lock); + release_lock(&bnxt_sq.lock); } __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode, @@ -338,11 +338,11 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcod uint32_t atomic_idx = 0; uint32_t length = sizeof(uint64_t); - poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT); + bnxt_poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT); - hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0); - amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&sq, 1); - sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2); + hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&bnxt_sq, 0); + amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&bnxt_sq, 1); + sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&bnxt_sq, 2); /* Populate Header Segment */ wqe_size = BNXT_RE_HDR_WS_MASK & GDA_BNXT_WQE_SLOT_COUNT; @@ -377,10 +377,10 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcod memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge)); /* Populate MSN Table */ - bnxt_re_fill_psns_for_msntbl(&sq, length); + bnxt_re_fill_psns_for_msntbl(&bnxt_sq, length); /* Update SQ Pointer */ - bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT); + bnxt_re_incr_tail(&bnxt_sq, GDA_BNXT_WQE_SLOT_COUNT); return atomic_idx; } @@ -398,7 +398,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode active_lane_id = get_active_lane_num(active_lane_mask); if (0 == active_lane_id) { - aquire_lock(&sq.lock); + acquire_lock(&bnxt_sq.lock); } for (int i = 0; i < active_lane_count; i++) { @@ -406,12 +406,12 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode atomic_idx = bnxt_write_amo_wqe(raddr, opcode, atomic_data, atomic_cmp, fetching); /* Ring Doorbell */ - bnxt_ring_doorbell(sq.tail); + bnxt_ring_doorbell(bnxt_sq.tail); } } if (0 == active_lane_id) { - release_lock(&sq.lock); + release_lock(&bnxt_sq.lock); } if (fetching) { @@ -427,14 +427,14 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t bool fetching) { uint32_t atomic_idx = 0; - aquire_lock(&sq.lock); + acquire_lock(&bnxt_sq.lock); /* Write WQE to SQ */ atomic_idx = bnxt_write_amo_wqe(raddr, opcode, atomic_data, atomic_cmp, fetching); - bnxt_ring_doorbell(sq.tail); + bnxt_ring_doorbell(bnxt_sq.tail); - release_lock(&sq.lock); + release_lock(&bnxt_sq.lock); if (fetching) { quiet(); diff --git a/projects/rocshmem/src/gda/queue_pair.hpp b/projects/rocshmem/src/gda/queue_pair.hpp index 36b217c1f0..9236bf9fc4 100644 --- a/projects/rocshmem/src/gda/queue_pair.hpp +++ b/projects/rocshmem/src/gda/queue_pair.hpp @@ -253,11 +253,11 @@ class QueuePair { int gda_provider_{0}; /* GDAProvider::BNXT START */ - uint64_t *dbr; - struct bnxt_device_cq cq; - struct bnxt_device_sq sq; + uint64_t *bnxt_dbr; + struct bnxt_device_cq bnxt_cq; + struct bnxt_device_sq bnxt_sq; - __device__ void poll_cq_until(uint32_t requested_available_slots); + __device__ void bnxt_poll_cq_until(uint32_t requested_available_slots); __device__ void bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe); /* GDAProvider::BNXT END */ @@ -393,7 +393,7 @@ class QueuePair { uint32_t fetching_atomic_idx{0}; struct ibv_mr *mr_fetching_atomic; - static const uint32_t FETCHING_ATOMIC_CNT{1024}; + static constexpr uint32_t FETCHING_ATOMIC_CNT{1024}; static_assert(FETCHING_ATOMIC_CNT % WF_SIZE == 0); using FreeListT = FreeList; FreeListT* fetching_atomic_freelist{nullptr};