QueuePair: prefix bnxt functions and variables (#373)

[ROCm/rocshmem commit: f5940f6b9a]
Этот коммит содержится в:
Omri Mor
2025-12-22 14:46:17 -08:00
коммит произвёл GitHub
родитель c43dc136f3
Коммит 56bfb13644
3 изменённых файлов: 54 добавлений и 54 удалений
+14 -14
Просмотреть файл
@@ -45,10 +45,10 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)");
memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq));
gpu_qp->cq.buf = bnxt_scqs[conn_num].buf;
gpu_qp->cq.depth = bnxt_scqs[conn_num].depth;
gpu_qp->cq.id = dv_cq.cqn;
memset(&gpu_qp->bnxt_cq, 0, sizeof(bnxt_device_cq));
gpu_qp->bnxt_cq.buf = bnxt_scqs[conn_num].buf;
gpu_qp->bnxt_cq.depth = bnxt_scqs[conn_num].depth;
gpu_qp->bnxt_cq.id = dv_cq.cqn;
/* Export QP */
memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj));
@@ -58,25 +58,25 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP);
CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");
memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq));
gpu_qp->sq.buf = bnxt_qps[conn_num].sq_buf;
gpu_qp->sq.depth = bnxt_qps[conn_num].mem_info.sq_slots;
memset(&gpu_qp->bnxt_sq, 0, sizeof(bnxt_device_sq));
gpu_qp->bnxt_sq.buf = bnxt_qps[conn_num].sq_buf;
gpu_qp->bnxt_sq.depth = bnxt_qps[conn_num].mem_info.sq_slots;
if ((gpu_qp->sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) {
if ((gpu_qp->bnxt_sq.depth % BNXT_RE_STATIC_WQE_BB) != 0) {
fprintf(stderr,
"[WARNING] SQ depth not divisible by BNXT_RE_STATIC_WQE_BB. "
"There may be runtime errors.\n");
}
gpu_qp->sq.id = ib_qp->qp_num;
gpu_qp->sq.msntbl = bnxt_qps[conn_num].msntbl;
gpu_qp->sq.msn_tbl_sz = bnxt_qps[conn_num].msn_tbl_sz;
gpu_qp->sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz);
gpu_qp->sq.mtu = ibv_mtu_to_int(portinfo.active_mtu);
gpu_qp->bnxt_sq.id = ib_qp->qp_num;
gpu_qp->bnxt_sq.msntbl = bnxt_qps[conn_num].msntbl;
gpu_qp->bnxt_sq.msn_tbl_sz = bnxt_qps[conn_num].msn_tbl_sz;
gpu_qp->bnxt_sq.psn_sz_log2 = std::log2(bnxt_qps[conn_num].mem_info.sq_psn_sz);
gpu_qp->bnxt_sq.mtu = ibv_mtu_to_int(portinfo.active_mtu);
/* Export DB */
CHECK_HIP(hipHostRegister(bnxt_qps[conn_num].db_region_attr->dbr, getpagesize(), hipHostRegisterDefault));
CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->dbr, bnxt_qps[conn_num].db_region_attr->dbr, 0));
CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->bnxt_dbr, bnxt_qps[conn_num].db_region_attr->dbr, 0));
/* Export Memory Keys */
gpu_qp->lkey = heap_mr->lkey;
+35 -35
Просмотреть файл
@@ -96,7 +96,7 @@ __device__ static inline void* bnxt_re_get_hwqe(struct bnxt_device_sq *sq, uint3
return (void *)((char*)sq->buf + (idx << 4));
}
__device__ static inline void aquire_lock(uint32_t *lock) {
__device__ static inline void acquire_lock(uint32_t *lock) {
uint32_t expected;
do {
@@ -117,18 +117,18 @@ __device__ void QueuePair::bnxt_ring_doorbell(uint32_t slot_idx) {
uint64_t key_lo;
uint64_t key_hi;
epoch = (sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT;
epoch = (bnxt_sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT;
key_lo = (slot_idx | epoch);
key_hi = (sq.id & BNXT_RE_DB_QID_MASK)
key_hi = (bnxt_sq.id & BNXT_RE_DB_QID_MASK)
| (((uint64_t) BNXT_RE_QUE_TYPE_SQ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT)
| (0x1UL << BNXT_RE_DB_VALID_SHIFT);
hdr.typ_qid_indx = (key_lo | (key_hi << 32));
__threadfence_system();
__hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
__hip_atomic_store(bnxt_dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) {
@@ -167,7 +167,7 @@ __device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) {
}
}
__device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) {
__device__ void QueuePair::bnxt_poll_cq_until(uint32_t requested_available_slots) {
struct bnxt_re_req_cqe *cqe;
uint32_t sq_tail;
uint32_t sq_head;
@@ -175,10 +175,10 @@ __device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) {
uint32_t consumed_slots;
uint32_t available_slots;
sq_depth = sq.depth;
sq_depth = bnxt_sq.depth;
do {
cqe = (struct bnxt_re_req_cqe *) cq.buf;
cqe = (struct bnxt_re_req_cqe *) bnxt_cq.buf;
#ifdef DEBUG
bnxt_check_cqe_error(cqe);
@@ -188,9 +188,9 @@ __device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) {
* This param provides us the wqe_idx but we need to convert to the slot idx.
* We assume a static slots size of GDA_BNXT_WQE_SLOT_COUNT thus can multipy by this value */
sq_head = (((cqe->con_indx & 0xFFFF) * GDA_BNXT_WQE_SLOT_COUNT) % sq_depth);
sq.head = sq_head;
bnxt_sq.head = sq_head;
sq_tail = __hip_atomic_load(&sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
sq_tail = __hip_atomic_load(&bnxt_sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
consumed_slots = (sq_tail - sq_head + sq_depth) % sq_depth;
available_slots = sq_depth - consumed_slots;
@@ -205,12 +205,12 @@ __device__ void QueuePair::bnxt_quiet() {
active_lane_id = get_active_lane_num(active_lane_mask);
if (0 == active_lane_id) {
poll_cq_until(sq.depth);
bnxt_poll_cq_until(bnxt_sq.depth);
}
}
__device__ void QueuePair::bnxt_quiet_single() {
poll_cq_until(sq.depth);
bnxt_poll_cq_until(bnxt_sq.depth);
}
__device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr, int32_t length, uint8_t opcode) {
@@ -228,11 +228,11 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr,
inline_msg = length <= inline_threshold &&
opcode == gda_op_rdma_write;
poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
bnxt_poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1);
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2);
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&bnxt_sq, 0);
rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&bnxt_sq, 1);
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&bnxt_sq, 2);
/* Populate Header Segment */
wqe_type = BNXT_RE_HDR_WT_MASK & opcode;
@@ -272,10 +272,10 @@ __device__ void QueuePair::bnxt_write_rma_wqe(uintptr_t raddr, uintptr_t laddr,
}
/* Populate MSN Table */
bnxt_re_fill_psns_for_msntbl(&sq, length);
bnxt_re_fill_psns_for_msntbl(&bnxt_sq, length);
/* Update SQ Pointer */
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
bnxt_re_incr_tail(&bnxt_sq, GDA_BNXT_WQE_SLOT_COUNT);
}
__device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t laddr, uintptr_t raddr, uint8_t opcode) {
@@ -288,7 +288,7 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t l
active_lane_id = get_active_lane_num(active_lane_mask);
if (0 == active_lane_id) {
aquire_lock(&sq.lock);
acquire_lock(&bnxt_sq.lock);
}
for (int i = 0; i < active_lane_count; i++) {
@@ -297,12 +297,12 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t l
bnxt_write_rma_wqe(raddr, laddr, length, opcode);
/* Ring Doorbell */
bnxt_ring_doorbell(sq.tail);
bnxt_ring_doorbell(bnxt_sq.tail);
}
}
if (0 == active_lane_id) {
release_lock(&sq.lock);
release_lock(&bnxt_sq.lock);
}
}
@@ -310,16 +310,16 @@ __device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t la
uintptr_t raddr, uint8_t opcode,
bool ring_db) {
aquire_lock(&sq.lock);
acquire_lock(&bnxt_sq.lock);
/* Write WQE to SQ */
bnxt_write_rma_wqe(raddr, laddr, length, opcode);
if (ring_db) {
bnxt_ring_doorbell(sq.tail);
bnxt_ring_doorbell(bnxt_sq.tail);
}
release_lock(&sq.lock);
release_lock(&bnxt_sq.lock);
}
__device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcode,
@@ -338,11 +338,11 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcod
uint32_t atomic_idx = 0;
uint32_t length = sizeof(uint64_t);
poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
bnxt_poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&sq, 1);
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2);
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&bnxt_sq, 0);
amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&bnxt_sq, 1);
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&bnxt_sq, 2);
/* Populate Header Segment */
wqe_size = BNXT_RE_HDR_WS_MASK & GDA_BNXT_WQE_SLOT_COUNT;
@@ -377,10 +377,10 @@ __device__ uint32_t QueuePair::bnxt_write_amo_wqe(uintptr_t raddr, uint8_t opcod
memcpy(sge_ptr, &sge, sizeof(struct bnxt_re_sge));
/* Populate MSN Table */
bnxt_re_fill_psns_for_msntbl(&sq, length);
bnxt_re_fill_psns_for_msntbl(&bnxt_sq, length);
/* Update SQ Pointer */
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
bnxt_re_incr_tail(&bnxt_sq, GDA_BNXT_WQE_SLOT_COUNT);
return atomic_idx;
}
@@ -398,7 +398,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode
active_lane_id = get_active_lane_num(active_lane_mask);
if (0 == active_lane_id) {
aquire_lock(&sq.lock);
acquire_lock(&bnxt_sq.lock);
}
for (int i = 0; i < active_lane_count; i++) {
@@ -406,12 +406,12 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t raddr, uint8_t opcode
atomic_idx = bnxt_write_amo_wqe(raddr, opcode, atomic_data, atomic_cmp, fetching);
/* Ring Doorbell */
bnxt_ring_doorbell(sq.tail);
bnxt_ring_doorbell(bnxt_sq.tail);
}
}
if (0 == active_lane_id) {
release_lock(&sq.lock);
release_lock(&bnxt_sq.lock);
}
if (fetching) {
@@ -427,14 +427,14 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t raddr, uint8_t
bool fetching) {
uint32_t atomic_idx = 0;
aquire_lock(&sq.lock);
acquire_lock(&bnxt_sq.lock);
/* Write WQE to SQ */
atomic_idx = bnxt_write_amo_wqe(raddr, opcode, atomic_data, atomic_cmp, fetching);
bnxt_ring_doorbell(sq.tail);
bnxt_ring_doorbell(bnxt_sq.tail);
release_lock(&sq.lock);
release_lock(&bnxt_sq.lock);
if (fetching) {
quiet();
+5 -5
Просмотреть файл
@@ -253,11 +253,11 @@ class QueuePair {
int gda_provider_{0};
/* GDAProvider::BNXT START */
uint64_t *dbr;
struct bnxt_device_cq cq;
struct bnxt_device_sq sq;
uint64_t *bnxt_dbr;
struct bnxt_device_cq bnxt_cq;
struct bnxt_device_sq bnxt_sq;
__device__ void poll_cq_until(uint32_t requested_available_slots);
__device__ void bnxt_poll_cq_until(uint32_t requested_available_slots);
__device__ void bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe);
/* GDAProvider::BNXT END */
@@ -393,7 +393,7 @@ class QueuePair {
uint32_t fetching_atomic_idx{0};
struct ibv_mr *mr_fetching_atomic;
static const uint32_t FETCHING_ATOMIC_CNT{1024};
static constexpr uint32_t FETCHING_ATOMIC_CNT{1024};
static_assert(FETCHING_ATOMIC_CNT % WF_SIZE == 0);
using FreeListT = FreeList<uint64_t*, HIPAllocator>;
FreeListT* fetching_atomic_freelist{nullptr};