From 0bd07a26be1f51006f217aabf78bb3673a4e75da Mon Sep 17 00:00:00 2001 From: Yiltan Date: Thu, 23 Oct 2025 14:53:44 -0400 Subject: [PATCH] [GDA/BNXT] Implemented CQE Collapsing (#279) [ROCm/rocshmem commit: 6290db319c3e552bb6c532989dc8cae7b299c938] --- projects/rocshmem/src/gda/backend_gda.cpp | 60 +++++- projects/rocshmem/src/gda/backend_gda.hpp | 9 +- .../src/gda/bnxt/backend_gda_bnxt.cpp | 85 +++++--- .../src/gda/bnxt/provider_gda_bnxt.hpp | 9 +- .../rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp | 190 ++++++++---------- projects/rocshmem/src/gda/queue_pair.hpp | 6 +- 6 files changed, 203 insertions(+), 156 deletions(-) diff --git a/projects/rocshmem/src/gda/backend_gda.cpp b/projects/rocshmem/src/gda/backend_gda.cpp index d026979f03..0f14554d68 100644 --- a/projects/rocshmem/src/gda/backend_gda.cpp +++ b/projects/rocshmem/src/gda/backend_gda.cpp @@ -626,13 +626,20 @@ void GDABackend::cleanup_ibv() { CHECK_HIP(hipFree(bnxt_qps[i].sq_buf)); CHECK_HIP(hipFree(bnxt_qps[i].rq_buf)); - err = bnxt_re_dv.destroy_cq(cqs[i]); - CHECK_ZERO(err, "bnxt_re_dv_destroy_cq"); + err = bnxt_re_dv.destroy_cq(bnxt_scqs[i].cq); + CHECK_ZERO(err, "bnxt_re_dv_destroy_cq (SCQ)"); - err = bnxt_re_dv.umem_dereg(bnxt_cqs[i].umem_handle); - CHECK_ZERO(err, "bnxt_re_dv_umem_dereg"); + err = bnxt_re_dv.destroy_cq(bnxt_rcqs[i].cq); + CHECK_ZERO(err, "bnxt_re_dv_destroy_cq (RCQ)"); - CHECK_HIP(hipFree(bnxt_cqs[i].buf)); + err = bnxt_re_dv.umem_dereg(bnxt_scqs[i].umem_handle); + CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (SCQ)"); + + err = bnxt_re_dv.umem_dereg(bnxt_rcqs[i].umem_handle); + CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (RCQ)"); + + CHECK_HIP(hipFree(bnxt_scqs[i].buf)); + CHECK_HIP(hipFree(bnxt_rcqs[i].buf)); } } else { for (int i = 0; i < qps.size(); i++) { @@ -847,6 +854,8 @@ void GDABackend::open_ib_device() { dump_ibv_context(context); dump_ibv_device(context->device); + validate_ib_device(); + pd_orig = ibv_alloc_pd(context); CHECK_NNULL(pd_orig, "ib allocate pd"); dump_ibv_pd(pd_orig); @@ -865,6 +874,35 @@ void GDABackend::open_ib_device() { ibv_free_device_list(device_list); } +void GDABackend::validate_ib_device() { + int err; + + err = ibv_query_device(context, &device_attr); + CHECK_ZERO(err, "ibv_query_device"); + + if (gda_provider == GDAProvider::BNXT) { + const uint32_t bnxt_vendor_id = 0x14E4; + const std::set supported_bnxt_part_ids = { 0x1760 /* BCM57608 */}; + const char min_supported_bnxt_fw_ver[12] = "233.2.104.0"; + + + if (bnxt_vendor_id != device_attr.vendor_id) { + printf("GDAProvider::BNXT requested but an invalid device is selected\n"); + abort(); + } + + if (supported_bnxt_part_ids.find(device_attr.vendor_part_id) == supported_bnxt_part_ids.end()) { + printf("Unsupported Broadcom Part: %x\n", device_attr.vendor_part_id); + abort(); + } + + if (strverscmp(min_supported_bnxt_fw_ver, device_attr.fw_ver) > 0) { + printf("Unsupported firmware version: %s\n", device_attr.fw_ver); + abort(); + } + } +} + void GDABackend::modify_qps_reset_to_init() { int err; struct ibv_qp_attr attr; @@ -999,7 +1037,8 @@ void GDABackend::create_queues() { cqs.resize(resize_length); qps.resize(resize_length); - bnxt_cqs.resize(resize_length); + bnxt_scqs.resize(resize_length); + bnxt_rcqs.resize(resize_length); bnxt_qps.resize(resize_length); if (gda_provider == GDAProvider::BNXT) { @@ -1052,10 +1091,11 @@ void GDABackend::alternate_qp_ports() { if (new_qp_idx < qps.size()) { // Swap QPs - std::swap(cqs[cur_qp_idx], cqs[new_qp_idx]); - std::swap(qps[cur_qp_idx], qps[new_qp_idx]); - std::swap(bnxt_cqs[cur_qp_idx], bnxt_cqs[new_qp_idx]); - std::swap(bnxt_qps[cur_qp_idx], bnxt_qps[new_qp_idx]); + std::swap(cqs[cur_qp_idx], cqs[new_qp_idx]); + std::swap(qps[cur_qp_idx], qps[new_qp_idx]); + std::swap(bnxt_scqs[cur_qp_idx], bnxt_scqs[new_qp_idx]); + std::swap(bnxt_rcqs[cur_qp_idx], bnxt_rcqs[new_qp_idx]); + std::swap(bnxt_qps[cur_qp_idx], bnxt_qps[new_qp_idx]); } } } diff --git a/projects/rocshmem/src/gda/backend_gda.hpp b/projects/rocshmem/src/gda/backend_gda.hpp index d2a83cb3b9..2d5e4f9fdd 100644 --- a/projects/rocshmem/src/gda/backend_gda.hpp +++ b/projects/rocshmem/src/gda/backend_gda.hpp @@ -65,6 +65,7 @@ class GDABackend : public Backend { const char *requested_dev = nullptr; struct ibv_context *context = nullptr;; + struct ibv_device_attr device_attr; struct ibv_pd *pd_orig = nullptr; enum GDAProvider gda_provider = GDAProvider::UNSET; @@ -85,7 +86,8 @@ class GDABackend : public Backend { /* GDA_BNXT START */ std::vector bnxt_qps; - std::vector bnxt_cqs; + std::vector bnxt_scqs; + std::vector bnxt_rcqs; struct bnxt_re_dv_db_region_attr db_region_attr; /* GDA_BNXT END */ @@ -332,6 +334,11 @@ class GDABackend : public Backend { */ void open_ib_device(); + /** + * @brief Validated the rocSHMEM will run with the currently open InfiniBand Device + */ + void validate_ib_device(); + /** * @brief Selects the best GID index */ diff --git a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp index a61f20fb49..511d03b22a 100644 --- a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp +++ b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp @@ -37,19 +37,18 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { ib_qp = qps[conn_num]; - /* Export CQ */ + /* Export SCQ */ memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj)); - dv_obj.cq.in = cqs[conn_num]; + dv_obj.cq.in = bnxt_scqs[conn_num].cq; dv_obj.cq.out = &dv_cq; err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ); CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)"); memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq)); - gpu_qp->cq.buf = bnxt_cqs[conn_num].buf; - gpu_qp->cq.depth = bnxt_cqs[conn_num].depth; + gpu_qp->cq.buf = bnxt_scqs[conn_num].buf; + gpu_qp->cq.depth = bnxt_scqs[conn_num].depth; gpu_qp->cq.id = dv_cq.cqn; - gpu_qp->cq.phase = BNXT_RE_QUEUE_START_PHASE; /* Export QP */ memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj)); @@ -95,34 +94,72 @@ void GDABackend::bnxt_create_cqs(int cqe) { struct bnxt_re_dv_cq_init_attr cq_init_attr; struct bnxt_re_dv_umem_reg_attr umem_attr; + /* Ignore value of cqe as we only need of length 1 to use CQE compression */ + cqe = 1; + + /* Create SCQs */ for (int i = 0; i < qps.size(); i++) { - /* Allocate CQ mem */ + /* Allocate SCQ mem */ memset(&cq_attr, 0, sizeof(struct bnxt_re_dv_cq_attr)); - bnxt_cqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr); - CHECK_NNULL(bnxt_cqs[i].handle, "bnxt_re_dv_cq_mem_alloc"); + bnxt_scqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr); + CHECK_NNULL(bnxt_scqs[i].handle, "bnxt_re_dv_cq_mem_alloc (SCQ)"); - /* Allocate CQ UMEM */ - bnxt_cqs[i].length = cq_attr.ncqe * cq_attr.cqe_size; - bnxt_cqs[i].depth = cq_attr.ncqe; - CHECK_HIP(hipExtMallocWithFlags(&bnxt_cqs[i].buf, bnxt_cqs[i].length, hipDeviceMallocUncached)); + /* We must force this to a value of 1 to use CQE Compression */ + cq_attr.ncqe = cqe; - /* Register CQ UMEM */ + /* Allocate SCQ UMEM */ + bnxt_scqs[i].length = cq_attr.ncqe * cq_attr.cqe_size; + bnxt_scqs[i].depth = cq_attr.ncqe; + CHECK_HIP(hipExtMallocWithFlags(&bnxt_scqs[i].buf, bnxt_scqs[i].length, hipDeviceMallocUncached)); + + /* Register SCQ UMEM */ memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr)); - umem_attr.addr = bnxt_cqs[i].buf; - umem_attr.size = bnxt_cqs[i].length; + umem_attr.addr = bnxt_scqs[i].buf; + umem_attr.size = bnxt_scqs[i].length; umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE; - bnxt_cqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr); - CHECK_NNULL(bnxt_cqs[i].umem_handle, "bnxt_re_dv_umem_reg(cq_buf)"); + bnxt_scqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr); + CHECK_NNULL(bnxt_scqs[i].umem_handle, "bnxt_re_dv_umem_reg(scq_buf)"); - /* Create CQ */ + /* Create SCQ */ memset(&cq_init_attr, 0, sizeof(struct bnxt_re_dv_cq_init_attr)); - cq_init_attr.cq_handle = (uint64_t) bnxt_cqs[i].handle; - cq_init_attr.umem_handle = bnxt_cqs[i].umem_handle; + cq_init_attr.cq_handle = (uint64_t) bnxt_scqs[i].handle; + cq_init_attr.umem_handle = bnxt_scqs[i].umem_handle; cq_init_attr.ncqe = cq_attr.ncqe; - cqs[i] = bnxt_re_dv.create_cq(context, &cq_init_attr); - CHECK_NNULL(cqs[i], "bnxt_re_dv_create_cq"); + bnxt_scqs[i].cq = bnxt_re_dv.create_cq(context, &cq_init_attr); + CHECK_NNULL(bnxt_scqs[i].cq, "bnxt_re_dv_create_cq (SCQ) "); + } + + /* Create RCQs */ + for (int i = 0; i < qps.size(); i++) { + /* Allocate RCQ mem */ + memset(&cq_attr, 0, sizeof(struct bnxt_re_dv_cq_attr)); + bnxt_rcqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr); + CHECK_NNULL(bnxt_rcqs[i].handle, "bnxt_re_dv_cq_mem_alloc (RCQ)"); + + /* Allocate RCQ UMEM */ + bnxt_rcqs[i].length = cq_attr.ncqe * cq_attr.cqe_size; + bnxt_rcqs[i].depth = cq_attr.ncqe; + CHECK_HIP(hipExtMallocWithFlags(&bnxt_rcqs[i].buf, bnxt_rcqs[i].length, hipDeviceMallocUncached)); + + /* Register RCQ UMEM */ + memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr)); + umem_attr.addr = bnxt_rcqs[i].buf; + umem_attr.size = bnxt_rcqs[i].length; + umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE; + + bnxt_rcqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr); + CHECK_NNULL(bnxt_rcqs[i].umem_handle, "bnxt_re_dv_umem_reg(rcq_buf)"); + + /* Create RCQ */ + memset(&cq_init_attr, 0, sizeof(struct bnxt_re_dv_cq_init_attr)); + cq_init_attr.cq_handle = (uint64_t) bnxt_rcqs[i].handle; + cq_init_attr.umem_handle = bnxt_rcqs[i].umem_handle; + cq_init_attr.ncqe = cq_attr.ncqe; + + bnxt_rcqs[i].cq = bnxt_re_dv.create_cq(context, &cq_init_attr); + CHECK_NNULL(bnxt_rcqs[i].cq, "bnxt_re_dv_create_cq (RCQ)"); } } @@ -140,8 +177,8 @@ void GDABackend::bnxt_create_qps(int sq_length) { for (int i = 0; i < qps.size(); i++) { /* IB QP Init Attr */ memset(&ib_qp_attr, 0, sizeof(struct ibv_qp_init_attr)); - ib_qp_attr.send_cq = cqs[i]; - ib_qp_attr.recv_cq = cqs[i]; + ib_qp_attr.send_cq = bnxt_scqs[i].cq; + ib_qp_attr.recv_cq = bnxt_rcqs[i].cq; ib_qp_attr.cap.max_send_wr = sq_length; ib_qp_attr.cap.max_recv_wr = 0; ib_qp_attr.cap.max_send_sge = 1; diff --git a/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp b/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp index 631e42d21c..b50d868291 100644 --- a/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp +++ b/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp @@ -30,11 +30,7 @@ extern "C" { #include "gda/bnxt/bnxt_re_hsi.h" } -#define bnxt_re_get_cqe_sz() (sizeof(struct bnxt_re_req_cqe) + \ - sizeof(struct bnxt_re_bcqe)) - -#define bnxt_re_is_cqe_valid(valid, phase) \ - (((valid) & BNXT_RE_BCQE_PH_MASK) == (phase)) +#define GDA_BNXT_WQE_SLOT_COUNT 3 struct bnxt_device_wq { void *buf; @@ -50,12 +46,10 @@ struct bnxt_device_wq { } __attribute__((packed)); struct bnxt_device_cq : public bnxt_device_wq { - uint32_t phase; } __attribute__((packed)); struct bnxt_device_sq : public bnxt_device_wq { uint32_t psn; - volatile uint32_t posted; void *msntbl; uint32_t msn; @@ -70,6 +64,7 @@ struct bnxt_host_cq { void *umem_handle; uint64_t length; uint32_t depth; + struct ibv_cq *cq; } __attribute__((packed)); struct bnxt_host_qp { diff --git a/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp b/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp index 47ca08f7b7..148d5be462 100644 --- a/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp +++ b/projects/rocshmem/src/gda/bnxt/queue_pair_bnxt.cpp @@ -27,36 +27,6 @@ namespace rocshmem { -static const __device__ char bnxt_re_wc_error_strings[12][14] = { - "OK", - "BAD_RESP", - "LOC_LEN", - "LOC_QP_OP", - "PROT", - "MEM_OP", - "REM_INVAL", - "REM_ACC", - "REM_OP", - "RNR_NAK_XCED", - "TRNSP_XCED", - "WR_FLUSH", -}; - -__device__ static inline void bnxt_re_init_db_hdr(struct bnxt_re_db_hdr *hdr, - uint32_t indx, uint32_t toggle, - uint32_t qid, uint32_t typ) { - uint64_t key_lo; - uint64_t key_hi; - - key_lo = (indx | toggle); - - key_hi = (qid & BNXT_RE_DB_QID_MASK) - | ((typ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT) - | (0x1UL << BNXT_RE_DB_VALID_SHIFT); - - hdr->typ_qid_indx = (key_lo | (key_hi << 32)); -} - __device__ static inline struct bnxt_re_msns* bnxt_re_pull_psn_buff(struct bnxt_device_sq *sq) { return (struct bnxt_re_msns*)(((char *) sq->msntbl) + ((sq->msn) << sq->psn_sz_log2)); } @@ -126,23 +96,6 @@ __device__ static inline void* bnxt_re_get_hwqe(struct bnxt_device_sq *sq, uint3 return (void *)((char*)sq->buf + (idx << 4)); } -__device__ static inline void bnxt_re_incr_head(struct bnxt_device_cq *cq, uint8_t cnt) -{ - cq->head += cnt; - if (cq->head >= cq->depth) { - cq->head %= cq->depth; - /* Rolled over, Toggle HEAD bit in epoch flags */ - cq->flags ^= 1UL << BNXT_RE_FLAG_EPOCH_HEAD_SHIFT; - } -} - -__device__ static inline void bnxt_re_change_cq_phase(struct bnxt_device_cq *cq) -{ - if (!cq->head) { - cq->phase = !(cq->phase & BNXT_RE_BCQE_PH_MASK); - } -} - __device__ static inline void aquire_lock(uint32_t *lock) { uint32_t expected; @@ -158,68 +111,94 @@ __device__ static inline void release_lock(uint32_t *lock) { *lock = 0; } -__device__ void QueuePair::ring_cq_doorbell(uint32_t slot_idx) { - struct bnxt_re_db_hdr hdr; - uint32_t epoch; - - epoch = (cq.flags & BNXT_RE_FLAG_EPOCH_HEAD_MASK) << BNXT_RE_DB_EPOCH_HEAD_SHIFT; - - bnxt_re_init_db_hdr(&hdr, (slot_idx | epoch), 0, cq.flags, BNXT_RE_QUE_TYPE_CQ); - - __threadfence_system(); - __hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); -} - -__device__ void QueuePair::ring_sq_doorbell(uint32_t slot_idx) { +__device__ void QueuePair::bnxt_ring_doorbell(uint32_t slot_idx) { struct bnxt_re_db_hdr hdr; uint32_t epoch; + uint64_t key_lo; + uint64_t key_hi; epoch = (sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT; - bnxt_re_init_db_hdr(&hdr, (slot_idx | epoch), 0, sq.id, BNXT_RE_QUE_TYPE_SQ); + key_lo = (slot_idx | epoch); + + key_hi = (sq.id & BNXT_RE_DB_QID_MASK) + | (((uint64_t) BNXT_RE_QUE_TYPE_SQ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT) + | (0x1UL << BNXT_RE_DB_VALID_SHIFT); + + hdr.typ_qid_indx = (key_lo | (key_hi << 32)); __threadfence_system(); __hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); } -__device__ int QueuePair::poll_cq() { +__device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) { struct bnxt_re_bcqe *hdr; - void *cqe; uint32_t flg_val; - int type; uint8_t status; - cqe = (void*) ((char*) cq.buf + (cq.head * bnxt_re_get_cqe_sz())); + const char bnxt_re_wc_error_strings[12][14] = { + "OK", + "BAD_RESP", + "LOC_LEN", + "LOC_QP_OP", + "PROT", + "MEM_OP", + "REM_INVAL", + "REM_ACC", + "REM_OP", + "RNR_NAK_XCED", + "TRNSP_XCED", + "WR_FLUSH", + }; + hdr = (struct bnxt_re_bcqe*) ((char*)cqe + sizeof(struct bnxt_re_req_cqe)); flg_val = hdr->flg_st_typ_ph; - __threadfence_system(); + __threadfence(); - if (bnxt_re_is_cqe_valid(flg_val, cq.phase)) { - // Is the CQE valid? - status = (flg_val >> BNXT_RE_BCQE_STATUS_SHIFT) - & BNXT_RE_BCQE_STATUS_MASK; + // Is the CQE valid? + status = (flg_val >> BNXT_RE_BCQE_STATUS_SHIFT) + & BNXT_RE_BCQE_STATUS_MASK; - if (status != BNXT_RE_REQ_ST_OK) { - printf("CQ Error %s (%x)\n", bnxt_re_wc_error_strings[status], status); - abort(); - return -1; - } - - /* Update the CQ Ptr */ - bnxt_re_incr_head(&cq, 1); - bnxt_re_change_cq_phase(&cq); - - /* Ring Doorbell */ - ring_cq_doorbell(cq.head); - - __hip_atomic_fetch_sub(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); - - return 1; + if (status != BNXT_RE_REQ_ST_OK) { + printf("CQ Error %s (%x)\n", bnxt_re_wc_error_strings[status], status); + abort(); } +} - return 0; +__device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) { + struct bnxt_re_req_cqe *cqe; + uint32_t sq_tail; + uint32_t sq_head; + uint32_t sq_depth; + uint32_t consumed_slots; + uint32_t available_slots; + + sq_depth = sq.depth; + + aquire_lock(&cq.lock); + + do { + cqe = (struct bnxt_re_req_cqe *) cq.buf; + +#ifdef DEBUG + bnxt_check_cqe_error(cqe); +#endif + + /* Update the SQ head + * This param provides us the wqe_idx but we need to convert to the slot idx. + * We assume a static slots size of GDA_BNXT_WQE_SLOT_COUNT thus can multipy by this value */ + sq_head = (((cqe->con_indx & 0xFFFF) * GDA_BNXT_WQE_SLOT_COUNT) % sq_depth); + sq.head = sq_head; + + sq_tail = __hip_atomic_load(&sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); + + consumed_slots = (sq_tail - sq_head + sq_depth) % sq_depth; + available_slots = sq_depth - consumed_slots; + } while (available_slots < requested_available_slots); + + release_lock(&cq.lock); } __device__ void QueuePair::bnxt_quiet() { @@ -230,11 +209,7 @@ __device__ void QueuePair::bnxt_quiet() { active_lane_id = get_active_lane_num(active_lane_mask); if (0 == active_lane_id) { - aquire_lock(&cq.lock); - while (__hip_atomic_load(&sq.posted, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT)) { - poll_cq(); - } - release_lock(&cq.lock); + poll_cq_until(sq.depth); } } @@ -264,18 +239,18 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t * uint32_t hdr_flags; uint32_t inline_msg; - uint32_t rma_slots = 3; // (Three slots: hdr, rdma, sge) - inline_msg = length <= inline_threshold && opcode == gda_op_rdma_write; + poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT); + hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0); rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1); sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2); /* Populate Header Segment */ wqe_type = BNXT_RE_HDR_WT_MASK & opcode; - wqe_size = BNXT_RE_HDR_WS_MASK & rma_slots; + wqe_size = BNXT_RE_HDR_WS_MASK & GDA_BNXT_WQE_SLOT_COUNT; hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK) & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED); @@ -314,16 +289,11 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t * bnxt_re_fill_psns_for_msntbl(&sq, length); /* Update SQ Pointer */ - bnxt_re_incr_tail(&sq, rma_slots); + bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT); /* Ring Doorbell */ - ring_sq_doorbell(sq.tail); - - __hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); - + bnxt_ring_doorbell(sq.tail); } - __threadfence_system(); - quiet(); } if (0 == active_lane_id) { @@ -357,17 +327,18 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(int pe, int32_t length, uintptr uint32_t wqe_size; uint32_t wqe_type; uint32_t hdr_flags; - uint32_t amo_slots = 3; // (Three slots: hdr, amo, sge) + + poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT); hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0); amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&sq, 1); sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2); /* Populate Header Segment */ - wqe_size = BNXT_RE_HDR_WS_MASK & amo_slots; + wqe_size = BNXT_RE_HDR_WS_MASK & GDA_BNXT_WQE_SLOT_COUNT; hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK) & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED); - wqe_type = BNXT_RE_HDR_WT_MASK & opcode; + wqe_type = BNXT_RE_HDR_WT_MASK & opcode; hdr.rsv_ws_fl_wt = (wqe_size << BNXT_RE_HDR_WS_SHIFT) | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT) @@ -399,15 +370,11 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(int pe, int32_t length, uintptr bnxt_re_fill_psns_for_msntbl(&sq, length); /* Update SQ Pointer */ - bnxt_re_incr_tail(&sq, amo_slots); + bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT); /* Ring Doorbell */ - ring_sq_doorbell(sq.tail); - - __hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); + bnxt_ring_doorbell(sq.tail); } - __threadfence_system(); - quiet(); } if (0 == active_lane_id) { @@ -415,6 +382,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(int pe, int32_t length, uintptr } if (fetching) { + quiet(); return fetching_atomic[atomic_idx]; } diff --git a/projects/rocshmem/src/gda/queue_pair.hpp b/projects/rocshmem/src/gda/queue_pair.hpp index fd74db41a4..8d348bbcb2 100644 --- a/projects/rocshmem/src/gda/queue_pair.hpp +++ b/projects/rocshmem/src/gda/queue_pair.hpp @@ -185,8 +185,7 @@ class QueuePair { __device__ void mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter); #endif #if defined(GDA_BNXT) - __device__ void ring_sq_doorbell(uint32_t slot_idx); - __device__ void ring_cq_doorbell(uint32_t slot_idx); + __device__ void bnxt_ring_doorbell(uint32_t slot_idx); #endif #if defined(GDA_IONIC) __device__ void ionic_ring_doorbell(uint32_t pos); @@ -199,7 +198,8 @@ class QueuePair { struct bnxt_device_cq cq; struct bnxt_device_sq sq; - __device__ int poll_cq(); + __device__ void poll_cq_until(uint32_t requested_available_slots); + __device__ void bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe); /* GDAProvider::BNXT END */