[GDA/BNXT] Remove doorbell arbitration (#363)

[ROCm/rocshmem commit: fe1a28e409]
Tento commit je obsažen v:
Yiltan
2025-12-15 09:23:01 -05:00
odevzdal GitHub
rodič f9fd5d3cdd
revize 5e2ba952f3
5 změnil soubory, kde provedl 29 přidání a 51 odebrání
+5 -2
Zobrazit soubor
@@ -715,12 +715,15 @@ void GDABackend::cleanup_ibv() {
int err;
if (gda_provider == GDAProvider::BNXT) {
CHECK_HIP(hipHostUnregister(db_region_attr.dbr));
for (int i = 0; i < qps.size(); i++) {
err = bnxt_re_dv.destroy_qp(qps[i]);
CHECK_ZERO(err, "bnxt_re_dv_destroy_qp");
CHECK_HIP(hipHostUnregister(bnxt_qps[i].db_region_attr->dbr));
err = bnxt_re_dv.free_db_region(context, bnxt_qps[i].db_region_attr);
CHECK_ZERO(err, "bnxt_re_dv_free_db_region");
err = bnxt_re_dv.umem_dereg(bnxt_qps[i].attr.rq_umem_handle);
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (RQ)");
-2
Zobrazit soubor
@@ -96,8 +96,6 @@ class GDABackend : public Backend {
std::vector<struct bnxt_host_qp> bnxt_qps;
std::vector<struct bnxt_host_cq> bnxt_scqs;
std::vector<struct bnxt_host_cq> bnxt_rcqs;
struct bnxt_re_dv_db_region_attr db_region_attr;
/* GDA_BNXT END */
/* GDA_IONIC & GDA_MLX5 START */
+19 -16
Zobrazit soubor
@@ -75,11 +75,8 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
gpu_qp->sq.mtu = ibv_mtu_to_int(portinfo.active_mtu);
/* Export DB */
err = bnxt_re_dv.get_default_db_region(context, &db_region_attr);
CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");
CHECK_HIP(hipHostRegister(db_region_attr.dbr, getpagesize(), hipHostRegisterDefault));
CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->dbr, db_region_attr.dbr, 0));
CHECK_HIP(hipHostRegister(bnxt_qps[conn_num].db_region_attr->dbr, getpagesize(), hipHostRegisterDefault));
CHECK_HIP(hipHostGetDevicePointer((void**) &gpu_qp->dbr, bnxt_qps[conn_num].db_region_attr->dbr, 0));
/* Export Memory Keys */
gpu_qp->lkey = heap_mr->lkey;
@@ -229,6 +226,10 @@ void GDABackend::bnxt_create_qps(int sq_length) {
rq_umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
CHECK_NNULL(rq_umem_handle, "bnxt_re_dv_umem_reg(rq)");
/* Alloc DPI */
bnxt_qps[i].db_region_attr = bnxt_re_dv.alloc_db_region(context);
CHECK_NNULL(bnxt_qps[i].db_region_attr, "bnxt_re_dv_alloc_db_region");
/* IB DV QP Init Attr */
memset(&bnxt_qps[i].attr, 0, sizeof(struct bnxt_re_dv_qp_init_attr));
bnxt_qps[i].attr.send_cq = ib_qp_attr.send_cq;
@@ -240,19 +241,20 @@ void GDABackend::bnxt_create_qps(int sq_length) {
bnxt_qps[i].attr.max_inline_data = ib_qp_attr.cap.max_inline_data;
bnxt_qps[i].attr.qp_type = ib_qp_attr.qp_type;
bnxt_qps[i].attr.qp_handle = bnxt_qps[i].mem_info.qp_handle;
bnxt_qps[i].attr.qp_handle = bnxt_qps[i].mem_info.qp_handle;
bnxt_qps[i].attr.dbr_handle = bnxt_qps[i].db_region_attr;
bnxt_qps[i].attr.sq_umem_handle = sq_umem_handle;
bnxt_qps[i].attr.sq_len = bnxt_qps[i].mem_info.sq_len;
bnxt_qps[i].attr.sq_slots = bnxt_qps[i].mem_info.sq_slots;
bnxt_qps[i].attr.sq_wqe_sz = bnxt_qps[i].mem_info.sq_wqe_sz;
bnxt_qps[i].attr.sq_psn_sz = bnxt_qps[i].mem_info.sq_psn_sz;
bnxt_qps[i].attr.sq_npsn = bnxt_qps[i].mem_info.sq_npsn;
bnxt_qps[i].attr.sq_len = bnxt_qps[i].mem_info.sq_len;
bnxt_qps[i].attr.sq_slots = bnxt_qps[i].mem_info.sq_slots;
bnxt_qps[i].attr.sq_wqe_sz = bnxt_qps[i].mem_info.sq_wqe_sz;
bnxt_qps[i].attr.sq_psn_sz = bnxt_qps[i].mem_info.sq_psn_sz;
bnxt_qps[i].attr.sq_npsn = bnxt_qps[i].mem_info.sq_npsn;
bnxt_qps[i].attr.rq_umem_handle = rq_umem_handle;
bnxt_qps[i].attr.rq_len = bnxt_qps[i].mem_info.rq_len;
bnxt_qps[i].attr.rq_slots = bnxt_qps[i].mem_info.rq_slots;
bnxt_qps[i].attr.rq_wqe_sz = bnxt_qps[i].mem_info.rq_wqe_sz;
bnxt_qps[i].attr.comp_mask = bnxt_qps[i].mem_info.comp_mask;
bnxt_qps[i].attr.rq_len = bnxt_qps[i].mem_info.rq_len;
bnxt_qps[i].attr.rq_slots = bnxt_qps[i].mem_info.rq_slots;
bnxt_qps[i].attr.rq_wqe_sz = bnxt_qps[i].mem_info.rq_wqe_sz;
bnxt_qps[i].attr.comp_mask = bnxt_qps[i].mem_info.comp_mask;
/* Alloc QP */
qps[i] = bnxt_re_dv.create_qp(pd_orig, &bnxt_qps[i].attr);
@@ -288,7 +290,8 @@ int GDABackend::bnxt_dv_dl_init() {
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, cq_mem_alloc);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_reg);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_dereg);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, get_default_db_region);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, alloc_db_region);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, free_db_region);
return ROCSHMEM_SUCCESS;
}
+3 -2
Zobrazit soubor
@@ -70,6 +70,7 @@ struct bnxt_host_cq {
struct bnxt_host_qp {
struct bnxt_re_dv_qp_mem_info mem_info;
struct bnxt_re_dv_qp_init_attr attr;
struct bnxt_re_dv_db_region_attr *db_region_attr;
void *sq_buf;
void *rq_buf;
void *msntbl;
@@ -96,8 +97,8 @@ struct bnxtdv_funcs_t {
void* (*umem_reg)(struct ibv_context *ibvctx,
struct bnxt_re_dv_umem_reg_attr *in);
int (*umem_dereg)(void *umem_handle);
int (*get_default_db_region)(struct ibv_context *ibvctx,
struct bnxt_re_dv_db_region_attr *out);
struct bnxt_re_dv_db_region_attr * (*alloc_db_region)(struct ibv_context *ctx);
int (*free_db_region)(struct ibv_context *ctx, struct bnxt_re_dv_db_region_attr *attr);
};
#endif //LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
+2 -29
Zobrazit soubor
@@ -320,21 +320,7 @@ __device__ void QueuePair::bnxt_post_wqe_rma_single(int32_t length, uintptr_t *l
bnxt_write_rma_wqe(raddr, laddr, length, opcode);
if (ring_db) {
uint64_t active_lane_mask;
uint8_t active_lane_count;
uint8_t active_lane_id;
active_lane_mask = get_active_lane_mask();
active_lane_count = get_active_lane_count(active_lane_mask);
active_lane_id = get_active_lane_num(active_lane_mask);
/* Ring Doorbell
* Doorbell ring must be serialized as we cannot have all threads write to the same address */
for (int i = 0; i < active_lane_count; i++) {
if (i == active_lane_id) {
bnxt_ring_doorbell(sq.tail);
}
}
bnxt_ring_doorbell(sq.tail);
}
release_lock(&sq.lock);
@@ -443,27 +429,14 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(uintptr_t *raddr, uint8_t opcod
__device__ uint64_t QueuePair::bnxt_post_wqe_amo_single(uintptr_t *raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp,
bool fetching) {
uint64_t active_lane_mask;
uint8_t active_lane_count;
uint8_t active_lane_id;
uint32_t atomic_idx = 0;
active_lane_mask = get_active_lane_mask();
active_lane_count = get_active_lane_count(active_lane_mask);
active_lane_id = get_active_lane_num(active_lane_mask);
aquire_lock(&sq.lock);
/* Write WQE to SQ */
atomic_idx = bnxt_write_amo_wqe(raddr, opcode, atomic_data, atomic_cmp, fetching);
/* Ring Doorbell
* Doorbell ring must be serialized as we cannot have all threads write to the same address */
for (int i = 0; i < active_lane_count; i++) {
if (i == active_lane_id) {
bnxt_ring_doorbell(sq.tail);
}
}
bnxt_ring_doorbell(sq.tail);
if (fetching) {
quiet();