[GDA/BNXT] Implemented CQE Collapsing (#279)
[ROCm/rocshmem commit: 6290db319c]
This commit is contained in:
@@ -626,13 +626,20 @@ void GDABackend::cleanup_ibv() {
|
||||
CHECK_HIP(hipFree(bnxt_qps[i].sq_buf));
|
||||
CHECK_HIP(hipFree(bnxt_qps[i].rq_buf));
|
||||
|
||||
err = bnxt_re_dv.destroy_cq(cqs[i]);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_destroy_cq");
|
||||
err = bnxt_re_dv.destroy_cq(bnxt_scqs[i].cq);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_destroy_cq (SCQ)");
|
||||
|
||||
err = bnxt_re_dv.umem_dereg(bnxt_cqs[i].umem_handle);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg");
|
||||
err = bnxt_re_dv.destroy_cq(bnxt_rcqs[i].cq);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_destroy_cq (RCQ)");
|
||||
|
||||
CHECK_HIP(hipFree(bnxt_cqs[i].buf));
|
||||
err = bnxt_re_dv.umem_dereg(bnxt_scqs[i].umem_handle);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (SCQ)");
|
||||
|
||||
err = bnxt_re_dv.umem_dereg(bnxt_rcqs[i].umem_handle);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (RCQ)");
|
||||
|
||||
CHECK_HIP(hipFree(bnxt_scqs[i].buf));
|
||||
CHECK_HIP(hipFree(bnxt_rcqs[i].buf));
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < qps.size(); i++) {
|
||||
@@ -847,6 +854,8 @@ void GDABackend::open_ib_device() {
|
||||
dump_ibv_context(context);
|
||||
dump_ibv_device(context->device);
|
||||
|
||||
validate_ib_device();
|
||||
|
||||
pd_orig = ibv_alloc_pd(context);
|
||||
CHECK_NNULL(pd_orig, "ib allocate pd");
|
||||
dump_ibv_pd(pd_orig);
|
||||
@@ -865,6 +874,35 @@ void GDABackend::open_ib_device() {
|
||||
ibv_free_device_list(device_list);
|
||||
}
|
||||
|
||||
void GDABackend::validate_ib_device() {
|
||||
int err;
|
||||
|
||||
err = ibv_query_device(context, &device_attr);
|
||||
CHECK_ZERO(err, "ibv_query_device");
|
||||
|
||||
if (gda_provider == GDAProvider::BNXT) {
|
||||
const uint32_t bnxt_vendor_id = 0x14E4;
|
||||
const std::set<uint32_t> supported_bnxt_part_ids = { 0x1760 /* BCM57608 */};
|
||||
const char min_supported_bnxt_fw_ver[12] = "233.2.104.0";
|
||||
|
||||
|
||||
if (bnxt_vendor_id != device_attr.vendor_id) {
|
||||
printf("GDAProvider::BNXT requested but an invalid device is selected\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
if (supported_bnxt_part_ids.find(device_attr.vendor_part_id) == supported_bnxt_part_ids.end()) {
|
||||
printf("Unsupported Broadcom Part: %x\n", device_attr.vendor_part_id);
|
||||
abort();
|
||||
}
|
||||
|
||||
if (strverscmp(min_supported_bnxt_fw_ver, device_attr.fw_ver) > 0) {
|
||||
printf("Unsupported firmware version: %s\n", device_attr.fw_ver);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GDABackend::modify_qps_reset_to_init() {
|
||||
int err;
|
||||
struct ibv_qp_attr attr;
|
||||
@@ -999,7 +1037,8 @@ void GDABackend::create_queues() {
|
||||
cqs.resize(resize_length);
|
||||
qps.resize(resize_length);
|
||||
|
||||
bnxt_cqs.resize(resize_length);
|
||||
bnxt_scqs.resize(resize_length);
|
||||
bnxt_rcqs.resize(resize_length);
|
||||
bnxt_qps.resize(resize_length);
|
||||
|
||||
if (gda_provider == GDAProvider::BNXT) {
|
||||
@@ -1052,10 +1091,11 @@ void GDABackend::alternate_qp_ports() {
|
||||
|
||||
if (new_qp_idx < qps.size()) {
|
||||
// Swap QPs
|
||||
std::swap(cqs[cur_qp_idx], cqs[new_qp_idx]);
|
||||
std::swap(qps[cur_qp_idx], qps[new_qp_idx]);
|
||||
std::swap(bnxt_cqs[cur_qp_idx], bnxt_cqs[new_qp_idx]);
|
||||
std::swap(bnxt_qps[cur_qp_idx], bnxt_qps[new_qp_idx]);
|
||||
std::swap(cqs[cur_qp_idx], cqs[new_qp_idx]);
|
||||
std::swap(qps[cur_qp_idx], qps[new_qp_idx]);
|
||||
std::swap(bnxt_scqs[cur_qp_idx], bnxt_scqs[new_qp_idx]);
|
||||
std::swap(bnxt_rcqs[cur_qp_idx], bnxt_rcqs[new_qp_idx]);
|
||||
std::swap(bnxt_qps[cur_qp_idx], bnxt_qps[new_qp_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,6 +65,7 @@ class GDABackend : public Backend {
|
||||
|
||||
const char *requested_dev = nullptr;
|
||||
struct ibv_context *context = nullptr;;
|
||||
struct ibv_device_attr device_attr;
|
||||
struct ibv_pd *pd_orig = nullptr;
|
||||
enum GDAProvider gda_provider = GDAProvider::UNSET;
|
||||
|
||||
@@ -85,7 +86,8 @@ class GDABackend : public Backend {
|
||||
|
||||
/* GDA_BNXT START */
|
||||
std::vector<struct bnxt_host_qp> bnxt_qps;
|
||||
std::vector<struct bnxt_host_cq> bnxt_cqs;
|
||||
std::vector<struct bnxt_host_cq> bnxt_scqs;
|
||||
std::vector<struct bnxt_host_cq> bnxt_rcqs;
|
||||
|
||||
struct bnxt_re_dv_db_region_attr db_region_attr;
|
||||
/* GDA_BNXT END */
|
||||
@@ -332,6 +334,11 @@ class GDABackend : public Backend {
|
||||
*/
|
||||
void open_ib_device();
|
||||
|
||||
/**
|
||||
* @brief Validated the rocSHMEM will run with the currently open InfiniBand Device
|
||||
*/
|
||||
void validate_ib_device();
|
||||
|
||||
/**
|
||||
* @brief Selects the best GID index
|
||||
*/
|
||||
|
||||
@@ -37,19 +37,18 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
|
||||
|
||||
ib_qp = qps[conn_num];
|
||||
|
||||
/* Export CQ */
|
||||
/* Export SCQ */
|
||||
memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj));
|
||||
dv_obj.cq.in = cqs[conn_num];
|
||||
dv_obj.cq.in = bnxt_scqs[conn_num].cq;
|
||||
dv_obj.cq.out = &dv_cq;
|
||||
|
||||
err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
|
||||
CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)");
|
||||
|
||||
memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq));
|
||||
gpu_qp->cq.buf = bnxt_cqs[conn_num].buf;
|
||||
gpu_qp->cq.depth = bnxt_cqs[conn_num].depth;
|
||||
gpu_qp->cq.buf = bnxt_scqs[conn_num].buf;
|
||||
gpu_qp->cq.depth = bnxt_scqs[conn_num].depth;
|
||||
gpu_qp->cq.id = dv_cq.cqn;
|
||||
gpu_qp->cq.phase = BNXT_RE_QUEUE_START_PHASE;
|
||||
|
||||
/* Export QP */
|
||||
memset(&dv_obj, 0, sizeof(struct bnxt_re_dv_obj));
|
||||
@@ -95,34 +94,72 @@ void GDABackend::bnxt_create_cqs(int cqe) {
|
||||
struct bnxt_re_dv_cq_init_attr cq_init_attr;
|
||||
struct bnxt_re_dv_umem_reg_attr umem_attr;
|
||||
|
||||
/* Ignore value of cqe as we only need of length 1 to use CQE compression */
|
||||
cqe = 1;
|
||||
|
||||
/* Create SCQs */
|
||||
for (int i = 0; i < qps.size(); i++) {
|
||||
/* Allocate CQ mem */
|
||||
/* Allocate SCQ mem */
|
||||
memset(&cq_attr, 0, sizeof(struct bnxt_re_dv_cq_attr));
|
||||
bnxt_cqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr);
|
||||
CHECK_NNULL(bnxt_cqs[i].handle, "bnxt_re_dv_cq_mem_alloc");
|
||||
bnxt_scqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr);
|
||||
CHECK_NNULL(bnxt_scqs[i].handle, "bnxt_re_dv_cq_mem_alloc (SCQ)");
|
||||
|
||||
/* Allocate CQ UMEM */
|
||||
bnxt_cqs[i].length = cq_attr.ncqe * cq_attr.cqe_size;
|
||||
bnxt_cqs[i].depth = cq_attr.ncqe;
|
||||
CHECK_HIP(hipExtMallocWithFlags(&bnxt_cqs[i].buf, bnxt_cqs[i].length, hipDeviceMallocUncached));
|
||||
/* We must force this to a value of 1 to use CQE Compression */
|
||||
cq_attr.ncqe = cqe;
|
||||
|
||||
/* Register CQ UMEM */
|
||||
/* Allocate SCQ UMEM */
|
||||
bnxt_scqs[i].length = cq_attr.ncqe * cq_attr.cqe_size;
|
||||
bnxt_scqs[i].depth = cq_attr.ncqe;
|
||||
CHECK_HIP(hipExtMallocWithFlags(&bnxt_scqs[i].buf, bnxt_scqs[i].length, hipDeviceMallocUncached));
|
||||
|
||||
/* Register SCQ UMEM */
|
||||
memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
|
||||
umem_attr.addr = bnxt_cqs[i].buf;
|
||||
umem_attr.size = bnxt_cqs[i].length;
|
||||
umem_attr.addr = bnxt_scqs[i].buf;
|
||||
umem_attr.size = bnxt_scqs[i].length;
|
||||
umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
|
||||
|
||||
bnxt_cqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
|
||||
CHECK_NNULL(bnxt_cqs[i].umem_handle, "bnxt_re_dv_umem_reg(cq_buf)");
|
||||
bnxt_scqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
|
||||
CHECK_NNULL(bnxt_scqs[i].umem_handle, "bnxt_re_dv_umem_reg(scq_buf)");
|
||||
|
||||
/* Create CQ */
|
||||
/* Create SCQ */
|
||||
memset(&cq_init_attr, 0, sizeof(struct bnxt_re_dv_cq_init_attr));
|
||||
cq_init_attr.cq_handle = (uint64_t) bnxt_cqs[i].handle;
|
||||
cq_init_attr.umem_handle = bnxt_cqs[i].umem_handle;
|
||||
cq_init_attr.cq_handle = (uint64_t) bnxt_scqs[i].handle;
|
||||
cq_init_attr.umem_handle = bnxt_scqs[i].umem_handle;
|
||||
cq_init_attr.ncqe = cq_attr.ncqe;
|
||||
|
||||
cqs[i] = bnxt_re_dv.create_cq(context, &cq_init_attr);
|
||||
CHECK_NNULL(cqs[i], "bnxt_re_dv_create_cq");
|
||||
bnxt_scqs[i].cq = bnxt_re_dv.create_cq(context, &cq_init_attr);
|
||||
CHECK_NNULL(bnxt_scqs[i].cq, "bnxt_re_dv_create_cq (SCQ) ");
|
||||
}
|
||||
|
||||
/* Create RCQs */
|
||||
for (int i = 0; i < qps.size(); i++) {
|
||||
/* Allocate RCQ mem */
|
||||
memset(&cq_attr, 0, sizeof(struct bnxt_re_dv_cq_attr));
|
||||
bnxt_rcqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr);
|
||||
CHECK_NNULL(bnxt_rcqs[i].handle, "bnxt_re_dv_cq_mem_alloc (RCQ)");
|
||||
|
||||
/* Allocate RCQ UMEM */
|
||||
bnxt_rcqs[i].length = cq_attr.ncqe * cq_attr.cqe_size;
|
||||
bnxt_rcqs[i].depth = cq_attr.ncqe;
|
||||
CHECK_HIP(hipExtMallocWithFlags(&bnxt_rcqs[i].buf, bnxt_rcqs[i].length, hipDeviceMallocUncached));
|
||||
|
||||
/* Register RCQ UMEM */
|
||||
memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
|
||||
umem_attr.addr = bnxt_rcqs[i].buf;
|
||||
umem_attr.size = bnxt_rcqs[i].length;
|
||||
umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
|
||||
|
||||
bnxt_rcqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
|
||||
CHECK_NNULL(bnxt_rcqs[i].umem_handle, "bnxt_re_dv_umem_reg(rcq_buf)");
|
||||
|
||||
/* Create RCQ */
|
||||
memset(&cq_init_attr, 0, sizeof(struct bnxt_re_dv_cq_init_attr));
|
||||
cq_init_attr.cq_handle = (uint64_t) bnxt_rcqs[i].handle;
|
||||
cq_init_attr.umem_handle = bnxt_rcqs[i].umem_handle;
|
||||
cq_init_attr.ncqe = cq_attr.ncqe;
|
||||
|
||||
bnxt_rcqs[i].cq = bnxt_re_dv.create_cq(context, &cq_init_attr);
|
||||
CHECK_NNULL(bnxt_rcqs[i].cq, "bnxt_re_dv_create_cq (RCQ)");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,8 +177,8 @@ void GDABackend::bnxt_create_qps(int sq_length) {
|
||||
for (int i = 0; i < qps.size(); i++) {
|
||||
/* IB QP Init Attr */
|
||||
memset(&ib_qp_attr, 0, sizeof(struct ibv_qp_init_attr));
|
||||
ib_qp_attr.send_cq = cqs[i];
|
||||
ib_qp_attr.recv_cq = cqs[i];
|
||||
ib_qp_attr.send_cq = bnxt_scqs[i].cq;
|
||||
ib_qp_attr.recv_cq = bnxt_rcqs[i].cq;
|
||||
ib_qp_attr.cap.max_send_wr = sq_length;
|
||||
ib_qp_attr.cap.max_recv_wr = 0;
|
||||
ib_qp_attr.cap.max_send_sge = 1;
|
||||
|
||||
@@ -30,11 +30,7 @@ extern "C" {
|
||||
#include "gda/bnxt/bnxt_re_hsi.h"
|
||||
}
|
||||
|
||||
#define bnxt_re_get_cqe_sz() (sizeof(struct bnxt_re_req_cqe) + \
|
||||
sizeof(struct bnxt_re_bcqe))
|
||||
|
||||
#define bnxt_re_is_cqe_valid(valid, phase) \
|
||||
(((valid) & BNXT_RE_BCQE_PH_MASK) == (phase))
|
||||
#define GDA_BNXT_WQE_SLOT_COUNT 3
|
||||
|
||||
struct bnxt_device_wq {
|
||||
void *buf;
|
||||
@@ -50,12 +46,10 @@ struct bnxt_device_wq {
|
||||
} __attribute__((packed));
|
||||
|
||||
struct bnxt_device_cq : public bnxt_device_wq {
|
||||
uint32_t phase;
|
||||
} __attribute__((packed));
|
||||
|
||||
struct bnxt_device_sq : public bnxt_device_wq {
|
||||
uint32_t psn;
|
||||
volatile uint32_t posted;
|
||||
|
||||
void *msntbl;
|
||||
uint32_t msn;
|
||||
@@ -70,6 +64,7 @@ struct bnxt_host_cq {
|
||||
void *umem_handle;
|
||||
uint64_t length;
|
||||
uint32_t depth;
|
||||
struct ibv_cq *cq;
|
||||
} __attribute__((packed));
|
||||
|
||||
struct bnxt_host_qp {
|
||||
|
||||
@@ -27,36 +27,6 @@
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
static const __device__ char bnxt_re_wc_error_strings[12][14] = {
|
||||
"OK",
|
||||
"BAD_RESP",
|
||||
"LOC_LEN",
|
||||
"LOC_QP_OP",
|
||||
"PROT",
|
||||
"MEM_OP",
|
||||
"REM_INVAL",
|
||||
"REM_ACC",
|
||||
"REM_OP",
|
||||
"RNR_NAK_XCED",
|
||||
"TRNSP_XCED",
|
||||
"WR_FLUSH",
|
||||
};
|
||||
|
||||
__device__ static inline void bnxt_re_init_db_hdr(struct bnxt_re_db_hdr *hdr,
|
||||
uint32_t indx, uint32_t toggle,
|
||||
uint32_t qid, uint32_t typ) {
|
||||
uint64_t key_lo;
|
||||
uint64_t key_hi;
|
||||
|
||||
key_lo = (indx | toggle);
|
||||
|
||||
key_hi = (qid & BNXT_RE_DB_QID_MASK)
|
||||
| ((typ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT)
|
||||
| (0x1UL << BNXT_RE_DB_VALID_SHIFT);
|
||||
|
||||
hdr->typ_qid_indx = (key_lo | (key_hi << 32));
|
||||
}
|
||||
|
||||
__device__ static inline struct bnxt_re_msns* bnxt_re_pull_psn_buff(struct bnxt_device_sq *sq) {
|
||||
return (struct bnxt_re_msns*)(((char *) sq->msntbl) + ((sq->msn) << sq->psn_sz_log2));
|
||||
}
|
||||
@@ -126,23 +96,6 @@ __device__ static inline void* bnxt_re_get_hwqe(struct bnxt_device_sq *sq, uint3
|
||||
return (void *)((char*)sq->buf + (idx << 4));
|
||||
}
|
||||
|
||||
__device__ static inline void bnxt_re_incr_head(struct bnxt_device_cq *cq, uint8_t cnt)
|
||||
{
|
||||
cq->head += cnt;
|
||||
if (cq->head >= cq->depth) {
|
||||
cq->head %= cq->depth;
|
||||
/* Rolled over, Toggle HEAD bit in epoch flags */
|
||||
cq->flags ^= 1UL << BNXT_RE_FLAG_EPOCH_HEAD_SHIFT;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ static inline void bnxt_re_change_cq_phase(struct bnxt_device_cq *cq)
|
||||
{
|
||||
if (!cq->head) {
|
||||
cq->phase = !(cq->phase & BNXT_RE_BCQE_PH_MASK);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ static inline void aquire_lock(uint32_t *lock) {
|
||||
uint32_t expected;
|
||||
|
||||
@@ -158,68 +111,94 @@ __device__ static inline void release_lock(uint32_t *lock) {
|
||||
*lock = 0;
|
||||
}
|
||||
|
||||
__device__ void QueuePair::ring_cq_doorbell(uint32_t slot_idx) {
|
||||
struct bnxt_re_db_hdr hdr;
|
||||
uint32_t epoch;
|
||||
|
||||
epoch = (cq.flags & BNXT_RE_FLAG_EPOCH_HEAD_MASK) << BNXT_RE_DB_EPOCH_HEAD_SHIFT;
|
||||
|
||||
bnxt_re_init_db_hdr(&hdr, (slot_idx | epoch), 0, cq.flags, BNXT_RE_QUE_TYPE_CQ);
|
||||
|
||||
__threadfence_system();
|
||||
__hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::ring_sq_doorbell(uint32_t slot_idx) {
|
||||
__device__ void QueuePair::bnxt_ring_doorbell(uint32_t slot_idx) {
|
||||
struct bnxt_re_db_hdr hdr;
|
||||
uint32_t epoch;
|
||||
uint64_t key_lo;
|
||||
uint64_t key_hi;
|
||||
|
||||
epoch = (sq.flags & BNXT_RE_FLAG_EPOCH_TAIL_MASK) << BNXT_RE_DB_EPOCH_TAIL_SHIFT;
|
||||
|
||||
bnxt_re_init_db_hdr(&hdr, (slot_idx | epoch), 0, sq.id, BNXT_RE_QUE_TYPE_SQ);
|
||||
key_lo = (slot_idx | epoch);
|
||||
|
||||
key_hi = (sq.id & BNXT_RE_DB_QID_MASK)
|
||||
| (((uint64_t) BNXT_RE_QUE_TYPE_SQ & BNXT_RE_DB_TYP_MASK) << BNXT_RE_DB_TYP_SHIFT)
|
||||
| (0x1UL << BNXT_RE_DB_VALID_SHIFT);
|
||||
|
||||
hdr.typ_qid_indx = (key_lo | (key_hi << 32));
|
||||
|
||||
__threadfence_system();
|
||||
__hip_atomic_store(dbr, hdr.typ_qid_indx, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__ int QueuePair::poll_cq() {
|
||||
__device__ void QueuePair::bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe) {
|
||||
struct bnxt_re_bcqe *hdr;
|
||||
void *cqe;
|
||||
uint32_t flg_val;
|
||||
int type;
|
||||
uint8_t status;
|
||||
|
||||
cqe = (void*) ((char*) cq.buf + (cq.head * bnxt_re_get_cqe_sz()));
|
||||
const char bnxt_re_wc_error_strings[12][14] = {
|
||||
"OK",
|
||||
"BAD_RESP",
|
||||
"LOC_LEN",
|
||||
"LOC_QP_OP",
|
||||
"PROT",
|
||||
"MEM_OP",
|
||||
"REM_INVAL",
|
||||
"REM_ACC",
|
||||
"REM_OP",
|
||||
"RNR_NAK_XCED",
|
||||
"TRNSP_XCED",
|
||||
"WR_FLUSH",
|
||||
};
|
||||
|
||||
hdr = (struct bnxt_re_bcqe*) ((char*)cqe + sizeof(struct bnxt_re_req_cqe));
|
||||
|
||||
flg_val = hdr->flg_st_typ_ph;
|
||||
|
||||
__threadfence_system();
|
||||
__threadfence();
|
||||
|
||||
if (bnxt_re_is_cqe_valid(flg_val, cq.phase)) {
|
||||
// Is the CQE valid?
|
||||
status = (flg_val >> BNXT_RE_BCQE_STATUS_SHIFT)
|
||||
& BNXT_RE_BCQE_STATUS_MASK;
|
||||
// Is the CQE valid?
|
||||
status = (flg_val >> BNXT_RE_BCQE_STATUS_SHIFT)
|
||||
& BNXT_RE_BCQE_STATUS_MASK;
|
||||
|
||||
if (status != BNXT_RE_REQ_ST_OK) {
|
||||
printf("CQ Error %s (%x)\n", bnxt_re_wc_error_strings[status], status);
|
||||
abort();
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Update the CQ Ptr */
|
||||
bnxt_re_incr_head(&cq, 1);
|
||||
bnxt_re_change_cq_phase(&cq);
|
||||
|
||||
/* Ring Doorbell */
|
||||
ring_cq_doorbell(cq.head);
|
||||
|
||||
__hip_atomic_fetch_sub(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
|
||||
|
||||
return 1;
|
||||
if (status != BNXT_RE_REQ_ST_OK) {
|
||||
printf("CQ Error %s (%x)\n", bnxt_re_wc_error_strings[status], status);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
__device__ void QueuePair::poll_cq_until(uint32_t requested_available_slots) {
|
||||
struct bnxt_re_req_cqe *cqe;
|
||||
uint32_t sq_tail;
|
||||
uint32_t sq_head;
|
||||
uint32_t sq_depth;
|
||||
uint32_t consumed_slots;
|
||||
uint32_t available_slots;
|
||||
|
||||
sq_depth = sq.depth;
|
||||
|
||||
aquire_lock(&cq.lock);
|
||||
|
||||
do {
|
||||
cqe = (struct bnxt_re_req_cqe *) cq.buf;
|
||||
|
||||
#ifdef DEBUG
|
||||
bnxt_check_cqe_error(cqe);
|
||||
#endif
|
||||
|
||||
/* Update the SQ head
|
||||
* This param provides us the wqe_idx but we need to convert to the slot idx.
|
||||
* We assume a static slots size of GDA_BNXT_WQE_SLOT_COUNT thus can multipy by this value */
|
||||
sq_head = (((cqe->con_indx & 0xFFFF) * GDA_BNXT_WQE_SLOT_COUNT) % sq_depth);
|
||||
sq.head = sq_head;
|
||||
|
||||
sq_tail = __hip_atomic_load(&sq.tail, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
|
||||
|
||||
consumed_slots = (sq_tail - sq_head + sq_depth) % sq_depth;
|
||||
available_slots = sq_depth - consumed_slots;
|
||||
} while (available_slots < requested_available_slots);
|
||||
|
||||
release_lock(&cq.lock);
|
||||
}
|
||||
|
||||
__device__ void QueuePair::bnxt_quiet() {
|
||||
@@ -230,11 +209,7 @@ __device__ void QueuePair::bnxt_quiet() {
|
||||
active_lane_id = get_active_lane_num(active_lane_mask);
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
aquire_lock(&cq.lock);
|
||||
while (__hip_atomic_load(&sq.posted, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT)) {
|
||||
poll_cq();
|
||||
}
|
||||
release_lock(&cq.lock);
|
||||
poll_cq_until(sq.depth);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -264,18 +239,18 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *
|
||||
uint32_t hdr_flags;
|
||||
uint32_t inline_msg;
|
||||
|
||||
uint32_t rma_slots = 3; // (Three slots: hdr, rdma, sge)
|
||||
|
||||
inline_msg = length <= inline_threshold &&
|
||||
opcode == gda_op_rdma_write;
|
||||
|
||||
poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
|
||||
|
||||
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
|
||||
rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1);
|
||||
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2);
|
||||
|
||||
/* Populate Header Segment */
|
||||
wqe_type = BNXT_RE_HDR_WT_MASK & opcode;
|
||||
wqe_size = BNXT_RE_HDR_WS_MASK & rma_slots;
|
||||
wqe_size = BNXT_RE_HDR_WS_MASK & GDA_BNXT_WQE_SLOT_COUNT;
|
||||
hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK)
|
||||
& ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED);
|
||||
|
||||
@@ -314,16 +289,11 @@ __device__ void QueuePair::bnxt_post_wqe_rma(int pe, int32_t length, uintptr_t *
|
||||
bnxt_re_fill_psns_for_msntbl(&sq, length);
|
||||
|
||||
/* Update SQ Pointer */
|
||||
bnxt_re_incr_tail(&sq, rma_slots);
|
||||
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
|
||||
|
||||
/* Ring Doorbell */
|
||||
ring_sq_doorbell(sq.tail);
|
||||
|
||||
__hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
|
||||
|
||||
bnxt_ring_doorbell(sq.tail);
|
||||
}
|
||||
__threadfence_system();
|
||||
quiet();
|
||||
}
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
@@ -357,17 +327,18 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(int pe, int32_t length, uintptr
|
||||
uint32_t wqe_size;
|
||||
uint32_t wqe_type;
|
||||
uint32_t hdr_flags;
|
||||
uint32_t amo_slots = 3; // (Three slots: hdr, amo, sge)
|
||||
|
||||
poll_cq_until(GDA_BNXT_WQE_SLOT_COUNT);
|
||||
|
||||
hdr_ptr = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
|
||||
amo_ptr = (struct bnxt_re_atomic*) bnxt_re_get_hwqe(&sq, 1);
|
||||
sge_ptr = (struct bnxt_re_sge*) bnxt_re_get_hwqe(&sq, 2);
|
||||
|
||||
/* Populate Header Segment */
|
||||
wqe_size = BNXT_RE_HDR_WS_MASK & amo_slots;
|
||||
wqe_size = BNXT_RE_HDR_WS_MASK & GDA_BNXT_WQE_SLOT_COUNT;
|
||||
hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK)
|
||||
& ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED);
|
||||
wqe_type = BNXT_RE_HDR_WT_MASK & opcode;
|
||||
wqe_type = BNXT_RE_HDR_WT_MASK & opcode;
|
||||
|
||||
hdr.rsv_ws_fl_wt = (wqe_size << BNXT_RE_HDR_WS_SHIFT)
|
||||
| (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT)
|
||||
@@ -399,15 +370,11 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(int pe, int32_t length, uintptr
|
||||
bnxt_re_fill_psns_for_msntbl(&sq, length);
|
||||
|
||||
/* Update SQ Pointer */
|
||||
bnxt_re_incr_tail(&sq, amo_slots);
|
||||
bnxt_re_incr_tail(&sq, GDA_BNXT_WQE_SLOT_COUNT);
|
||||
|
||||
/* Ring Doorbell */
|
||||
ring_sq_doorbell(sq.tail);
|
||||
|
||||
__hip_atomic_fetch_add(&sq.posted, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bnxt_ring_doorbell(sq.tail);
|
||||
}
|
||||
__threadfence_system();
|
||||
quiet();
|
||||
}
|
||||
|
||||
if (0 == active_lane_id) {
|
||||
@@ -415,6 +382,7 @@ __device__ uint64_t QueuePair::bnxt_post_wqe_amo(int pe, int32_t length, uintptr
|
||||
}
|
||||
|
||||
if (fetching) {
|
||||
quiet();
|
||||
return fetching_atomic[atomic_idx];
|
||||
}
|
||||
|
||||
|
||||
@@ -185,8 +185,7 @@ class QueuePair {
|
||||
__device__ void mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter);
|
||||
#endif
|
||||
#if defined(GDA_BNXT)
|
||||
__device__ void ring_sq_doorbell(uint32_t slot_idx);
|
||||
__device__ void ring_cq_doorbell(uint32_t slot_idx);
|
||||
__device__ void bnxt_ring_doorbell(uint32_t slot_idx);
|
||||
#endif
|
||||
#if defined(GDA_IONIC)
|
||||
__device__ void ionic_ring_doorbell(uint32_t pos);
|
||||
@@ -199,7 +198,8 @@ class QueuePair {
|
||||
struct bnxt_device_cq cq;
|
||||
struct bnxt_device_sq sq;
|
||||
|
||||
__device__ int poll_cq();
|
||||
__device__ void poll_cq_until(uint32_t requested_available_slots);
|
||||
__device__ void bnxt_check_cqe_error(struct bnxt_re_req_cqe *cqe);
|
||||
|
||||
/* GDAProvider::BNXT END */
|
||||
|
||||
|
||||
Referens i nytt ärende
Block a user