* Split ionic code to a subdirectory; dyld libionicl; move the fntable to provider_gda_xxx.hpp
pass the pattr to ionic_setup_pd, include endian.hpp
Enable building IONIC conduit for runtime selection

* Uniform style for the fntable between ionic and the rest

* Move mlx5 gda conduit to a subdir; resolve conflict with backend_can_run
function

* Don't forget to init qp for ionic, move mlx5 specialized init qp code to
the mlx5 subdir

* Don't add cmakecaches...

Typo: GDA_BXNT

* Add gda-ionic to all_backends build scripts

* Apply suggestion from reviews

Co-authored-by: Omri Mor <omri50@gmail.com>
Co-authored-by: Edgar Gabriel <edgar.gabriel@amd.com>

* Remove duplicate definitiion of DLSYM macros

---------

Co-authored-by: Omri Mor <omri50@gmail.com>
Co-authored-by: Edgar Gabriel <edgar.gabriel@amd.com>

[ROCm/rocshmem commit: 3cfe76522e]
Этот коммит содержится в:
Aurelien Bouteiller
2025-10-16 15:53:01 -04:00
коммит произвёл GitHub
родитель 6c4325d131
Коммит bb8406b013
21 изменённых файлов: 1950 добавлений и 876 удалений
+1 -28
Просмотреть файл
@@ -25,7 +25,6 @@
find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
if (IBVerbs_ROOT )
# We don't use IBVerbs_DIR as this is supposed to be used when finding hwloc-config.cmake only
set(ENV{PKG_CONFIG_PATH} "${IBVerbs_ROOT}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
endif()
pkg_check_modules(PC_IBVerbs QUIET libibverbs)
@@ -42,32 +41,11 @@ find_library(IBVerbs_LIBRARY
PATH_SUFFIXES lib lib64
)
if (GDA_IONIC)
list(APPEND provider_vars IBVerbs_IONIC_LIBRARY IBVerbs_IONIC_INCLUDE_DIR)
find_path(IBVerbs_IONIC_INCLUDE_DIR infiniband/ionic_dv.h
HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS}
PATH_SUFFIXES include
)
find_library(IBVerbs_IONIC_LIBRARY
NAMES ionic libionic
HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
PATH_SUFFIXES lib lib64
)
add_library(IBVerbs::verbs_ionic UNKNOWN IMPORTED)
set_target_properties(IBVerbs::verbs_ionic PROPERTIES
IMPORTED_LOCATION "${IBVerbs_IONIC_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_IONIC_INCLUDE_DIR}"
)
endif()
find_package_handle_standard_args(IBVerbs DEFAULT_MSG
IBVerbs_LIBRARY
IBVerbs_INCLUDE_DIR
${provider_vars}
)
mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR ${provider_vars})
mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR)
if (IBVerbs_FOUND)
add_library(IBVerbs::verbs UNKNOWN IMPORTED)
@@ -76,9 +54,4 @@ set_target_properties(IBVerbs::verbs PROPERTIES
INTERFACE_COMPILE_OPTIONS "${PC_IBVerbs_CFLAGS_OTHER}"
INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}"
)
target_link_libraries(IBVerbs::verbs INTERFACE
$<TARGET_NAME_IF_EXISTS:IBVerbs::verbs_ionic>
)
endif()
+1 -1
Просмотреть файл
@@ -40,7 +40,7 @@ cmake \
-DUSE_GDA=ON \
-DGDA_MLX5=ON \
-DGDA_BNXT=ON \
-DGDA_IONIC=OFF \
-DGDA_IONIC=ON \
-DUSE_RO=ON \
-DUSE_IPC=ON \
-DUSE_THREADS=OFF \
+2 -1
Просмотреть файл
@@ -35,7 +35,6 @@ target_sources(
gda_team.cpp
queue_pair.cpp
endian.cpp
segment_builder.cpp
topology.cpp
)
@@ -48,4 +47,6 @@ target_link_libraries(
numa
)
add_subdirectory(mlx5)
add_subdirectory(bnxt)
add_subdirectory(ionic)
+76 -178
Просмотреть файл
@@ -122,11 +122,7 @@ GDABackend::~GDABackend() {
cleanup_heap_memory_rkey();
cleanup_ibv();
if (bnxtdv_handle_ != nullptr)
dlclose(bnxtdv_handle_);
if (mlx5dv_handle_ != nullptr)
dlclose(mlx5dv_handle_);
close_dv_libs();
}
void GDABackend::read_env() {
@@ -532,52 +528,45 @@ void GDABackend::rte_barrier() {
}
}
int GDABackend::mlx5_dv_dl_init () {
mlx5dv_handle_ = dlopen("libmlx5.so", RTLD_NOW);
if (!mlx5dv_handle_) {
DPRINTF("Could not open libmlx5.so. Returning\n");
return ROCSHMEM_ERROR;
}
DLSYM_HELPER(mlx5dv_ftable_, mlx5dv_, mlx5dv_handle_, init_obj);
return ROCSHMEM_SUCCESS;
}
/* Currently we only check whether we can dlopen a Direct Verbs library.
** We might need to extend this logic to check whether we have interfaces that
** can use those DV libraries
*/
* We might need to extend this logic to check whether we have interfaces that
* can use those DV libraries
*/
int GDABackend::backend_can_run() {
void *handle{nullptr};
/* Try opening bnxt DV libraries */
handle = dlopen("libbnxt_re.so", RTLD_NOW);
#if defined(GDA_BNXT)
handle = bnxt_dv_dlopen();
if (handle) {
dlclose(handle);
return ROCSHMEM_SUCCESS;
} else {
/* Try hard-coded PATH */
handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW);
if (handle) {
dlclose(handle);
return ROCSHMEM_SUCCESS;
}
}
#endif //defined(GDA_BNXT)
/* Try opening ionic DV libraries */
#if defined(GDA_IONIC)
handle = ionic_dv_dlopen();
if (handle) {
dlclose(handle);
return ROCSHMEM_SUCCESS;
}
#endif //defined(GDA_IONIC)
/* Try opening mlx5 DV libraries */
handle = dlopen("libmlx5.so", RTLD_NOW);
#if defined(GDA_MLX5)
handle = mlx5_dv_dlopen();
if (handle) {
dlclose(handle);
return ROCSHMEM_SUCCESS;
}
/* ToDo: opening ionic DV libraries */
#endif //defined(GDA_MLX5)
return ROCSHMEM_ERROR;
}
void GDABackend::setup_ibv() {
autodetect_dv_libs();
open_dv_libs();
open_ib_device();
@@ -599,22 +588,22 @@ void GDABackend::cleanup_ibv() {
CHECK_HIP(hipHostUnregister(db_region_attr.dbr));
for (int i = 0; i < qps.size(); i++) {
err = bnxtdv_ftable_.destroy_qp(qps[i]);
err = bnxt_re_dv.destroy_qp(qps[i]);
CHECK_ZERO(err, "bnxt_re_dv_destroy_qp");
err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.rq_umem_handle);
err = bnxt_re_dv.umem_dereg(bnxt_qps[i].attr.rq_umem_handle);
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (RQ)");
err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.sq_umem_handle);
err = bnxt_re_dv.umem_dereg(bnxt_qps[i].attr.sq_umem_handle);
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (SQ)");
CHECK_HIP(hipFree(bnxt_qps[i].sq_buf));
CHECK_HIP(hipFree(bnxt_qps[i].rq_buf));
err = bnxtdv_ftable_.destroy_cq(cqs[i]);
err = bnxt_re_dv.destroy_cq(cqs[i]);
CHECK_ZERO(err, "bnxt_re_dv_destroy_cq");
err = bnxtdv_ftable_.umem_dereg(bnxt_cqs[i].umem_handle);
err = bnxt_re_dv.umem_dereg(bnxt_cqs[i].umem_handle);
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg");
CHECK_HIP(hipFree(bnxt_cqs[i].buf));
@@ -647,13 +636,14 @@ void GDABackend::cleanup_ibv() {
CHECK_ZERO(err, "ibv_close_device");
}
void GDABackend::autodetect_dv_libs() {
void GDABackend::open_dv_libs() {
int ret;
#ifdef GDA_IONIC
gda_vendor = GDAVendor::IONIC;
#endif
//TODO: environment variable selection/deselection
//this hardcoded init order will always prefer BNXT>IONIC>MLX5
//if all three drivers are installed
#if defined(GDA_BNXT)
if (gda_vendor == GDAVendor::NONE) {
ret = bnxt_dv_dl_init();
@@ -663,7 +653,21 @@ void GDABackend::autodetect_dv_libs() {
DPRINTF("Initializing rocSHMEM BNXT GDA support failed\n");
}
}
#endif // defined(GDA_BNXT)
#if defined(GDA_IONIC)
if (gda_vendor == GDAVendor::NONE) {
ret = ionic_dv_dl_init();
if (ret == ROCSHMEM_SUCCESS) {
gda_vendor = GDAVendor::IONIC;
} else {
DPRINTF("Initializing rocSHMEM IONIC GDA support failed\n");
}
}
#endif // defined(GDA_IONIC)
#if defined(GDA_MLX5)
if (gda_vendor == GDAVendor::NONE) {
ret = mlx5_dv_dl_init();
@@ -673,13 +677,26 @@ void GDABackend::autodetect_dv_libs() {
DPRINTF("Initializing rocSHMEM MLX5 GDA support failed\n");
}
}
#endif // defined(GDA_MLX5)
if (gda_vendor == GDAVendor::NONE) {
printf("Initializing rocSHMEM with IONIC, BNXT, or MLX5 GDA support failed\n");
printf("Initializing rocSHMEM with IONIC, BNXT, or MLX5 GDA support failed: no DV library found\n");
abort();
}
}
void GDABackend::close_dv_libs() {
if (ionicdv_handle_ != nullptr)
dlclose(ionicdv_handle_);
if (bnxtdv_handle_ != nullptr)
dlclose(bnxtdv_handle_);
if (mlx5dv_handle_ != nullptr)
dlclose(mlx5dv_handle_);
gda_vendor = GDAVendor::NONE;
}
void GDABackend::exchange_qp_dest_info() {
for (int i = 0; i < qps.size(); i++) {
@@ -752,11 +769,7 @@ void GDABackend::setup_gpu_qps() {
new (&host_qps[i]) QueuePair(pd_orig, gda_vendor);
CHECK_HIP(hipMemcpy(&gpu_qps[i], &host_qps[i], sizeof(QueuePair), hipMemcpyDefault));
if (gda_vendor == GDAVendor::BNXT) {
bnxt_initialize_gpu_qp(&gpu_qps[i], i);
} else {
initialize_gpu_qp(&gpu_qps[i], i);
}
initialize_gpu_qp(&gpu_qps[i], i);
}
}
@@ -844,7 +857,7 @@ void GDABackend::modify_qps_reset_to_init() {
for (int i =0; i < qps.size() ; i++) {
if (gda_vendor == GDAVendor::BNXT) {
err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0);
} else {
err = ibv_modify_qp(qps[i], &attr, attr_mask);
}
@@ -896,7 +909,7 @@ void GDABackend::modify_qps_init_to_rtr() {
}
if (gda_vendor == GDAVendor::BNXT) {
err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0);
} else {
err = ibv_modify_qp(qps[i], &attr, attr_mask);
}
@@ -932,7 +945,7 @@ void GDABackend::modify_qps_rtr_to_rts() {
attr.sq_psn = dest_info[i].psn;
if (gda_vendor == GDAVendor::BNXT) {
err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0);
} else {
err = ibv_modify_qp(qps[i], &attr, attr_mask);
}
@@ -1057,19 +1070,9 @@ void GDABackend::create_parent_domain() {
CHECK_NNULL(pd_parent, "ibv_alloc_parent_domain");
dump_ibv_pd(pd_parent);
#ifdef GDA_IONIC
ionic_dv_pd_set_sqcmb(pd_parent, false, false, false);
ionic_dv_pd_set_rqcmb(pd_parent, false, false, false);
for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) {
pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(context, &pattr);
CHECK_NNULL(pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)");
ionic_dv_pd_set_sqcmb(pd_uxdma[uxdma_i], false, false, false);
ionic_dv_pd_set_rqcmb(pd_uxdma[uxdma_i], false, false, false);
ionic_dv_pd_set_udma_mask(pd_uxdma[uxdma_i], 1u << uxdma_i);
if (gda_vendor == GDAVendor::IONIC) {
ionic_setup_parent_domain(&pattr);
}
#endif /* GDA_IONIC */
}
void GDABackend::create_cqs(int cqe) {
@@ -1099,123 +1102,18 @@ void GDABackend::create_cqs(int cqe) {
}
void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
int hip_dev_id{-1};
CHECK_HIP(hipGetDevice(&hip_dev_id));
#ifdef GDA_IONIC
ionic_dv_ctx dvctx;
ionic_dv_get_ctx(&dvctx, context);
void* gpu_db_page = nullptr;
rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id);
uint64_t *db_page_u64 = reinterpret_cast<uint64_t*>(dvctx.db_page);
uint64_t *gpu_db_page_u64 = reinterpret_cast<uint64_t*>(gpu_db_page);
uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64];
gpu_db_page = gpu_db_page;
gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype];
gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype];
uint8_t udma_idx = ionic_dv_qp_get_udma_idx(qps[conn_num]);
ionic_dv_cq dvcq;
ionic_dv_get_cq(&dvcq, cqs[conn_num], udma_idx);
gpu_qp->cq_dbreg = gpu_db_cq;
gpu_qp->cq_dbval = dvcq.q.db_val;
gpu_qp->cq_mask = dvcq.q.mask;
gpu_qp->ionic_cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
ionic_dv_qp dvqp;
ionic_dv_get_qp(&dvqp, qps[conn_num]);
gpu_qp->sq_dbreg = gpu_db_sq;
gpu_qp->sq_dbval = dvqp.sq.db_val;
gpu_qp->sq_mask = dvqp.sq.mask;
gpu_qp->ionic_sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
strncpy(gpu_qp->dev_name,
qps[conn_num]->context->device->name,
sizeof(gpu_qp->dev_name));
gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0;
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->lkey = heap_mr->lkey;
gpu_qp->rkey = heap_rkey[conn_num % num_pes];
gpu_qp->inline_threshold = 32;
#endif /* GDA_IONIC */
if (gda_vendor == GDAVendor::MLX5) {
mlx5dv_cq cq_out;
mlx5dv_obj mlx_obj;
mlx_obj.cq.in = cqs[conn_num];
mlx_obj.cq.out = &cq_out;
mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
dump_mlx5dv_cq(&cq_out, conn_num);
/*
* struct mlx5dv_cq {
* void *buf;
* __be32 *dbrec;
* uint32_t cqe_cnt;
* uint32_t cqe_size;
* void *cq_uar;
* uint32_t cqn;
* uint64_t comp_mask;
* };
*/
gpu_qp->cq_buf = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
gpu_qp->cq_cnt = cq_out.cqe_cnt;
gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt);
gpu_qp->cq_dbrec = cq_out.dbrec;
mlx5dv_qp qp_out;
mlx_obj.qp.in = qps[conn_num];
mlx_obj.qp.out = &qp_out;
mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_QP);
dump_mlx5dv_qp(&qp_out, conn_num);
/*
* struct mlx5dv_qp {
* __be32 *dbrec;
* struct {
* void *buf;
* uint32_t wqe_cnt;
* uint32_t stride;
* } sq;
* struct {
* void *buf;
* uint32_t wqe_cnt;
* uint32_t stride;
* } rq;
* struct {
* void *reg;
* uint32_t size;
* } bf;
* uint64_t comp_mask;
* off_t uar_mmap_offset;
* uint32_t tirn;
* uint32_t tisn;
* uint32_t rqn;
* uint32_t sqn;
* uint64_t tir_icm_addr;
* };
*/
gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR
gpu_qp->sq_buf = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt;
gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
gpu_qp->lkey = htobe32(heap_mr->lkey);
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->inline_threshold = inline_threshold;
// The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
void* gpu_ptr{nullptr};
rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
switch (gda_vendor) {
case GDAVendor::IONIC:
ionic_initialize_gpu_qp(gpu_qp, conn_num);
break;
case GDAVendor::BNXT:
bnxt_initialize_gpu_qp(gpu_qp, conn_num);
break;
case GDAVendor::MLX5:
mlx5_initialize_gpu_qp(gpu_qp, conn_num);
break;
default:
assert(false /* GDAVendor initialize_gpu_qp */);
}
}
+43 -30
Просмотреть файл
@@ -40,32 +40,6 @@
#include "gda/bnxt/provider_gda_bnxt.hpp"
#include "gda/mlx5/provider_gda_mlx5.hpp"
struct bnxtdv_funcs_t {
int (*init_obj)(struct bnxt_re_dv_obj *obj, uint64_t obj_type);
struct ibv_qp* (*create_qp)(struct ibv_pd *pd,
struct bnxt_re_dv_qp_init_attr *qp_attr);
int (*destroy_qp)(struct ibv_qp *ibvqp);
int (*modify_qp)(struct ibv_qp *ibv_qp, struct ibv_qp_attr *attr,
int attr_mask, uint32_t type, uint32_t value);
int (*qp_mem_alloc)(struct ibv_pd *ibvpd,
struct ibv_qp_init_attr *attr,
struct bnxt_re_dv_qp_mem_info *dv_qp_mem);
struct ibv_cq* (*create_cq)(struct ibv_context *ibvctx,
struct bnxt_re_dv_cq_init_attr *cq_attr);
int (*destroy_cq)(struct ibv_cq *ibv_cq);
void* (*cq_mem_alloc)(struct ibv_context *ibvctx, int num_cqe,
struct bnxt_re_dv_cq_attr *cq_attr);
void* (*umem_reg)(struct ibv_context *ibvctx,
struct bnxt_re_dv_umem_reg_attr *in);
int (*umem_dereg)(void *umem_handle);
int (*get_default_db_region)(struct ibv_context *ibvctx,
struct bnxt_re_dv_db_region_attr *out);
};
struct mlx5dv_funcs_t {
int (*init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type);
};
namespace rocshmem {
class GDAContext;
@@ -325,6 +299,8 @@ class GDABackend : public Backend {
void initialize_gpu_qp(QueuePair* qp, int conn_num);
void bnxt_initialize_gpu_qp(QueuePair* qp, int conn_num);
void ionic_initialize_gpu_qp(QueuePair* qp, int conn_num);
void mlx5_initialize_gpu_qp(QueuePair* qp, int conn_num);
/**
* @brief Setup InfiniBand Resources
@@ -337,9 +313,14 @@ class GDABackend : public Backend {
void cleanup_ibv();
/**
* @brief Detect the available direct verbs libraries
* @brief Detect and load the available direct verbs libraries
*/
void autodetect_dv_libs();
void open_dv_libs();
/**
* @ brief Close opened direct verbs libraries
*/
void close_dv_libs();
/**
* @brief Open InfiniBand Device and create common structures
@@ -404,6 +385,7 @@ class GDABackend : public Backend {
static void pd_release(ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type);
void create_parent_domain();
void ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* pattr);
void setup_gpu_qps();
void cleanup_gpu_qps();
@@ -509,7 +491,7 @@ class GDABackend : public Backend {
* @brief structures holding the function pointers to the direct verbs functionality
* of each network driver.
*/
bnxtdv_funcs_t bnxtdv_ftable_;
bnxtdv_funcs_t bnxt_re_dv;
/**
* @brief handle used for the dlopen of the BCOM library
@@ -521,11 +503,16 @@ class GDABackend : public Backend {
*/
int bnxt_dv_dl_init();
/**
* @brief open bnxt dv lib
*/
static void* bnxt_dv_dlopen();
/**
* @brief structures holding the function pointers to the direct verbs functionality
* of each network driver.
*/
mlx5dv_funcs_t mlx5dv_ftable_;
mlx5dv_funcs_t mlx5dv;
/**
* @brief handle used for the dlopen of the MLX5 library
@@ -536,6 +523,32 @@ class GDABackend : public Backend {
* @brief initialize function table for MLNX direct verbs support
*/
int mlx5_dv_dl_init();
/**
* @brief open mlx5 dv lib
*/
static void* mlx5_dv_dlopen();
/**
* @brief structures holding the function pointers to the direct verbs functionality
* of each network driver.
*/
ionicdv_funcs_t ionic_dv;
/**
* @brief handle used for the dlopen of the IONIC library
*/
void *ionicdv_handle_{nullptr};
/**
* @brief initialize function table for IONIC direct verbs support
*/
int ionic_dv_dl_init();
/**
* @brief open ionic dv lib
*/
static void* ionic_dv_dlopen();
};
} // namespace rocshmem
+34 -27
Просмотреть файл
@@ -42,7 +42,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
dv_obj.cq.in = cqs[conn_num];
dv_obj.cq.out = &dv_cq;
err = bnxtdv_ftable_.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ);
CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)");
memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq));
@@ -56,7 +56,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
dv_obj.qp.in = ib_qp;
dv_obj.qp.out = &dv_qp;
err = bnxtdv_ftable_.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP);
err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP);
CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");
memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq));
@@ -76,7 +76,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
gpu_qp->sq.mtu = ibv_mtu_to_int(portinfo.active_mtu);
/* Export DB */
err = bnxtdv_ftable_.get_default_db_region(context, &db_region_attr);
err = bnxt_re_dv.get_default_db_region(context, &db_region_attr);
CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)");
CHECK_HIP(hipHostRegister(db_region_attr.dbr, getpagesize(), hipHostRegisterDefault));
@@ -98,7 +98,7 @@ void GDABackend::bnxt_create_cqs(int cqe) {
for (int i = 0; i < qps.size(); i++) {
/* Allocate CQ mem */
memset(&cq_attr, 0, sizeof(struct bnxt_re_dv_cq_attr));
bnxt_cqs[i].handle = bnxtdv_ftable_.cq_mem_alloc(context, cqe, &cq_attr);
bnxt_cqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr);
CHECK_NNULL(bnxt_cqs[i].handle, "bnxt_re_dv_cq_mem_alloc");
/* Allocate CQ UMEM */
@@ -112,7 +112,7 @@ void GDABackend::bnxt_create_cqs(int cqe) {
umem_attr.size = bnxt_cqs[i].length;
umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
bnxt_cqs[i].umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr);
bnxt_cqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
CHECK_NNULL(bnxt_cqs[i].umem_handle, "bnxt_re_dv_umem_reg(cq_buf)");
/* Create CQ */
@@ -121,7 +121,7 @@ void GDABackend::bnxt_create_cqs(int cqe) {
cq_init_attr.umem_handle = bnxt_cqs[i].umem_handle;
cq_init_attr.ncqe = cq_attr.ncqe;
cqs[i] = bnxtdv_ftable_.create_cq(context, &cq_init_attr);
cqs[i] = bnxt_re_dv.create_cq(context, &cq_init_attr);
CHECK_NNULL(cqs[i], "bnxt_re_dv_create_cq");
}
}
@@ -152,7 +152,7 @@ void GDABackend::bnxt_create_qps(int sq_length) {
/* Alloc qp_mem_info */
memset(&bnxt_qps[i].mem_info, 0, sizeof(struct bnxt_re_dv_qp_mem_info));
err = bnxtdv_ftable_.qp_mem_alloc(pd_orig, &ib_qp_attr, &bnxt_qps[i].mem_info);
err = bnxt_re_dv.qp_mem_alloc(pd_orig, &ib_qp_attr, &bnxt_qps[i].mem_info);
CHECK_ZERO(err, "bnxt_re_dv_qp_mem_alloc");
/* Alloc SQ */
@@ -177,7 +177,7 @@ void GDABackend::bnxt_create_qps(int sq_length) {
umem_attr.size = bnxt_qps[i].mem_info.sq_len;
umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
sq_umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr);
sq_umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
CHECK_NNULL(sq_umem_handle, "bnxt_re_dv_umem_reg(sq)");
memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr));
@@ -185,7 +185,7 @@ void GDABackend::bnxt_create_qps(int sq_length) {
umem_attr.size = bnxt_qps[i].mem_info.rq_len;
umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE;
rq_umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr);
rq_umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr);
CHECK_NNULL(rq_umem_handle, "bnxt_re_dv_umem_reg(rq)");
/* IB DV QP Init Attr */
@@ -214,33 +214,40 @@ void GDABackend::bnxt_create_qps(int sq_length) {
bnxt_qps[i].attr.comp_mask = bnxt_qps[i].mem_info.comp_mask;
/* Alloc QP */
qps[i] = bnxtdv_ftable_.create_qp(pd_orig, &bnxt_qps[i].attr);
qps[i] = bnxt_re_dv.create_qp(pd_orig, &bnxt_qps[i].attr);
CHECK_NNULL(qps[i], "bnxt_re_dv_create_qp");
}
}
int GDABackend::bnxt_dv_dl_init() {
bnxtdv_handle_ = dlopen("libbnxt_re.so", RTLD_NOW);
if (!bnxtdv_handle_) {
void* GDABackend::bnxt_dv_dlopen() {
void* dv_handle{nullptr};
dv_handle = dlopen("libbnxt_re.so", RTLD_NOW);
if (!dv_handle) {
// Try hard-coded PATH
bnxtdv_handle_ = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW);
if (!bnxtdv_handle_) {
dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW);
if (!dv_handle) {
DPRINTF("Could not open libbnxt_re.so. Returning\n");
return ROCSHMEM_ERROR;
}
}
return dv_handle;
}
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, init_obj);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, create_qp);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, destroy_qp);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, modify_qp);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, qp_mem_alloc);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, create_cq);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, destroy_cq);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, cq_mem_alloc);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, umem_reg);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, umem_dereg);
DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, get_default_db_region);
int GDABackend::bnxt_dv_dl_init() {
bnxtdv_handle_ = bnxt_dv_dlopen();
if (!bnxtdv_handle_)
return ROCSHMEM_ERROR;
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, init_obj);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, create_qp);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, destroy_qp);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, modify_qp);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, qp_mem_alloc);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, create_cq);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, destroy_cq);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, cq_mem_alloc);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_reg);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_dereg);
DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, get_default_db_region);
return ROCSHMEM_SUCCESS;
}
+22
Просмотреть файл
@@ -83,4 +83,26 @@ struct bnxt_host_qp {
/*****************************************************************************/
struct bnxtdv_funcs_t {
int (*init_obj)(struct bnxt_re_dv_obj *obj, uint64_t obj_type);
struct ibv_qp* (*create_qp)(struct ibv_pd *pd,
struct bnxt_re_dv_qp_init_attr *qp_attr);
int (*destroy_qp)(struct ibv_qp *ibvqp);
int (*modify_qp)(struct ibv_qp *ibv_qp, struct ibv_qp_attr *attr,
int attr_mask, uint32_t type, uint32_t value);
int (*qp_mem_alloc)(struct ibv_pd *ibvpd,
struct ibv_qp_init_attr *attr,
struct bnxt_re_dv_qp_mem_info *dv_qp_mem);
struct ibv_cq* (*create_cq)(struct ibv_context *ibvctx,
struct bnxt_re_dv_cq_init_attr *cq_attr);
int (*destroy_cq)(struct ibv_cq *ibv_cq);
void* (*cq_mem_alloc)(struct ibv_context *ibvctx, int num_cqe,
struct bnxt_re_dv_cq_attr *cq_attr);
void* (*umem_reg)(struct ibv_context *ibvctx,
struct bnxt_re_dv_umem_reg_attr *in);
int (*umem_dereg)(void *umem_handle);
int (*get_default_db_region)(struct ibv_context *ibvctx,
struct bnxt_re_dv_db_region_attr *out);
};
#endif //LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_
+36
Просмотреть файл
@@ -0,0 +1,36 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
target_sources(
${PROJECT_NAME}
PRIVATE
backend_gda_ionic.cpp
)
if(GDA_IONIC)
target_sources(
${PROJECT_NAME}
PRIVATE
queue_pair_ionic.cpp
)
endif()
+122
Просмотреть файл
@@ -0,0 +1,122 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "gda/backend_gda.hpp"
#include "util.hpp"
namespace rocshmem {
void GDABackend::ionic_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
ionic_dv_ctx dvctx;
ionic_dv.get_ctx(&dvctx, context);
int hip_dev_id{-1};
CHECK_HIP(hipGetDevice(&hip_dev_id));
void* gpu_db_page = nullptr;
rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id);
uint64_t *db_page_u64 = reinterpret_cast<uint64_t*>(dvctx.db_page);
uint64_t *gpu_db_page_u64 = reinterpret_cast<uint64_t*>(gpu_db_page);
uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64];
gpu_db_page = gpu_db_page;
gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype];
gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype];
uint8_t udma_idx = ionic_dv.qp_get_udma_idx(qps[conn_num]);
ionic_dv_cq dvcq;
ionic_dv.get_cq(&dvcq, cqs[conn_num], udma_idx);
gpu_qp->cq_dbreg = gpu_db_cq;
gpu_qp->cq_dbval = dvcq.q.db_val;
gpu_qp->cq_mask = dvcq.q.mask;
gpu_qp->ionic_cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
ionic_dv_qp dvqp;
ionic_dv.get_qp(&dvqp, qps[conn_num]);
gpu_qp->sq_dbreg = gpu_db_sq;
gpu_qp->sq_dbval = dvqp.sq.db_val;
gpu_qp->sq_mask = dvqp.sq.mask;
gpu_qp->ionic_sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
strncpy(gpu_qp->dev_name,
qps[conn_num]->context->device->name,
sizeof(gpu_qp->dev_name));
gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0;
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->lkey = heap_mr->lkey;
gpu_qp->rkey = heap_rkey[conn_num % num_pes];
gpu_qp->inline_threshold = 32;
}
void GDABackend::ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* pattr) {
ionic_dv.pd_set_sqcmb(pd_parent, false, false, false);
ionic_dv.pd_set_rqcmb(pd_parent, false, false, false);
for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) {
pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(context, pattr);
CHECK_NNULL(pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)");
ionic_dv.pd_set_sqcmb(pd_uxdma[uxdma_i], false, false, false);
ionic_dv.pd_set_rqcmb(pd_uxdma[uxdma_i], false, false, false);
ionic_dv.pd_set_udma_mask(pd_uxdma[uxdma_i], 1u << uxdma_i);
}
}
void* GDABackend::ionic_dv_dlopen() {
void* dv_handle{nullptr};
dv_handle = dlopen("libionic.so", RTLD_NOW);
if (!dv_handle) {
// Try hard-coded PATH
dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_NOW);
if (!dv_handle) {
DPRINTF("Could not open libionic.so. Returning\n");
}
}
return dv_handle;
}
int GDABackend::ionic_dv_dl_init() {
ionicdv_handle_ = ionic_dv_dlopen();
if (!ionicdv_handle_)
return ROCSHMEM_ERROR;
DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_ctx);
DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, qp_get_udma_idx);
DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_cq);
DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_qp);
DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_sqcmb);
DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_rqcmb);
DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_udma_mask);
return ROCSHMEM_SUCCESS;
}
} // namespace rocshmem
+245
Просмотреть файл
@@ -0,0 +1,245 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/*
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef IONIC_DV_H
#define IONIC_DV_H
#include <stdbool.h>
#include <infiniband/verbs.h>
struct ibv_cq;
struct ibv_qp;
/** IONIC_PD_TAG - tag used for parent domain resource allocation. */
#define IONIC_PD_TAG ((uint64_t)RDMA_DRIVER_IONIC << 32)
#define IONIC_PD_TAG_CQ (IONIC_PD_TAG | 1)
#define IONIC_PD_TAG_SQ (IONIC_PD_TAG | 2)
#define IONIC_PD_TAG_RQ (IONIC_PD_TAG | 3)
#define IONIC_PD_TAG_RCQ (IONIC_PD_TAG | 4)
/* deprecated */
#define IONIC_SQ_SIG_ALL 1
/* deprecated */
#define IONIC_SQ_SIG_HACK_HIGH 2
/** IONIC_UDMA_MASK_LOW - flag represents the udma0 pipeline in the udma mask. */
#define IONIC_UDMA_MASK_LOW 1
/** IONIC_UDMA_MASK_HIGH - flag represents the udma1 pipeline in the udma mask. */
#define IONIC_UDMA_MASK_HIGH 2
#define IONIC_DV_PUEC_NPORTS_MAX 8
/** struct ionic_dv_ctx - Context information for gpu-initiated rdma. */
struct ionic_dv_ctx {
void *db_page;
uint64_t *db_ptr;
uint8_t sq_qtype;
uint8_t rq_qtype;
uint8_t cq_qtype;
};
/** struct ionic_dv_ctx - Queue information for gpu-initiated rdma. */
struct ionic_dv_queue {
void *ptr;
size_t size;
uint64_t db_val;
uint16_t mask;
uint8_t depth_log2;
uint8_t stride_log2;
};
/** struct ionic_dv_ctx - CQ information for gpu-initiated rdma. */
struct ionic_dv_cq {
struct ionic_dv_queue q;
};
/** struct ionic_dv_ctx - QP information for gpu-initiated rdma. */
struct ionic_dv_qp {
struct ionic_dv_queue rq;
struct ionic_dv_queue sq;
};
/** struct ionic_puec_route - Info needed to setup a PUEC plane route. */
struct ionic_dv_puec_route {
union ibv_gid dgid;
union ibv_gid sgid;
uint32_t flow_label;
uint8_t hop_limit;
uint8_t traffic_class;
uint8_t sl;
uint8_t rsvd[5];
uint32_t flags;
};
/**
* ionic_dv_is_ionic_ctx - Test if context belongs to ionic provider.
*/
bool ionic_dv_is_ionic_ctx(struct ibv_context *ibctx);
/**
* ionic_dv_is_ionic_pd - Test if pd belongs to ionic provider.
*/
bool ionic_dv_is_ionic_pd(struct ibv_pd *ibpd);
/**
* ionic_dv_is_ionic_cq - Test if cq belongs to ionic provider.
*/
bool ionic_dv_is_ionic_cq(struct ibv_cq *ibcq);
/**
* ionic_dv_is_ionic_qp - Test if qp belongs to ionic provider.
*/
bool ionic_dv_is_ionic_qp(struct ibv_qp *ibqp);
/**
* ionic_dv_ctx_get_udma_count - Get number of udma pipelines.
*/
uint8_t ionic_dv_ctx_get_udma_count(struct ibv_context *ibctx);
/**
* ionic_dv_ctx_get_udma_mask - Get mask of udma pipeline ids.
*/
uint8_t ionic_dv_ctx_get_udma_mask(struct ibv_context *ibctx);
/**
* ionic_dv_pd_get_udma_mask - Get mask of udma pipeline ids of pd or parent domain.
*/
uint8_t ionic_dv_pd_get_udma_mask(struct ibv_pd *ibpd);
/**
* ionic_dv_pd_set_udma_mask - Restrict pipeline ids of pd or parent domain.
*
* Queues associated with this pd will be restricted to one of the pipelines enabled by
* the mask at the time of queue creation.
*
* Recommended usage is to create a pd, then parent domains of that pd for each different
* udma mask. Set the desired udma mask on each parent domain. Then, create queues
* associated with the parent domain with the desired udma mask.
*
* Alternative usage is to create a pd, and set the desired udma mask prior to creating
* each queue. Changing the udma mask of the pd has no effect on previously created
* queues.
*/
int ionic_dv_pd_set_udma_mask(struct ibv_pd *ibpd, uint8_t udma_mask);
/**
* ionic_dv_cq_get_udma_mask - Get mask of udma pipeline ids of completion queueue.
*/
uint8_t ionic_dv_cq_get_udma_mask(struct ibv_cq *ibcq);
/**
* ionic_dv_qp_get_udma_idx - Get udma pipeline id of queueue pair.
*/
uint8_t ionic_dv_qp_get_udma_idx(struct ibv_qp *ibqp);
/**
* ionic_dv_pd_set_sqcmb - Specify send queue preference for controller memory bar.
*
* Send queues associated with this pd will use the controller memory bar according to
* this preference at the time of queue creation.
*
* @enable - Allow the use of the controller memory bar.
* @expdb - Allow the use of express doorbell optimizations.
* @require - Require preferences to be met, no fallback.
*/
int ionic_dv_pd_set_sqcmb(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
/**
* ionic_dv_pd_set_rqcmb - Specify receive queue preference for controller memory bar.
*
* See ionic_dv_pd_set_sqcmb().
*/
int ionic_dv_pd_set_rqcmb(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
/**
* ionic_dv_qp_set_gda - Enable or disable GPU-Direct Async (GDA) mode.
*
* In GDA mode, when the application calls ibv_post_send() or ibv_post_recv(), the
* provider writes WQEs in the descriptor ring without ringing the doorbell.
*
* To ring the doorbell, after posting the work the application should query to get the
* doorbell data, and later write that data to the memory mapped doorbell register.
*
* See also: ionic_dv_get_ctx()
* See also: ionic_dv_qp_get_send_dbell_data()
* See also: ionic_dv_qp_get_recv_dbell_data()
*
* @ibqp - Set GDA mode for this queue pair.
* @enable_send - Enable GDA mode for the send queue.
* @enable_recv - Enable GDA mode for the recv queue.
*/
int ionic_dv_qp_set_gda(struct ibv_qp *ibqp, bool enable_send, bool enable_recv);
/**
* ionic_dv_qp_get_send_dbell_data - Get send queue doorbell data.
*
* In GDA mode, when the application calls ibv_post_send() the provider writes WQEs in
* the descriptor ring without ringing the doorbell. The application should query the
* doorbell data immediately after posting the work. The application requests the
* GPU to fill the source buffers of the data transfer with the result of computation.
* The application requests the GPU to write the doorbell data to the memory mapped
* doorbell register immediately when the computation is complete, triggering the data
* transfer.
*
* It is important that the GPU ring the doorbell in sequential order. If work requests
* are posted in batches A, B, and C, with respective doorbell data, the data path must
* not write B or C before A, and must not write C before B. It is ok to skip writing a
* doorbell, like writing only C, which will trigger the data transfer for all of the
* work up to that point in the sequence.
*
* @ibqp - Get send doorbell data for this queue pair.
* @dbdata - Output parameter for doorbell data.
*/
int ionic_dv_qp_get_send_dbell_data(struct ibv_qp *ibqp, uint64_t *dbdata);
/**
* ionic_dv_qp_get_recv_dbell_data - Get recv queue doorbell data.
*
* In GDA mode, when the application calls ibv_post_recv() the provider writes WQEs in
* the descriptor ring without ringing the doorbell. After polling recv completions, the
* application can immediately re-post the receive buffers without ringing the doorbell.
* The application should query the doorbell data immediately after posting the buffers.
* The application requests the GPU consume the data from the receive buffers. The
* application requests the GPU to write the doorbell data to the memory mapped doorbell
* register immediately after the received data is consumed, making the buffers available
* for the next data transfer.
*
* It is important that the GPU ring the doorbell in sequential order. If work requests
* are posted in batches A, B, and C, with respective doorbell data, the data path must
* not write B or C before A, and must not write C before B. It is ok to skip writing a
* doorbell, like writing only C, which will make buffers available up to that point in
* the sequence.
*
* @ibqp - Get recv doorbell data for this queue pair.
* @dbdata - Output parameter for doorbell data.
*/
int ionic_dv_qp_get_recv_dbell_data(struct ibv_qp *ibqp, uint64_t *dbdata);
/**
* ionic_dv_get_ctx - Extract context information for gpu-initiated rdma.
*/
int ionic_dv_get_ctx(struct ionic_dv_ctx *dvctx, struct ibv_context *ibctx);
/**
* ionic_dv_get_cq - Extract cq information for gpu-initiated rdma.
*/
int ionic_dv_get_cq(struct ionic_dv_cq *dvcq, struct ibv_cq *ibcq, uint8_t udma_idx);
/**
* ionic_dv_get_qp - Extract qp information for gpu-initiated rdma.
*/
int ionic_dv_get_qp(struct ionic_dv_qp *dvqp, struct ibv_qp *ibqp);
/**
* ionic_dv_qp_set_puec_plane_route - set route info for a PUEC plane.
*/
int ionic_dv_qp_set_puec_plane_route(struct ibv_qp *ibqp, uint8_t plane_idx,
struct ionic_dv_puec_route *ipr);
#endif /* IONIC_DV_H */
+534
Просмотреть файл
@@ -0,0 +1,534 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/*
* Copyright (c) 2018-2022 Pensando Systems, Inc. All rights reserved.
* 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef IONIC_FW_H
#define IONIC_FW_H
#if !defined(__cplusplus)
#include <util/util.h>
#else
#ifndef BIT
#define BIT(n) (1u << (n))
#endif
#endif
#define IONIC_EXP_DBELL_SZ 8
/* common to all versions */
/* wqe scatter gather element */
struct ionic_sge {
__be64 va;
__be32 len;
__be32 lkey;
};
/* admin queue mr type */
enum ionic_mr_flags {
/* bits that determine mr access */
IONIC_MRF_LOCAL_WRITE = BIT(0),
IONIC_MRF_REMOTE_WRITE = BIT(1),
IONIC_MRF_REMOTE_READ = BIT(2),
IONIC_MRF_REMOTE_ATOMIC = BIT(3),
IONIC_MRF_MW_BIND = BIT(4),
IONIC_MRF_ZERO_BASED = BIT(5),
IONIC_MRF_ON_DEMAND = BIT(6),
IONIC_MRF_PB = BIT(7),
IONIC_MRF_ACCESS_MASK = BIT(12) - 1,
/* bits that determine mr type */
IONIC_MRF_IS_MW = BIT(14),
IONIC_MRF_INV_EN = BIT(15),
/* base flags combinations for mr types */
IONIC_MRF_USER_MR = 0,
IONIC_MRF_PHYS_MR = IONIC_MRF_INV_EN,
IONIC_MRF_MW_1 = IONIC_MRF_IS_MW,
IONIC_MRF_MW_2 = IONIC_MRF_IS_MW | IONIC_MRF_INV_EN,
};
static inline int to_ionic_mr_flags(int access)
{
int flags = 0;
if (access & IBV_ACCESS_LOCAL_WRITE)
flags |= IONIC_MRF_LOCAL_WRITE;
if (access & IBV_ACCESS_REMOTE_READ)
flags |= IONIC_MRF_REMOTE_READ;
if (access & IBV_ACCESS_REMOTE_WRITE)
flags |= IONIC_MRF_REMOTE_WRITE;
if (access & IBV_ACCESS_REMOTE_ATOMIC)
flags |= IONIC_MRF_REMOTE_ATOMIC;
if (access & IBV_ACCESS_MW_BIND)
flags |= IONIC_MRF_MW_BIND;
if (access & IBV_ACCESS_ZERO_BASED)
flags |= IONIC_MRF_ZERO_BASED;
return flags;
}
/* cqe status indicated in status_length field when err bit is set */
enum ionic_status {
IONIC_STS_OK,
IONIC_STS_LOCAL_LEN_ERR,
IONIC_STS_LOCAL_QP_OPER_ERR,
IONIC_STS_LOCAL_PROT_ERR,
IONIC_STS_WQE_FLUSHED_ERR,
IONIC_STS_MEM_MGMT_OPER_ERR,
IONIC_STS_BAD_RESP_ERR,
IONIC_STS_LOCAL_ACC_ERR,
IONIC_STS_REMOTE_INV_REQ_ERR,
IONIC_STS_REMOTE_ACC_ERR,
IONIC_STS_REMOTE_OPER_ERR,
IONIC_STS_RETRY_EXCEEDED,
IONIC_STS_RNR_RETRY_EXCEEDED,
IONIC_STS_XRC_VIO_ERR,
};
static inline int ionic_to_ibv_status(int sts)
{
switch (sts) {
case IONIC_STS_OK:
return IBV_WC_SUCCESS;
case IONIC_STS_LOCAL_LEN_ERR:
return IBV_WC_LOC_LEN_ERR;
case IONIC_STS_LOCAL_QP_OPER_ERR:
return IBV_WC_LOC_QP_OP_ERR;
case IONIC_STS_LOCAL_PROT_ERR:
return IBV_WC_LOC_PROT_ERR;
case IONIC_STS_WQE_FLUSHED_ERR:
return IBV_WC_WR_FLUSH_ERR;
case IONIC_STS_MEM_MGMT_OPER_ERR:
return IBV_WC_MW_BIND_ERR;
case IONIC_STS_BAD_RESP_ERR:
return IBV_WC_BAD_RESP_ERR;
case IONIC_STS_LOCAL_ACC_ERR:
return IBV_WC_LOC_ACCESS_ERR;
case IONIC_STS_REMOTE_INV_REQ_ERR:
return IBV_WC_REM_INV_REQ_ERR;
case IONIC_STS_REMOTE_ACC_ERR:
return IBV_WC_REM_ACCESS_ERR;
case IONIC_STS_REMOTE_OPER_ERR:
return IBV_WC_REM_OP_ERR;
case IONIC_STS_RETRY_EXCEEDED:
return IBV_WC_RETRY_EXC_ERR;
case IONIC_STS_RNR_RETRY_EXCEEDED:
return IBV_WC_RNR_RETRY_EXC_ERR;
case IONIC_STS_XRC_VIO_ERR:
default:
return IBV_WC_GENERAL_ERR;
}
}
/* fw abi v1 */
/* data payload part of v1 wqe */
union ionic_v1_pld {
struct ionic_sge sgl[2];
__be32 spec32[8];
__be16 spec16[16];
__u8 data[32];
};
/* completion queue v1 cqe */
struct ionic_v1_cqe {
union {
struct {
__le64 wqe_idx_timestamp;
__be32 src_qpn_op;
__u8 src_mac[6];
__be16 vlan_tag;
__be32 imm_data_rkey;
} recv;
struct {
__u8 rsvd[4];
__be32 msg_msn;
__u8 rsvd2[8];
__le64 npg_wqe_idx_timestamp;
} send;
};
__be32 status_length;
__be32 qid_type_flags;
};
/* bits for cqe wqe_idx and timestamp */
enum ionic_v1_cqe_wqe_idx_timestamp_bits {
IONIC_V1_CQE_WQE_IDX_MASK = 0xffff,
IONIC_V1_CQE_TIMESTAMP_SHIFT = 16,
};
/* bits for cqe recv */
enum ionic_v1_cqe_src_qpn_bits {
IONIC_V1_CQE_RECV_QPN_MASK = 0xffffff,
IONIC_V1_CQE_RECV_OP_SHIFT = 24,
/* MASK could be 0x3, but need 0x1f for makeshift values:
* OP_TYPE_RDMA_OPER_WITH_IMM, OP_TYPE_SEND_RCVD
*/
IONIC_V1_CQE_RECV_OP_MASK = 0x1f,
IONIC_V1_CQE_RECV_OP_SEND = 0,
IONIC_V1_CQE_RECV_OP_SEND_INV = 1,
IONIC_V1_CQE_RECV_OP_SEND_IMM = 2,
IONIC_V1_CQE_RECV_OP_RDMA_IMM = 3,
IONIC_V1_CQE_RECV_IS_IPV4 = BIT(7 + IONIC_V1_CQE_RECV_OP_SHIFT),
IONIC_V1_CQE_RECV_IS_VLAN = BIT(6 + IONIC_V1_CQE_RECV_OP_SHIFT),
};
/* bits for cqe qid_type_flags */
enum ionic_v1_cqe_qtf_bits {
IONIC_V1_CQE_COLOR = BIT(0),
IONIC_V1_CQE_ERROR = BIT(1),
IONIC_V1_CQE_TYPE_SHIFT = 5,
IONIC_V1_CQE_TYPE_MASK = 0x7,
IONIC_V1_CQE_QID_SHIFT = 8,
IONIC_V1_CQE_TYPE_RECV = 1,
IONIC_V1_CQE_TYPE_SEND_MSN = 2,
IONIC_V1_CQE_TYPE_SEND_NPG = 3,
IONIC_V1_CQE_TYPE_RECV_INDIR = 4,
};
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_HCC__)
static inline bool ionic_v1_cqe_color(struct ionic_v1_cqe *cqe)
{
return !!(cqe->qid_type_flags & htobe32(IONIC_V1_CQE_COLOR));
}
static inline bool ionic_v1_cqe_error(struct ionic_v1_cqe *cqe)
{
return !!(cqe->qid_type_flags & htobe32(IONIC_V1_CQE_ERROR));
}
static inline bool ionic_v1_cqe_recv_is_ipv4(struct ionic_v1_cqe *cqe)
{
return !!(cqe->recv.src_qpn_op &
htobe32(IONIC_V1_CQE_RECV_IS_IPV4));
}
static inline bool ionic_v1_cqe_recv_is_vlan(struct ionic_v1_cqe *cqe)
{
return !!(cqe->recv.src_qpn_op &
htobe32(IONIC_V1_CQE_RECV_IS_VLAN));
}
static inline void ionic_v1_cqe_clean(struct ionic_v1_cqe *cqe)
{
cqe->qid_type_flags |= htobe32(~0u << IONIC_V1_CQE_QID_SHIFT);
}
static inline uint32_t ionic_v1_cqe_qtf(struct ionic_v1_cqe *cqe)
{
return be32toh(cqe->qid_type_flags);
}
#endif // !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_HCC__)
static inline uint8_t ionic_v1_cqe_qtf_type(uint32_t qtf)
{
return (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
}
static inline uint32_t ionic_v1_cqe_qtf_qid(uint32_t qtf)
{
return qtf >> IONIC_V1_CQE_QID_SHIFT;
}
/* v1 base wqe header */
struct ionic_v1_base_hdr {
__le64 wqe_idx;
__u8 op;
__u8 num_sge_key;
__be16 flags;
__be32 imm_data_key;
};
/* v1 receive wqe body */
struct ionic_v1_recv_bdy {
__u8 rsvd[16];
union ionic_v1_pld pld;
};
/* v1 send/rdma wqe body (common, has sgl) */
struct ionic_v1_common_bdy {
union {
struct {
__be32 ah_id;
__be32 dest_qpn;
__be32 dest_qkey;
} send;
struct {
__be32 remote_va_high;
__be32 remote_va_low;
__be32 remote_rkey;
} rdma;
};
__be32 length;
union ionic_v1_pld pld;
};
/* v1 atomic wqe body */
struct ionic_v1_atomic_bdy {
__be32 remote_va_high;
__be32 remote_va_low;
__be32 remote_rkey;
__be32 swap_add_high;
__be32 swap_add_low;
__be32 compare_high;
__be32 compare_low;
__u8 rsvd[4];
struct ionic_sge sge;
};
/* v2 atomic wqe body */
struct ionic_v2_atomic_bdy {
__be32 remote_va_high;
__be32 remote_va_low;
__be32 remote_rkey;
__be32 swap_add_high;
__be32 swap_add_low;
__be32 compare_high;
__be32 compare_low;
__be32 lkey;
__be64 local_va;
__u8 rsvd_expdb[8];
};
/* v1 bind mw wqe body */
struct ionic_v1_bind_mw_bdy {
__be64 va;
__be64 length;
__be32 lkey;
__be16 flags;
__u8 rsvd[26];
};
/* v1 send/recv wqe */
struct ionic_v1_wqe {
struct ionic_v1_base_hdr base;
union {
struct ionic_v1_recv_bdy recv;
struct ionic_v1_common_bdy common;
struct ionic_v1_atomic_bdy atomic;
struct ionic_v2_atomic_bdy atomic_v2;
struct ionic_v1_bind_mw_bdy bind_mw;
};
};
/* queue pair v1 send opcodes */
enum ionic_v1_op {
IONIC_V1_OP_SEND,
IONIC_V1_OP_SEND_INV,
IONIC_V1_OP_SEND_IMM,
IONIC_V1_OP_RDMA_READ,
IONIC_V1_OP_RDMA_WRITE,
IONIC_V1_OP_RDMA_WRITE_IMM,
IONIC_V1_OP_ATOMIC_CS,
IONIC_V1_OP_ATOMIC_FA,
IONIC_V1_OP_REG_MR,
IONIC_V1_OP_LOCAL_INV,
IONIC_V1_OP_BIND_MW,
/* flags */
IONIC_V1_FLAG_FENCE = BIT(0),
IONIC_V1_FLAG_SOL = BIT(1),
IONIC_V1_FLAG_INL = BIT(2),
IONIC_V1_FLAG_SIG = BIT(3),
IONIC_V1_FLAG_COLOR = BIT(4),
/* flags last four bits for sgl spec format */
IONIC_V1_FLAG_SPEC32 = (1u << 12),
IONIC_V1_FLAG_SPEC16 = (2u << 12),
IONIC_V1_SPEC_FIRST_SGE = 2,
};
/* queue pair v2 send opcodes */
enum ionic_v2_op {
IONIC_V2_OPSL_OUT = 0x20,
IONIC_V2_OPSL_IMM = 0x40,
IONIC_V2_OPSL_INV = 0x80,
IONIC_V2_OP_SEND = 0x0 | IONIC_V2_OPSL_OUT,
IONIC_V2_OP_SEND_IMM = IONIC_V2_OP_SEND | IONIC_V2_OPSL_IMM,
IONIC_V2_OP_SEND_INV = IONIC_V2_OP_SEND | IONIC_V2_OPSL_INV,
IONIC_V2_OP_RDMA_WRITE = 0x1 | IONIC_V2_OPSL_OUT,
IONIC_V2_OP_RDMA_WRITE_IMM = IONIC_V2_OP_RDMA_WRITE | IONIC_V2_OPSL_IMM,
IONIC_V2_OP_RDMA_READ = 0x2,
IONIC_V2_OP_ATOMIC_CS = 0x4,
IONIC_V2_OP_ATOMIC_FA = 0x5,
IONIC_V2_OP_REG_MR = 0x6,
IONIC_V2_OP_LOCAL_INV = 0x7,
IONIC_V2_OP_BIND_MW = 0x8,
};
#if !defined(__cplusplus)
static inline size_t ionic_v1_send_wqe_min_size(int min_sge, int min_data,
int spec, bool expdb)
{
size_t sz_wqe, sz_sgl, sz_data;
if (spec > IONIC_V1_SPEC_FIRST_SGE)
min_sge += IONIC_V1_SPEC_FIRST_SGE;
if (expdb) {
min_sge += 1;
min_data += IONIC_EXP_DBELL_SZ;
}
sz_wqe = sizeof(struct ionic_v1_wqe);
sz_sgl = offsetof(struct ionic_v1_wqe, common.pld.sgl[min_sge]);
sz_data = offsetof(struct ionic_v1_wqe, common.pld.data[min_data]);
if (sz_sgl > sz_wqe)
sz_wqe = sz_sgl;
if (sz_data > sz_wqe)
sz_wqe = sz_data;
return roundup_pow_of_two(sz_wqe);
}
static inline int ionic_v1_send_wqe_max_sge(uint8_t stride_log2, int spec, bool expdb)
{
struct ionic_v1_wqe *wqe = (void *)0;
struct ionic_sge *sge = (void *)(1ull << stride_log2);
int num_sge = 0;
if (expdb)
sge -= 1;
if (spec > IONIC_V1_SPEC_FIRST_SGE)
num_sge = IONIC_V1_SPEC_FIRST_SGE;
num_sge = sge - &wqe->common.pld.sgl[num_sge];
if (spec && num_sge > spec)
num_sge = spec;
return num_sge;
}
static inline int ionic_v1_send_wqe_max_data(uint8_t stride_log2, bool expdb)
{
struct ionic_v1_wqe *wqe = (void *)0;
__u8 *data = (void *)(1ull << stride_log2);
if (expdb)
data -= IONIC_EXP_DBELL_SZ;
return data - wqe->common.pld.data;
}
static inline size_t ionic_v1_recv_wqe_min_size(int min_sge, int spec, bool expdb)
{
size_t sz_wqe, sz_sgl;
if (spec > IONIC_V1_SPEC_FIRST_SGE)
min_sge += IONIC_V1_SPEC_FIRST_SGE;
if (expdb)
min_sge += 1;
sz_wqe = sizeof(struct ionic_v1_wqe);
sz_sgl = offsetof(struct ionic_v1_wqe, recv.pld.sgl[min_sge]);
if (sz_sgl > sz_wqe)
sz_wqe = sz_sgl;
return sz_wqe;
}
static inline int ionic_v1_recv_wqe_max_sge(uint8_t stride_log2, int spec, bool expdb)
{
struct ionic_v1_wqe *wqe = (void *)0;
struct ionic_sge *sge = (void *)(1ull << stride_log2);
int num_sge = 0;
if (expdb)
sge -= 1;
if (spec > IONIC_V1_SPEC_FIRST_SGE)
num_sge = IONIC_V1_SPEC_FIRST_SGE;
num_sge = sge - &wqe->recv.pld.sgl[num_sge];
if (spec && num_sge > spec)
num_sge = spec;
return num_sge;
}
static inline int ionic_v1_use_spec_sge(int min_sge, int spec)
{
if (!spec || min_sge > spec)
return 0;
if (min_sge <= IONIC_V1_SPEC_FIRST_SGE)
return IONIC_V1_SPEC_FIRST_SGE;
return spec;
}
#define IONIC_RCQ_SIZE 4096
#define IONIC_RCQ_DEPTH 128
#define IONIC_RCQ_DEPTH_LOG2 7
#define IONIC_RCQ_STRIDE_LOG2 4
struct ionic_rcq_hdr {
uint8_t pad[60];
uint32_t seq_pad;
};
struct ionic_rcqe {
uint32_t status_length;
uint32_t imm_data;
uint32_t seq_flags;
uint32_t rsvd;
};
enum ionic_rcqe_flag {
IONIC_RCQE_C = BIT(7),
IONIC_RCQE_I = BIT(6),
};
struct ionic_rcq {
struct ionic_rcq_hdr hdr;
struct ionic_rcqe ring[IONIC_RCQ_DEPTH];
};
static inline uint32_t ionic_rcq_hdr_seq(struct ionic_rcq_hdr *hdr)
{
return be32toh(hdr->seq_pad) >> 8;
}
static inline uint32_t ionic_rcqe_seq(struct ionic_rcqe *rcqe)
{
return be32toh(rcqe->seq_flags) >> 8;
}
static inline bool ionic_rcqe_color(struct ionic_rcqe *rcqe)
{
return !!(rcqe->seq_flags & htobe32(IONIC_RCQE_C));
}
static inline bool ionic_rcqe_imm(struct ionic_rcqe *rcqe)
{
return !!(rcqe->seq_flags & htobe32(IONIC_RCQE_I));
}
#endif // !defined(__cplusplus)
#endif /* IONIC_FW_H */
+12 -4
Просмотреть файл
@@ -25,11 +25,19 @@
#ifndef LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_
#define LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_
#ifdef GDA_IONIC
extern "C" {
#include <infiniband/ionic_dv.h>
#include <infiniband/ionic_fw.h>
#include "gda/ionic/ionic_dv.h"
#include "gda/ionic/ionic_fw.h"
}
#endif
struct ionicdv_funcs_t {
int (*get_ctx)(struct ionic_dv_ctx *dvctx, struct ibv_context *ibctx);
uint8_t (*qp_get_udma_idx)(struct ibv_qp *ibqp);
int (*get_cq)(struct ionic_dv_cq *dvcq, struct ibv_cq *ibcq, uint8_t udma_idx);
int (*get_qp)(struct ionic_dv_qp *dvqp, struct ibv_qp *ibqp);
int (*pd_set_sqcmb)(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
int (*pd_set_rqcmb)(struct ibv_pd *ibpd, bool enable, bool expdb, bool require);
int (*pd_set_udma_mask)(struct ibv_pd *ibpd, uint8_t udma_mask);
};
#endif //LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_
+312
Просмотреть файл
@@ -0,0 +1,312 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "gda/queue_pair.hpp"
#include "gda/endian.hpp"
#include "util.hpp"
#include "containers/free_list_impl.hpp"
namespace rocshmem {
__device__ uint64_t QueuePair::get_same_qp_lane_mask() {
uint64_t lane_mask = get_active_lane_mask();
uintptr_t this_val = reinterpret_cast<uintptr_t>(this);
// exclude threads operating on a different qp from this thread lane mask
#pragma unroll
for (int i = 0; i < 64; ++i) {
uint64_t bit_i = 1ull << i;
if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) {
lane_mask &= ~bit_i;
}
}
return lane_mask;
}
__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) {
uint32_t my_sq_prod = 0;
// reserve space for wqes in sq
if (is_first_active_lane(activemask)) {
my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask));
// wait for that space to be available
ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask);
return my_sq_prod;
}
__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) {
uint32_t dbprod = my_sq_prod + num_wqes;
spin_lock_acquire_shared(&sq_lock, activemask);
if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) {
sq_dbprod = dbprod;
ionic_ring_doorbell(dbprod);
}
spin_lock_release_shared(&sq_lock, activemask);
return dbprod;
}
__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
uint32_t my_cq_pos = cq_pos + my_logical_lane_id;
/* Look at the cqe at the current position in the cq buffer */
struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask];
/* Determine expected color based on cq wrap count */
uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
uint32_t qtf_color_exp = qtf_color_bit;
if (my_cq_pos & (cq_mask + 1)) {
qtf_color_exp = 0;
}
/* Check if my cqe color == expected color */
uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
if ((qtf_be & qtf_color_bit) != qtf_color_exp) {
return;
}
uint32_t msn = swap_endian_val<uint32_t>(cqe->send.msg_msn);
/* Report if the completion indicates an error. */
if (!!(qtf_be & swap_endian_val<uint32_t>(IONIC_V1_CQE_ERROR))) {
#if defined(DEBUG)
uint32_t qtf = swap_endian_val<uint32_t>(qtf_be);
uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT;
uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
uint32_t flag = qtf & 0xf;
uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK;
printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n",
dev_name, qid, type, flag, status, msn, npg);
#endif
/* No other way to signal an error, so just crash. */
abort();
}
/* Only proceed with the furthest ahead cqe to update the sq state */
uint64_t my_lane_mask = 1ull << __lane_id();
uint64_t lesser_lane_mask = my_lane_mask - 1;
if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) {
return;
}
/* update position in the cq */
cq_pos = my_cq_pos + 1;
/*
* Ring cq doorbell frequently enough to avoid cq full.
*
* NB: IONIC_CQ_GRACE is 100
*/
if (((cq_pos - cq_dbpos) & cq_mask) >= 100) {
cq_dbpos = cq_pos;
__atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed?
}
sq_msn = msn;
}
__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) {
uint32_t greed = 10;
/* wait for sq_msn to catch up or pass cons. */
/* 0x800000 - sign bit for 24-bit fields */
while ((sq_msn - cons) & 0x800000) {
if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) {
continue;
}
/* with lock acquired, this wave polls cqes until caught up */
while ((sq_msn - cons) & 0x800000) {
uint32_t old_sq_msn = sq_msn;
poll_wave_cqes(activemask);
if (!((sq_msn - cons) & 0x800000)) {
if (sq_msn == old_sq_msn) {
break;
}
if (!greed) {
break;
}
--greed;
}
}
spin_lock_release_shared(&cq_lock, activemask);
break;
}
}
__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) {
// TODO When threads write at once to the same address, not all writes reach the bus.
for (int i = 0; i < 64; ++i) {
if (__lane_id() == i) {
__threadfence();
__atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST);
}
}
__threadfence();
}
__device__ void QueuePair::ionic_quiet() {
ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod);
}
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
uint64_t activemask = get_same_qp_lane_mask();
uint32_t num_wqes = get_active_lane_count(activemask);
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
uint16_t wqe_flags = 0;
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
}
// TODO why is this needed?
if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) {
size = 1;
}
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = size ? 1 : 0;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->common.length = swap_endian_val<uint32_t>(size);
if (size) {
if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
wqe->base.num_sge_key = 0;
if (!laddr) {
// TODO why is this needed?
wqe->common.pld.data[0] = 1;
} else {
memcpy(wqe->common.pld.data, laddr, size);
}
} else {
wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(laddr));
wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
}
}
__hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
}
__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
uint64_t activemask = get_same_qp_lane_mask();
uint32_t num_wqes = get_active_lane_count(activemask);
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
uint16_t wqe_flags = 0;
uint32_t cons;
uint64_t* wave_fetch_atomic{nullptr};
if (fetching) {
if (is_leader) {
auto res = fetching_atomic_freelist->pop_front();
while (!res.success) {
res = fetching_atomic_freelist->pop_front();
}
wave_fetch_atomic = res.value;
}
wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
}
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
}
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = 1;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
wqe->atomic_v2.compare_high = swap_endian_val<uint32_t>(atomic_cmp >> 32);
wqe->atomic_v2.compare_low = swap_endian_val<uint32_t>(atomic_cmp);
if (fetching) {
wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(wave_fetch_atomic + my_logical_lane_id));
wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(fetching_atomic_lkey);
} else {
wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(nonfetching_atomic));
wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
}
__hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
uint64_t ret{0};
if (fetching) {
ionic_quiet_internal(activemask, cons);
ret = wave_fetch_atomic[my_logical_lane_id];
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
fetching_atomic_freelist->push_back(wave_fetch_atomic);
}
}
return ret;
}
} // namespace rocshmem
+37
Просмотреть файл
@@ -0,0 +1,37 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
target_sources(
${PROJECT_NAME}
PRIVATE
backend_gda_mlx5.cpp
)
if(GDA_MLX5)
target_sources(
${PROJECT_NAME}
PRIVATE
queue_pair_mlx5.cpp
segment_builder.cpp
)
endif()
+122
Просмотреть файл
@@ -0,0 +1,122 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "gda/backend_gda.hpp"
#include "util.hpp"
namespace rocshmem {
void* GDABackend::mlx5_dv_dlopen() {
void* dv_handle{nullptr};
dv_handle = dlopen("libmlx5.so", RTLD_NOW);
if (!dv_handle) {
DPRINTF("Could not open libmlx5.so. Returning\n");
}
return dv_handle;
}
int GDABackend::mlx5_dv_dl_init() {
mlx5dv_handle_ = mlx5_dv_dlopen();
if (!mlx5dv_handle_)
return ROCSHMEM_ERROR;
DLSYM_HELPER(mlx5dv, mlx5dv_, mlx5dv_handle_, init_obj);
return ROCSHMEM_SUCCESS;
}
void GDABackend::mlx5_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
mlx5dv_cq cq_out;
mlx5dv_obj mlx_obj;
mlx_obj.cq.in = cqs[conn_num];
mlx_obj.cq.out = &cq_out;
mlx5dv.init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
dump_mlx5dv_cq(&cq_out, conn_num);
/*
* struct mlx5dv_cq {
* void *buf;
* __be32 *dbrec;
* uint32_t cqe_cnt;
* uint32_t cqe_size;
* void *cq_uar;
* uint32_t cqn;
* uint64_t comp_mask;
* };
*/
gpu_qp->cq_buf = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
gpu_qp->cq_cnt = cq_out.cqe_cnt;
gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt);
gpu_qp->cq_dbrec = cq_out.dbrec;
mlx5dv_qp qp_out;
mlx_obj.qp.in = qps[conn_num];
mlx_obj.qp.out = &qp_out;
mlx5dv.init_obj(&mlx_obj, MLX5DV_OBJ_QP);
dump_mlx5dv_qp(&qp_out, conn_num);
/*
* struct mlx5dv_qp {
* __be32 *dbrec;
* struct {
* void *buf;
* uint32_t wqe_cnt;
* uint32_t stride;
* } sq;
* struct {
* void *buf;
* uint32_t wqe_cnt;
* uint32_t stride;
* } rq;
* struct {
* void *reg;
* uint32_t size;
* } bf;
* uint64_t comp_mask;
* off_t uar_mmap_offset;
* uint32_t tirn;
* uint32_t tisn;
* uint32_t rqn;
* uint32_t sqn;
* uint64_t tir_icm_addr;
* };
*/
gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR
gpu_qp->sq_buf = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt;
gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
gpu_qp->lkey = htobe32(heap_mr->lkey);
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->inline_threshold = inline_threshold;
// The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
int hip_dev_id{-1};
CHECK_HIP(hipGetDevice(&hip_dev_id));
void* gpu_ptr{nullptr};
rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
}
} // namespace rocshmem
+4
Просмотреть файл
@@ -34,4 +34,8 @@ typedef union db_reg {
uintptr_t uint;
} db_reg_t;
struct mlx5dv_funcs_t {
int (*init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type);
};
#endif //LIBRARY_SRC_GDA_MLX5_GDA_PROVIDER_HPP_
+273
Просмотреть файл
@@ -0,0 +1,273 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "gda/queue_pair.hpp"
#include "util.hpp"
#include "containers/free_list_impl.hpp"
#include "gda/endian.hpp"
#include "segment_builder.hpp"
namespace rocshmem {
__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) {
swap_endian_store(const_cast<uint32_t*>(dbrec), (uint32_t)my_sq_counter);
__atomic_signal_fence(__ATOMIC_SEQ_CST);
__hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
db_uint ^= 0x100;
__hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ void QueuePair::mlx5_quiet() {
constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE;
__shared__ uint64_t wqe_broadcast[BROADCAST_SIZE];
uint8_t wavefront_id = get_flat_block_id() / WF_SIZE;
wqe_broadcast[wavefront_id] = 0;
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
while (true) {
bool done{false};
uint64_t quiet_amount{0};
uint64_t wave_cq_consumer{0};
while (!done) {
uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
if (!(posted - completed)) {
return;
}
int64_t quiet_val = posted - active;
if (quiet_val <= 0) {
continue;
}
quiet_amount = min(num_active_lanes, quiet_val);
if (is_leader) {
done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
if (done) {
wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
}
done = __shfl(done, leader_phys_lane_id);
}
wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id);
uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id;
uint64_t my_cq_index = my_cq_consumer % cq_cnt;
if (my_logical_lane_id < quiet_amount) {
volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index];
uint16_t be_wqe_counter{0};
uint8_t op_own{0};
uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1;
bool vote_failed{true};
while (vote_failed) {
op_own = *((volatile uint8_t*)&cqe_entry->op_own);
bool my_ownership_vote = (op_own & 1) == owner_bit;
bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID;
uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote);
vote_failed = __popcll(votes) < quiet_amount;
if (!vote_failed) {
be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter);
}
}
uint16_t wqe_counter;
swap_endian_store(const_cast<uint16_t*>(&wqe_counter), reinterpret_cast<uint16_t>(be_wqe_counter));
uint64_t wqe_id = outstanding_wqes[wqe_counter];
__hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit;
*((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits;
__atomic_signal_fence(__ATOMIC_SEQ_CST);
}
if (is_leader) {
uint64_t completed {0};
do {
completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (completed != wave_cq_consumer);
swap_endian_store(const_cast<uint32_t*>(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount));
__atomic_signal_fence(__ATOMIC_SEQ_CST);
uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id];
__hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
}
}
__device__ void QueuePair::mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
uint8_t num_wqes{num_active_lanes};
uint64_t wave_sq_counter{0};
if (is_leader) {
wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
}
wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
while (true) {
uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
int64_t num_active_sq_entries = db_touched - sunk;
if (num_active_sq_entries < 0) {
continue;
}
uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
if (num_free_entries > num_entries_until_wave_last_entry) {
break;
}
mlx5_quiet();
}
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
SegmentBuilder seg_build(my_sq_index, sq_buf);
seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0);
seg_build.update_raddr_seg(raddr, rkey);
if (size <= inline_threshold && opcode == gda_op_rdma_write) {
seg_build.update_inl_data_seg(laddr, size);
} else {
seg_build.update_data_seg(laddr, size, lkey);
}
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
uint64_t db_touched {0};
do {
db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (db_touched != wave_sq_counter);
uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
__hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
}
__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
uint8_t num_wqes{num_active_lanes};
uint64_t wave_sq_counter{0};
if (is_leader) {
wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
while (true) {
uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
int64_t num_active_sq_entries = db_touched - sunk;
if (num_active_sq_entries < 0) {
continue;
}
uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
if (num_free_entries > num_entries_until_wave_last_entry) {
break;
}
mlx5_quiet();
}
uint64_t* wave_fetch_atomic{nullptr};
if (fetching) {
if (is_leader) {
uint64_t db_touched {0};
do {
db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (db_touched != wave_sq_counter);
auto res = fetching_atomic_freelist->pop_front();
while (!res.success) {
res = fetching_atomic_freelist->pop_front();
}
wave_fetch_atomic = res.value;
}
wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
}
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
SegmentBuilder seg_build(my_sq_index, sq_buf);
seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0);
seg_build.update_raddr_seg(raddr, rkey);
seg_build.update_atomic_seg(atomic_data, atomic_cmp);
if (fetching) {
seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey);
} else {
seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey);
}
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
uint64_t db_touched {0};
do {
db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (db_touched != wave_sq_counter);
uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
__hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
uint64_t ret{0};
if (fetching) {
mlx5_quiet();
ret = wave_fetch_atomic[my_logical_lane_id];
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
fetching_atomic_freelist->push_back(wave_fetch_atomic);
}
}
return ret;
}
} // namespace rocshmem
@@ -25,7 +25,7 @@
#include "segment_builder.hpp"
#include "util.hpp"
#include "endian.hpp"
#include "gda/endian.hpp"
namespace rocshmem {
+18 -547
Просмотреть файл
@@ -27,8 +27,6 @@
#include <hip/hip_runtime.h>
#include "backend_gda.hpp"
#include "endian.hpp"
#include "segment_builder.hpp"
#include "constants.hpp"
namespace rocshmem {
@@ -66,22 +64,33 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) {
}
/* Set Correct opcodes for each NIC */
switch (gda_vendor) {
#if defined(GDA_IONIC)
gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE;
gda_op_rdma_read = IONIC_V2_OP_RDMA_READ;
gda_op_atomic_fa = IONIC_V2_OP_ATOMIC_FA;
gda_op_atomic_cs = IONIC_V2_OP_ATOMIC_CS;
#endif
if (gda_vendor == GDAVendor::BNXT) {
case GDAVendor::IONIC:
gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE;
gda_op_rdma_read = IONIC_V2_OP_RDMA_READ;
gda_op_atomic_fa = IONIC_V2_OP_ATOMIC_FA;
gda_op_atomic_cs = IONIC_V2_OP_ATOMIC_CS;
break;
#endif //defined(GDA_IONIC)
#if defined(GDA_BNXT)
case GDAVendor::BNXT:
gda_op_rdma_write = BNXT_RE_WR_OPCD_RDMA_WRITE;
gda_op_rdma_read = BNXT_RE_WR_OPCD_RDMA_READ;
gda_op_atomic_fa = BNXT_RE_WR_OPCD_ATOMIC_FA;
gda_op_atomic_cs = BNXT_RE_WR_OPCD_ATOMIC_CS;
} else if (gda_vendor == GDAVendor::MLX5) {
break;
#endif //defined(GDA_BNXT)
#if defined(GDA_MLX5)
case GDAVendor::MLX5:
gda_op_rdma_write = MLX5_OPCODE_RDMA_WRITE;
gda_op_rdma_read = MLX5_OPCODE_RDMA_READ;
gda_op_atomic_fa = MLX5_OPCODE_ATOMIC_FA;
gda_op_atomic_cs = MLX5_OPCODE_ATOMIC_CS;
break;
#endif //defined(GDA_MLX5)
default:
assert(false /* invalid nic provider */);
}
gda_vendor_ = gda_vendor;
}
@@ -105,264 +114,6 @@ QueuePair::~QueuePair() {
/******************************************************************************
************************ PROVIDER-SPECIFIC HELPERS ***************************
*****************************************************************************/
#if defined(GDA_IONIC)
__device__ uint64_t QueuePair::get_same_qp_lane_mask() {
uint64_t lane_mask = get_active_lane_mask();
uintptr_t this_val = reinterpret_cast<uintptr_t>(this);
// exclude threads operating on a different qp from this thread lane mask
#pragma unroll
for (int i = 0; i < 64; ++i) {
uint64_t bit_i = 1ull << i;
if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) {
lane_mask &= ~bit_i;
}
}
return lane_mask;
}
__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) {
uint32_t my_sq_prod = 0;
// reserve space for wqes in sq
if (is_first_active_lane(activemask)) {
my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask));
// wait for that space to be available
ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask);
return my_sq_prod;
}
__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) {
uint32_t dbprod = my_sq_prod + num_wqes;
spin_lock_acquire_shared(&sq_lock, activemask);
if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) {
sq_dbprod = dbprod;
ionic_ring_doorbell(dbprod);
}
spin_lock_release_shared(&sq_lock, activemask);
return dbprod;
}
__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
uint32_t my_cq_pos = cq_pos + my_logical_lane_id;
/* Look at the cqe at the current position in the cq buffer */
struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask];
/* Determine expected color based on cq wrap count */
uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
uint32_t qtf_color_exp = qtf_color_bit;
if (my_cq_pos & (cq_mask + 1)) {
qtf_color_exp = 0;
}
/* Check if my cqe color == expected color */
uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
if ((qtf_be & qtf_color_bit) != qtf_color_exp) {
return;
}
uint32_t msn = swap_endian_val<uint32_t>(cqe->send.msg_msn);
/* Report if the completion indicates an error. */
if (!!(qtf_be & swap_endian_val<uint32_t>(IONIC_V1_CQE_ERROR))) {
#if defined(DEBUG)
uint32_t qtf = swap_endian_val<uint32_t>(qtf_be);
uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT;
uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
uint32_t flag = qtf & 0xf;
uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK;
printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n",
dev_name, qid, type, flag, status, msn, npg);
#endif
/* No other way to signal an error, so just crash. */
abort();
}
/* Only proceed with the furthest ahead cqe to update the sq state */
uint64_t my_lane_mask = 1ull << __lane_id();
uint64_t lesser_lane_mask = my_lane_mask - 1;
if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) {
return;
}
/* update position in the cq */
cq_pos = my_cq_pos + 1;
/*
* Ring cq doorbell frequently enough to avoid cq full.
*
* NB: IONIC_CQ_GRACE is 100
*/
if (((cq_pos - cq_dbpos) & cq_mask) >= 100) {
cq_dbpos = cq_pos;
__atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed?
}
sq_msn = msn;
}
__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) {
uint32_t greed = 10;
/* wait for sq_msn to catch up or pass cons. */
/* 0x800000 - sign bit for 24-bit fields */
while ((sq_msn - cons) & 0x800000) {
if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) {
continue;
}
/* with lock acquired, this wave polls cqes until caught up */
while ((sq_msn - cons) & 0x800000) {
uint32_t old_sq_msn = sq_msn;
poll_wave_cqes(activemask);
if (!((sq_msn - cons) & 0x800000)) {
if (sq_msn == old_sq_msn) {
break;
}
if (!greed) {
break;
}
--greed;
}
}
spin_lock_release_shared(&cq_lock, activemask);
break;
}
}
#endif // GDA_IONIC
#if defined(GDA_IONIC)
__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) {
// TODO When threads write at once to the same address, not all writes reach the bus.
for (int i = 0; i < 64; ++i) {
if (__lane_id() == i) {
__threadfence();
__atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST);
}
}
__threadfence();
}
#endif
#if defined(GDA_MLX5)
__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) {
swap_endian_store(const_cast<uint32_t*>(dbrec), (uint32_t)my_sq_counter);
__atomic_signal_fence(__ATOMIC_SEQ_CST);
__hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
db_uint ^= 0x100;
__hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
#endif // GDA_MLX5
#if defined(GDA_IONIC)
__device__ void QueuePair::ionic_quiet() {
ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod);
}
#endif
#if defined(GDA_MLX5)
__device__ void QueuePair::mlx5_quiet() {
constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE;
__shared__ uint64_t wqe_broadcast[BROADCAST_SIZE];
uint8_t wavefront_id = get_flat_block_id() / WF_SIZE;
wqe_broadcast[wavefront_id] = 0;
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
while (true) {
bool done{false};
uint64_t quiet_amount{0};
uint64_t wave_cq_consumer{0};
while (!done) {
uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
if (!(posted - completed)) {
return;
}
int64_t quiet_val = posted - active;
if (quiet_val <= 0) {
continue;
}
quiet_amount = min(num_active_lanes, quiet_val);
if (is_leader) {
done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
if (done) {
wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
}
done = __shfl(done, leader_phys_lane_id);
}
wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id);
uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id;
uint64_t my_cq_index = my_cq_consumer % cq_cnt;
if (my_logical_lane_id < quiet_amount) {
volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index];
uint16_t be_wqe_counter{0};
uint8_t op_own{0};
uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1;
bool vote_failed{true};
while (vote_failed) {
op_own = *((volatile uint8_t*)&cqe_entry->op_own);
bool my_ownership_vote = (op_own & 1) == owner_bit;
bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID;
uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote);
vote_failed = __popcll(votes) < quiet_amount;
if (!vote_failed) {
be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter);
}
}
uint16_t wqe_counter;
swap_endian_store(const_cast<uint16_t*>(&wqe_counter), reinterpret_cast<uint16_t>(be_wqe_counter));
uint64_t wqe_id = outstanding_wqes[wqe_counter];
__hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit;
*((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits;
__atomic_signal_fence(__ATOMIC_SEQ_CST);
}
if (is_leader) {
uint64_t completed {0};
do {
completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (completed != wave_cq_consumer);
swap_endian_store(const_cast<uint32_t*>(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount));
__atomic_signal_fence(__ATOMIC_SEQ_CST);
uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id];
__hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
}
}
#endif // GDA_MLX5
__device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
switch (gda_vendor_) {
#if defined(GDA_MLX5)
@@ -428,286 +179,6 @@ __device__ void QueuePair::quiet() {
}
}
#if defined(GDA_IONIC)
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
uint64_t activemask = get_same_qp_lane_mask();
uint32_t num_wqes = get_active_lane_count(activemask);
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
uint16_t wqe_flags = 0;
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
}
// TODO why is this needed?
if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) {
size = 1;
}
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = size ? 1 : 0;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->common.length = swap_endian_val<uint32_t>(size);
if (size) {
if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
wqe->base.num_sge_key = 0;
if (!laddr) {
// TODO why is this needed?
wqe->common.pld.data[0] = 1;
} else {
memcpy(wqe->common.pld.data, laddr, size);
}
} else {
wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(laddr));
wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
}
}
__hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
}
#endif
#if defined (GDA_MLX5)
__device__ void QueuePair::mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
uint8_t num_wqes{num_active_lanes};
uint64_t wave_sq_counter{0};
if (is_leader) {
wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT);
}
wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
while (true) {
uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
int64_t num_active_sq_entries = db_touched - sunk;
if (num_active_sq_entries < 0) {
continue;
}
uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
if (num_free_entries > num_entries_until_wave_last_entry) {
break;
}
mlx5_quiet();
}
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
SegmentBuilder seg_build(my_sq_index, sq_buf);
seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0);
seg_build.update_raddr_seg(raddr, rkey);
if (size <= inline_threshold && opcode == gda_op_rdma_write) {
seg_build.update_inl_data_seg(laddr, size);
} else {
seg_build.update_data_seg(laddr, size, lkey);
}
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
uint64_t db_touched {0};
do {
db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (db_touched != wave_sq_counter);
uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
__hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
}
#endif // GDA_MLX5
#if defined(GDA_IONIC)
__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
uint64_t activemask = get_same_qp_lane_mask();
uint32_t num_wqes = get_active_lane_count(activemask);
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
uint16_t wqe_flags = 0;
uint32_t cons;
uint64_t* wave_fetch_atomic{nullptr};
if (fetching) {
if (is_leader) {
auto res = fetching_atomic_freelist->pop_front();
while (!res.success) {
res = fetching_atomic_freelist->pop_front();
}
wave_fetch_atomic = res.value;
}
wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
}
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
}
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = 1;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr));
wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
wqe->atomic_v2.compare_high = swap_endian_val<uint32_t>(atomic_cmp >> 32);
wqe->atomic_v2.compare_low = swap_endian_val<uint32_t>(atomic_cmp);
if (fetching) {
wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(wave_fetch_atomic + my_logical_lane_id));
wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(fetching_atomic_lkey);
} else {
wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(nonfetching_atomic));
wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
}
__hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
uint64_t ret{0};
if (fetching) {
ionic_quiet_internal(activemask, cons);
ret = wave_fetch_atomic[my_logical_lane_id];
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
fetching_atomic_freelist->push_back(wave_fetch_atomic);
}
}
return ret;
}
#endif
#if defined(GDA_MLX5)
__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode,
int64_t atomic_data, int64_t atomic_cmp, bool fetching) {
uint64_t activemask = get_active_lane_mask();
uint8_t num_active_lanes = get_active_lane_count(activemask);
uint8_t my_logical_lane_id = get_active_lane_num(activemask);
bool is_leader{my_logical_lane_id == 0};
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
uint8_t num_wqes{num_active_lanes};
uint64_t wave_sq_counter{0};
if (is_leader) {
wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id);
uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id;
uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt;
while (true) {
uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
int64_t num_active_sq_entries = db_touched - sunk;
if (num_active_sq_entries < 0) {
continue;
}
uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries;
uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched;
if (num_free_entries > num_entries_until_wave_last_entry) {
break;
}
mlx5_quiet();
}
uint64_t* wave_fetch_atomic{nullptr};
if (fetching) {
if (is_leader) {
uint64_t db_touched {0};
do {
db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (db_touched != wave_sq_counter);
auto res = fetching_atomic_freelist->pop_front();
while (!res.success) {
res = fetching_atomic_freelist->pop_front();
}
wave_fetch_atomic = res.value;
}
wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
}
outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter;
SegmentBuilder seg_build(my_sq_index, sq_buf);
seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0);
seg_build.update_raddr_seg(raddr, rkey);
seg_build.update_atomic_seg(atomic_data, atomic_cmp);
if (fetching) {
seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey);
} else {
seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey);
}
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
uint64_t db_touched {0};
do {
db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (db_touched != wave_sq_counter);
uint8_t *base_ptr = reinterpret_cast<uint8_t*>(sq_buf);
uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast<uint64_t*>(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]);
mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes);
__hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
uint64_t ret{0};
if (fetching) {
mlx5_quiet();
ret = wave_fetch_atomic[my_logical_lane_id];
__atomic_signal_fence(__ATOMIC_SEQ_CST);
if (is_leader) {
fetching_atomic_freelist->push_back(wave_fetch_atomic);
}
}
return ret;
}
#endif // GDA_MLX5
/******************************************************************************
****************************** SHMEM INTERFACE *******************************
*****************************************************************************/
+55 -59
Просмотреть файл
@@ -163,14 +163,17 @@ class QueuePair {
#if defined(GDA_MLX5)
__device__ uint64_t mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ void mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
__device__ void mlx5_quiet();
#endif
#if defined(GDA_BNXT)
__device__ uint64_t bnxt_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
__device__ void bnxt_quiet();
#endif
#if defined(GDA_IONIC)
__device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
__device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
__device__ void ionic_quiet();
#endif
/**
@@ -189,65 +192,6 @@ class QueuePair {
__device__ void ionic_ring_doorbell(uint32_t pos);
#endif
#ifdef GDA_IONIC
__device__ uint64_t get_same_qp_lane_mask();
/**
* @brief Reserve space in the sq to post this many wqes.
* @param my_tid my logical thread id.
* @param num_wqes number of sq wqes to reserve for this wave.
* @return position of my_tid=0's wqe.
*/
__device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes);
/**
* @brief Ring the sq doorbell maintaining order between waves.
* @param last this is the last wqe posted in this wave.
* @param my_sq_prod position of my_tid=0's wqe.
* @param num_wqes number of sq wqes posted in this wave.
* @param wqe this thread's wqe.
* @return doorbell producer index.
*/
__device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes);
/**
* @brief Helper method to poll the next completion queue entry.
*/
__device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask);
/**
* @brief Helper method to drain completion queue entries.
* @param cons wait for sq_msn to catch up to this position.
*/
__device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons);
uint64_t *cq_dbreg{nullptr};
uint64_t cq_dbval{0};
uint64_t cq_mask{0};
struct ionic_v1_cqe *ionic_cq_buf{nullptr};
uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
uint32_t cq_pos{0};
uint32_t cq_dbpos{0};
uint64_t *sq_dbreg{nullptr};
uint64_t sq_dbval{0};
uint64_t sq_mask{0};
struct ionic_v1_wqe *ionic_sq_buf{nullptr};
uint32_t sq_lock{SPIN_LOCK_UNLOCKED};
uint32_t sq_dbprod{0};
uint32_t sq_prod{0};
uint32_t sq_msn{0};
#endif
#if defined(GDA_MLX5)
__device__ void mlx5_quiet();
#endif
#if defined(GDA_BNXT)
__device__ void bnxt_quiet();
#endif
#if defined(GDA_IONIC)
__device__ void ionic_quiet();
#endif
int gda_vendor_{0};
/* GDAVendor::BNXT START */
@@ -322,6 +266,58 @@ class QueuePair {
/* GDAVendor::MLX5 END */
/* GDAVendor::IONIC START */
uint64_t *cq_dbreg{nullptr};
uint64_t cq_dbval{0};
uint64_t cq_mask{0};
struct ionic_v1_cqe *ionic_cq_buf{nullptr};
uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
uint32_t cq_pos{0};
uint32_t cq_dbpos{0};
uint64_t *sq_dbreg{nullptr};
uint64_t sq_dbval{0};
uint64_t sq_mask{0};
struct ionic_v1_wqe *ionic_sq_buf{nullptr};
uint32_t sq_lock{SPIN_LOCK_UNLOCKED};
uint32_t sq_dbprod{0};
uint32_t sq_prod{0};
uint32_t sq_msn{0};
__device__ uint64_t get_same_qp_lane_mask();
/**
* @brief Reserve space in the sq to post this many wqes.
* @param my_tid my logical thread id.
* @param num_wqes number of sq wqes to reserve for this wave.
* @return position of my_tid=0's wqe.
*/
__device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes);
/**
* @brief Ring the sq doorbell maintaining order between waves.
* @param last this is the last wqe posted in this wave.
* @param my_sq_prod position of my_tid=0's wqe.
* @param num_wqes number of sq wqes posted in this wave.
* @param wqe this thread's wqe.
* @return doorbell producer index.
*/
__device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes);
/**
* @brief Helper method to poll the next completion queue entry.
*/
__device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask);
/**
* @brief Helper method to drain completion queue entries.
* @param cons wait for sq_msn to catch up to this position.
*/
__device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons);
/* GDAVendor::IONIC END */
uint32_t inline_threshold{0};
char dev_name[24];