diff --git a/projects/rocshmem/cmake/FindIBVerbs.cmake b/projects/rocshmem/cmake/FindIBVerbs.cmake index 258e91687c..a9b7a37911 100644 --- a/projects/rocshmem/cmake/FindIBVerbs.cmake +++ b/projects/rocshmem/cmake/FindIBVerbs.cmake @@ -25,7 +25,6 @@ find_package(PkgConfig QUIET) if (PkgConfig_FOUND) if (IBVerbs_ROOT ) - # We don't use IBVerbs_DIR as this is supposed to be used when finding hwloc-config.cmake only set(ENV{PKG_CONFIG_PATH} "${IBVerbs_ROOT}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") endif() pkg_check_modules(PC_IBVerbs QUIET libibverbs) @@ -42,32 +41,11 @@ find_library(IBVerbs_LIBRARY PATH_SUFFIXES lib lib64 ) -if (GDA_IONIC) -list(APPEND provider_vars IBVerbs_IONIC_LIBRARY IBVerbs_IONIC_INCLUDE_DIR) -find_path(IBVerbs_IONIC_INCLUDE_DIR infiniband/ionic_dv.h - HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS} - PATH_SUFFIXES include -) - -find_library(IBVerbs_IONIC_LIBRARY - NAMES ionic libionic - HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS} - PATH_SUFFIXES lib lib64 -) - -add_library(IBVerbs::verbs_ionic UNKNOWN IMPORTED) -set_target_properties(IBVerbs::verbs_ionic PROPERTIES - IMPORTED_LOCATION "${IBVerbs_IONIC_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_IONIC_INCLUDE_DIR}" -) -endif() - find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR - ${provider_vars} ) -mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR ${provider_vars}) +mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR) if (IBVerbs_FOUND) add_library(IBVerbs::verbs UNKNOWN IMPORTED) @@ -76,9 +54,4 @@ set_target_properties(IBVerbs::verbs PROPERTIES INTERFACE_COMPILE_OPTIONS "${PC_IBVerbs_CFLAGS_OTHER}" INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}" ) - -target_link_libraries(IBVerbs::verbs INTERFACE - $ -) - endif() diff --git a/projects/rocshmem/scripts/build_configs/all_backends b/projects/rocshmem/scripts/build_configs/all_backends index 6c49e98e54..1458a983f3 100755 --- a/projects/rocshmem/scripts/build_configs/all_backends +++ b/projects/rocshmem/scripts/build_configs/all_backends @@ -40,7 +40,7 @@ cmake \ -DUSE_GDA=ON \ -DGDA_MLX5=ON \ -DGDA_BNXT=ON \ - -DGDA_IONIC=OFF \ + -DGDA_IONIC=ON \ -DUSE_RO=ON \ -DUSE_IPC=ON \ -DUSE_THREADS=OFF \ diff --git a/projects/rocshmem/src/gda/CMakeLists.txt b/projects/rocshmem/src/gda/CMakeLists.txt index 1d8234297c..983a90cb0b 100644 --- a/projects/rocshmem/src/gda/CMakeLists.txt +++ b/projects/rocshmem/src/gda/CMakeLists.txt @@ -35,7 +35,6 @@ target_sources( gda_team.cpp queue_pair.cpp endian.cpp - segment_builder.cpp topology.cpp ) @@ -48,4 +47,6 @@ target_link_libraries( numa ) +add_subdirectory(mlx5) add_subdirectory(bnxt) +add_subdirectory(ionic) diff --git a/projects/rocshmem/src/gda/backend_gda.cpp b/projects/rocshmem/src/gda/backend_gda.cpp index 60c96eaf45..232089cb8f 100644 --- a/projects/rocshmem/src/gda/backend_gda.cpp +++ b/projects/rocshmem/src/gda/backend_gda.cpp @@ -122,11 +122,7 @@ GDABackend::~GDABackend() { cleanup_heap_memory_rkey(); cleanup_ibv(); - if (bnxtdv_handle_ != nullptr) - dlclose(bnxtdv_handle_); - - if (mlx5dv_handle_ != nullptr) - dlclose(mlx5dv_handle_); + close_dv_libs(); } void GDABackend::read_env() { @@ -532,52 +528,45 @@ void GDABackend::rte_barrier() { } } -int GDABackend::mlx5_dv_dl_init () { - mlx5dv_handle_ = dlopen("libmlx5.so", RTLD_NOW); - if (!mlx5dv_handle_) { - DPRINTF("Could not open libmlx5.so. Returning\n"); - return ROCSHMEM_ERROR; - } - - DLSYM_HELPER(mlx5dv_ftable_, mlx5dv_, mlx5dv_handle_, init_obj); - return ROCSHMEM_SUCCESS; -} - /* Currently we only check whether we can dlopen a Direct Verbs library. -** We might need to extend this logic to check whether we have interfaces that -** can use those DV libraries -*/ + * We might need to extend this logic to check whether we have interfaces that + * can use those DV libraries + */ int GDABackend::backend_can_run() { void *handle{nullptr}; /* Try opening bnxt DV libraries */ - handle = dlopen("libbnxt_re.so", RTLD_NOW); +#if defined(GDA_BNXT) + handle = bnxt_dv_dlopen(); if (handle) { dlclose(handle); return ROCSHMEM_SUCCESS; - } else { - /* Try hard-coded PATH */ - handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW); - if (handle) { - dlclose(handle); - return ROCSHMEM_SUCCESS; - } } +#endif //defined(GDA_BNXT) + + /* Try opening ionic DV libraries */ +#if defined(GDA_IONIC) + handle = ionic_dv_dlopen(); + if (handle) { + dlclose(handle); + return ROCSHMEM_SUCCESS; + } +#endif //defined(GDA_IONIC) /* Try opening mlx5 DV libraries */ - handle = dlopen("libmlx5.so", RTLD_NOW); +#if defined(GDA_MLX5) + handle = mlx5_dv_dlopen(); if (handle) { dlclose(handle); return ROCSHMEM_SUCCESS; } - - /* ToDo: opening ionic DV libraries */ +#endif //defined(GDA_MLX5) return ROCSHMEM_ERROR; } void GDABackend::setup_ibv() { - autodetect_dv_libs(); + open_dv_libs(); open_ib_device(); @@ -599,22 +588,22 @@ void GDABackend::cleanup_ibv() { CHECK_HIP(hipHostUnregister(db_region_attr.dbr)); for (int i = 0; i < qps.size(); i++) { - err = bnxtdv_ftable_.destroy_qp(qps[i]); + err = bnxt_re_dv.destroy_qp(qps[i]); CHECK_ZERO(err, "bnxt_re_dv_destroy_qp"); - err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.rq_umem_handle); + err = bnxt_re_dv.umem_dereg(bnxt_qps[i].attr.rq_umem_handle); CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (RQ)"); - err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.sq_umem_handle); + err = bnxt_re_dv.umem_dereg(bnxt_qps[i].attr.sq_umem_handle); CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (SQ)"); CHECK_HIP(hipFree(bnxt_qps[i].sq_buf)); CHECK_HIP(hipFree(bnxt_qps[i].rq_buf)); - err = bnxtdv_ftable_.destroy_cq(cqs[i]); + err = bnxt_re_dv.destroy_cq(cqs[i]); CHECK_ZERO(err, "bnxt_re_dv_destroy_cq"); - err = bnxtdv_ftable_.umem_dereg(bnxt_cqs[i].umem_handle); + err = bnxt_re_dv.umem_dereg(bnxt_cqs[i].umem_handle); CHECK_ZERO(err, "bnxt_re_dv_umem_dereg"); CHECK_HIP(hipFree(bnxt_cqs[i].buf)); @@ -647,13 +636,14 @@ void GDABackend::cleanup_ibv() { CHECK_ZERO(err, "ibv_close_device"); } -void GDABackend::autodetect_dv_libs() { +void GDABackend::open_dv_libs() { int ret; -#ifdef GDA_IONIC - gda_vendor = GDAVendor::IONIC; -#endif + //TODO: environment variable selection/deselection + //this hardcoded init order will always prefer BNXT>IONIC>MLX5 + //if all three drivers are installed +#if defined(GDA_BNXT) if (gda_vendor == GDAVendor::NONE) { ret = bnxt_dv_dl_init(); @@ -663,7 +653,21 @@ void GDABackend::autodetect_dv_libs() { DPRINTF("Initializing rocSHMEM BNXT GDA support failed\n"); } } +#endif // defined(GDA_BNXT) +#if defined(GDA_IONIC) + if (gda_vendor == GDAVendor::NONE) { + ret = ionic_dv_dl_init(); + + if (ret == ROCSHMEM_SUCCESS) { + gda_vendor = GDAVendor::IONIC; + } else { + DPRINTF("Initializing rocSHMEM IONIC GDA support failed\n"); + } + } +#endif // defined(GDA_IONIC) + +#if defined(GDA_MLX5) if (gda_vendor == GDAVendor::NONE) { ret = mlx5_dv_dl_init(); @@ -673,13 +677,26 @@ void GDABackend::autodetect_dv_libs() { DPRINTF("Initializing rocSHMEM MLX5 GDA support failed\n"); } } +#endif // defined(GDA_MLX5) if (gda_vendor == GDAVendor::NONE) { - printf("Initializing rocSHMEM with IONIC, BNXT, or MLX5 GDA support failed\n"); + printf("Initializing rocSHMEM with IONIC, BNXT, or MLX5 GDA support failed: no DV library found\n"); abort(); } } +void GDABackend::close_dv_libs() { + if (ionicdv_handle_ != nullptr) + dlclose(ionicdv_handle_); + + if (bnxtdv_handle_ != nullptr) + dlclose(bnxtdv_handle_); + + if (mlx5dv_handle_ != nullptr) + dlclose(mlx5dv_handle_); + + gda_vendor = GDAVendor::NONE; +} void GDABackend::exchange_qp_dest_info() { for (int i = 0; i < qps.size(); i++) { @@ -752,11 +769,7 @@ void GDABackend::setup_gpu_qps() { new (&host_qps[i]) QueuePair(pd_orig, gda_vendor); CHECK_HIP(hipMemcpy(&gpu_qps[i], &host_qps[i], sizeof(QueuePair), hipMemcpyDefault)); - if (gda_vendor == GDAVendor::BNXT) { - bnxt_initialize_gpu_qp(&gpu_qps[i], i); - } else { - initialize_gpu_qp(&gpu_qps[i], i); - } + initialize_gpu_qp(&gpu_qps[i], i); } } @@ -844,7 +857,7 @@ void GDABackend::modify_qps_reset_to_init() { for (int i =0; i < qps.size() ; i++) { if (gda_vendor == GDAVendor::BNXT) { - err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0); + err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0); } else { err = ibv_modify_qp(qps[i], &attr, attr_mask); } @@ -896,7 +909,7 @@ void GDABackend::modify_qps_init_to_rtr() { } if (gda_vendor == GDAVendor::BNXT) { - err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0); + err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0); } else { err = ibv_modify_qp(qps[i], &attr, attr_mask); } @@ -932,7 +945,7 @@ void GDABackend::modify_qps_rtr_to_rts() { attr.sq_psn = dest_info[i].psn; if (gda_vendor == GDAVendor::BNXT) { - err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0); + err = bnxt_re_dv.modify_qp(qps[i], &attr, attr_mask, 0, 0); } else { err = ibv_modify_qp(qps[i], &attr, attr_mask); } @@ -1057,19 +1070,9 @@ void GDABackend::create_parent_domain() { CHECK_NNULL(pd_parent, "ibv_alloc_parent_domain"); dump_ibv_pd(pd_parent); -#ifdef GDA_IONIC - ionic_dv_pd_set_sqcmb(pd_parent, false, false, false); - ionic_dv_pd_set_rqcmb(pd_parent, false, false, false); - - for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) { - pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(context, &pattr); - CHECK_NNULL(pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)"); - - ionic_dv_pd_set_sqcmb(pd_uxdma[uxdma_i], false, false, false); - ionic_dv_pd_set_rqcmb(pd_uxdma[uxdma_i], false, false, false); - ionic_dv_pd_set_udma_mask(pd_uxdma[uxdma_i], 1u << uxdma_i); + if (gda_vendor == GDAVendor::IONIC) { + ionic_setup_parent_domain(&pattr); } -#endif /* GDA_IONIC */ } void GDABackend::create_cqs(int cqe) { @@ -1099,123 +1102,18 @@ void GDABackend::create_cqs(int cqe) { } void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { - int hip_dev_id{-1}; - CHECK_HIP(hipGetDevice(&hip_dev_id)); - -#ifdef GDA_IONIC - ionic_dv_ctx dvctx; - ionic_dv_get_ctx(&dvctx, context); - - void* gpu_db_page = nullptr; - rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id); - - uint64_t *db_page_u64 = reinterpret_cast(dvctx.db_page); - uint64_t *gpu_db_page_u64 = reinterpret_cast(gpu_db_page); - - uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64]; - - gpu_db_page = gpu_db_page; - gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype]; - gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype]; - - uint8_t udma_idx = ionic_dv_qp_get_udma_idx(qps[conn_num]); - - ionic_dv_cq dvcq; - ionic_dv_get_cq(&dvcq, cqs[conn_num], udma_idx); - - gpu_qp->cq_dbreg = gpu_db_cq; - gpu_qp->cq_dbval = dvcq.q.db_val; - gpu_qp->cq_mask = dvcq.q.mask; - - gpu_qp->ionic_cq_buf = reinterpret_cast(dvcq.q.ptr); - - ionic_dv_qp dvqp; - ionic_dv_get_qp(&dvqp, qps[conn_num]); - - gpu_qp->sq_dbreg = gpu_db_sq; - gpu_qp->sq_dbval = dvqp.sq.db_val; - gpu_qp->sq_mask = dvqp.sq.mask; - gpu_qp->ionic_sq_buf = reinterpret_cast(dvqp.sq.ptr); - - strncpy(gpu_qp->dev_name, - qps[conn_num]->context->device->name, - sizeof(gpu_qp->dev_name)); - gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0; - - gpu_qp->qp_num = qps[conn_num]->qp_num; - gpu_qp->lkey = heap_mr->lkey; - gpu_qp->rkey = heap_rkey[conn_num % num_pes]; - gpu_qp->inline_threshold = 32; -#endif /* GDA_IONIC */ - if (gda_vendor == GDAVendor::MLX5) { - mlx5dv_cq cq_out; - mlx5dv_obj mlx_obj; - mlx_obj.cq.in = cqs[conn_num]; - mlx_obj.cq.out = &cq_out; - mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_CQ); - dump_mlx5dv_cq(&cq_out, conn_num); - - /* - * struct mlx5dv_cq { - * void *buf; - * __be32 *dbrec; - * uint32_t cqe_cnt; - * uint32_t cqe_size; - * void *cq_uar; - * uint32_t cqn; - * uint64_t comp_mask; - * }; - */ - - gpu_qp->cq_buf = reinterpret_cast(cq_out.buf); - gpu_qp->cq_cnt = cq_out.cqe_cnt; - gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt); - gpu_qp->cq_dbrec = cq_out.dbrec; - - mlx5dv_qp qp_out; - mlx_obj.qp.in = qps[conn_num]; - mlx_obj.qp.out = &qp_out; - mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_QP); - dump_mlx5dv_qp(&qp_out, conn_num); - - /* - * struct mlx5dv_qp { - * __be32 *dbrec; - * struct { - * void *buf; - * uint32_t wqe_cnt; - * uint32_t stride; - * } sq; - * struct { - * void *buf; - * uint32_t wqe_cnt; - * uint32_t stride; - * } rq; - * struct { - * void *reg; - * uint32_t size; - * } bf; - * uint64_t comp_mask; - * off_t uar_mmap_offset; - * uint32_t tirn; - * uint32_t tisn; - * uint32_t rqn; - * uint32_t sqn; - * uint64_t tir_icm_addr; - * }; - */ - - gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR - gpu_qp->sq_buf = reinterpret_cast(qp_out.sq.buf); - gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt; - gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]); - gpu_qp->lkey = htobe32(heap_mr->lkey); - gpu_qp->qp_num = qps[conn_num]->qp_num; - gpu_qp->inline_threshold = inline_threshold; - // The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers - void* gpu_ptr{nullptr}; - rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id); - gpu_qp->db.ptr = reinterpret_cast(gpu_ptr); + switch (gda_vendor) { + case GDAVendor::IONIC: + ionic_initialize_gpu_qp(gpu_qp, conn_num); + break; + case GDAVendor::BNXT: + bnxt_initialize_gpu_qp(gpu_qp, conn_num); + break; + case GDAVendor::MLX5: + mlx5_initialize_gpu_qp(gpu_qp, conn_num); + break; + default: + assert(false /* GDAVendor initialize_gpu_qp */); } } diff --git a/projects/rocshmem/src/gda/backend_gda.hpp b/projects/rocshmem/src/gda/backend_gda.hpp index 65f07edc55..71bb4024c6 100644 --- a/projects/rocshmem/src/gda/backend_gda.hpp +++ b/projects/rocshmem/src/gda/backend_gda.hpp @@ -40,32 +40,6 @@ #include "gda/bnxt/provider_gda_bnxt.hpp" #include "gda/mlx5/provider_gda_mlx5.hpp" -struct bnxtdv_funcs_t { - int (*init_obj)(struct bnxt_re_dv_obj *obj, uint64_t obj_type); - struct ibv_qp* (*create_qp)(struct ibv_pd *pd, - struct bnxt_re_dv_qp_init_attr *qp_attr); - int (*destroy_qp)(struct ibv_qp *ibvqp); - int (*modify_qp)(struct ibv_qp *ibv_qp, struct ibv_qp_attr *attr, - int attr_mask, uint32_t type, uint32_t value); - int (*qp_mem_alloc)(struct ibv_pd *ibvpd, - struct ibv_qp_init_attr *attr, - struct bnxt_re_dv_qp_mem_info *dv_qp_mem); - struct ibv_cq* (*create_cq)(struct ibv_context *ibvctx, - struct bnxt_re_dv_cq_init_attr *cq_attr); - int (*destroy_cq)(struct ibv_cq *ibv_cq); - void* (*cq_mem_alloc)(struct ibv_context *ibvctx, int num_cqe, - struct bnxt_re_dv_cq_attr *cq_attr); - void* (*umem_reg)(struct ibv_context *ibvctx, - struct bnxt_re_dv_umem_reg_attr *in); - int (*umem_dereg)(void *umem_handle); - int (*get_default_db_region)(struct ibv_context *ibvctx, - struct bnxt_re_dv_db_region_attr *out); -}; - -struct mlx5dv_funcs_t { - int (*init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type); -}; - namespace rocshmem { class GDAContext; @@ -325,6 +299,8 @@ class GDABackend : public Backend { void initialize_gpu_qp(QueuePair* qp, int conn_num); void bnxt_initialize_gpu_qp(QueuePair* qp, int conn_num); + void ionic_initialize_gpu_qp(QueuePair* qp, int conn_num); + void mlx5_initialize_gpu_qp(QueuePair* qp, int conn_num); /** * @brief Setup InfiniBand Resources @@ -337,9 +313,14 @@ class GDABackend : public Backend { void cleanup_ibv(); /** - * @brief Detect the available direct verbs libraries + * @brief Detect and load the available direct verbs libraries */ - void autodetect_dv_libs(); + void open_dv_libs(); + + /** + * @ brief Close opened direct verbs libraries + */ + void close_dv_libs(); /** * @brief Open InfiniBand Device and create common structures @@ -404,6 +385,7 @@ class GDABackend : public Backend { static void pd_release(ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type); void create_parent_domain(); + void ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* pattr); void setup_gpu_qps(); void cleanup_gpu_qps(); @@ -509,7 +491,7 @@ class GDABackend : public Backend { * @brief structures holding the function pointers to the direct verbs functionality * of each network driver. */ - bnxtdv_funcs_t bnxtdv_ftable_; + bnxtdv_funcs_t bnxt_re_dv; /** * @brief handle used for the dlopen of the BCOM library @@ -521,11 +503,16 @@ class GDABackend : public Backend { */ int bnxt_dv_dl_init(); + /** + * @brief open bnxt dv lib + */ + static void* bnxt_dv_dlopen(); + /** * @brief structures holding the function pointers to the direct verbs functionality * of each network driver. */ - mlx5dv_funcs_t mlx5dv_ftable_; + mlx5dv_funcs_t mlx5dv; /** * @brief handle used for the dlopen of the MLX5 library @@ -536,6 +523,32 @@ class GDABackend : public Backend { * @brief initialize function table for MLNX direct verbs support */ int mlx5_dv_dl_init(); + + /** + * @brief open mlx5 dv lib + */ + static void* mlx5_dv_dlopen(); + + /** + * @brief structures holding the function pointers to the direct verbs functionality + * of each network driver. + */ + ionicdv_funcs_t ionic_dv; + + /** + * @brief handle used for the dlopen of the IONIC library + */ + void *ionicdv_handle_{nullptr}; + + /** + * @brief initialize function table for IONIC direct verbs support + */ + int ionic_dv_dl_init(); + + /** + * @brief open ionic dv lib + */ + static void* ionic_dv_dlopen(); }; } // namespace rocshmem diff --git a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp index 46fc5d87e9..a61f20fb49 100644 --- a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp +++ b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp @@ -42,7 +42,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { dv_obj.cq.in = cqs[conn_num]; dv_obj.cq.out = &dv_cq; - err = bnxtdv_ftable_.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ); + err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_CQ); CHECK_ZERO(err, "bnxt_re_dv_init_obj(CQ)"); memset(&gpu_qp->cq, 0, sizeof(bnxt_device_cq)); @@ -56,7 +56,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { dv_obj.qp.in = ib_qp; dv_obj.qp.out = &dv_qp; - err = bnxtdv_ftable_.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP); + err = bnxt_re_dv.init_obj(&dv_obj, BNXT_RE_DV_OBJ_QP); CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)"); memset(&gpu_qp->sq, 0, sizeof(bnxt_device_sq)); @@ -76,7 +76,7 @@ void GDABackend::bnxt_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { gpu_qp->sq.mtu = ibv_mtu_to_int(portinfo.active_mtu); /* Export DB */ - err = bnxtdv_ftable_.get_default_db_region(context, &db_region_attr); + err = bnxt_re_dv.get_default_db_region(context, &db_region_attr); CHECK_ZERO(err, "bnxt_re_dv_init_obj(QP)"); CHECK_HIP(hipHostRegister(db_region_attr.dbr, getpagesize(), hipHostRegisterDefault)); @@ -98,7 +98,7 @@ void GDABackend::bnxt_create_cqs(int cqe) { for (int i = 0; i < qps.size(); i++) { /* Allocate CQ mem */ memset(&cq_attr, 0, sizeof(struct bnxt_re_dv_cq_attr)); - bnxt_cqs[i].handle = bnxtdv_ftable_.cq_mem_alloc(context, cqe, &cq_attr); + bnxt_cqs[i].handle = bnxt_re_dv.cq_mem_alloc(context, cqe, &cq_attr); CHECK_NNULL(bnxt_cqs[i].handle, "bnxt_re_dv_cq_mem_alloc"); /* Allocate CQ UMEM */ @@ -112,7 +112,7 @@ void GDABackend::bnxt_create_cqs(int cqe) { umem_attr.size = bnxt_cqs[i].length; umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE; - bnxt_cqs[i].umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr); + bnxt_cqs[i].umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr); CHECK_NNULL(bnxt_cqs[i].umem_handle, "bnxt_re_dv_umem_reg(cq_buf)"); /* Create CQ */ @@ -121,7 +121,7 @@ void GDABackend::bnxt_create_cqs(int cqe) { cq_init_attr.umem_handle = bnxt_cqs[i].umem_handle; cq_init_attr.ncqe = cq_attr.ncqe; - cqs[i] = bnxtdv_ftable_.create_cq(context, &cq_init_attr); + cqs[i] = bnxt_re_dv.create_cq(context, &cq_init_attr); CHECK_NNULL(cqs[i], "bnxt_re_dv_create_cq"); } } @@ -152,7 +152,7 @@ void GDABackend::bnxt_create_qps(int sq_length) { /* Alloc qp_mem_info */ memset(&bnxt_qps[i].mem_info, 0, sizeof(struct bnxt_re_dv_qp_mem_info)); - err = bnxtdv_ftable_.qp_mem_alloc(pd_orig, &ib_qp_attr, &bnxt_qps[i].mem_info); + err = bnxt_re_dv.qp_mem_alloc(pd_orig, &ib_qp_attr, &bnxt_qps[i].mem_info); CHECK_ZERO(err, "bnxt_re_dv_qp_mem_alloc"); /* Alloc SQ */ @@ -177,7 +177,7 @@ void GDABackend::bnxt_create_qps(int sq_length) { umem_attr.size = bnxt_qps[i].mem_info.sq_len; umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE; - sq_umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr); + sq_umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr); CHECK_NNULL(sq_umem_handle, "bnxt_re_dv_umem_reg(sq)"); memset(&umem_attr, 0, sizeof(struct bnxt_re_dv_umem_reg_attr)); @@ -185,7 +185,7 @@ void GDABackend::bnxt_create_qps(int sq_length) { umem_attr.size = bnxt_qps[i].mem_info.rq_len; umem_attr.access_flags = IBV_ACCESS_LOCAL_WRITE; - rq_umem_handle = bnxtdv_ftable_.umem_reg(context, &umem_attr); + rq_umem_handle = bnxt_re_dv.umem_reg(context, &umem_attr); CHECK_NNULL(rq_umem_handle, "bnxt_re_dv_umem_reg(rq)"); /* IB DV QP Init Attr */ @@ -214,33 +214,40 @@ void GDABackend::bnxt_create_qps(int sq_length) { bnxt_qps[i].attr.comp_mask = bnxt_qps[i].mem_info.comp_mask; /* Alloc QP */ - qps[i] = bnxtdv_ftable_.create_qp(pd_orig, &bnxt_qps[i].attr); + qps[i] = bnxt_re_dv.create_qp(pd_orig, &bnxt_qps[i].attr); CHECK_NNULL(qps[i], "bnxt_re_dv_create_qp"); } } -int GDABackend::bnxt_dv_dl_init() { - bnxtdv_handle_ = dlopen("libbnxt_re.so", RTLD_NOW); - if (!bnxtdv_handle_) { +void* GDABackend::bnxt_dv_dlopen() { + void* dv_handle{nullptr}; + dv_handle = dlopen("libbnxt_re.so", RTLD_NOW); + if (!dv_handle) { // Try hard-coded PATH - bnxtdv_handle_ = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW); - if (!bnxtdv_handle_) { + dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW); + if (!dv_handle) { DPRINTF("Could not open libbnxt_re.so. Returning\n"); - return ROCSHMEM_ERROR; } } + return dv_handle; +} - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, init_obj); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, create_qp); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, destroy_qp); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, modify_qp); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, qp_mem_alloc); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, create_cq); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, destroy_cq); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, cq_mem_alloc); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, umem_reg); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, umem_dereg); - DLSYM_HELPER(bnxtdv_ftable_, bnxt_re_dv_, bnxtdv_handle_, get_default_db_region); +int GDABackend::bnxt_dv_dl_init() { + bnxtdv_handle_ = bnxt_dv_dlopen(); + if (!bnxtdv_handle_) + return ROCSHMEM_ERROR; + + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, init_obj); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, create_qp); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, destroy_qp); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, modify_qp); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, qp_mem_alloc); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, create_cq); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, destroy_cq); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, cq_mem_alloc); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_reg); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, umem_dereg); + DLSYM_HELPER(bnxt_re_dv, bnxt_re_dv_, bnxtdv_handle_, get_default_db_region); return ROCSHMEM_SUCCESS; } diff --git a/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp b/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp index dd83fa58c5..631e42d21c 100644 --- a/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp +++ b/projects/rocshmem/src/gda/bnxt/provider_gda_bnxt.hpp @@ -83,4 +83,26 @@ struct bnxt_host_qp { /*****************************************************************************/ +struct bnxtdv_funcs_t { + int (*init_obj)(struct bnxt_re_dv_obj *obj, uint64_t obj_type); + struct ibv_qp* (*create_qp)(struct ibv_pd *pd, + struct bnxt_re_dv_qp_init_attr *qp_attr); + int (*destroy_qp)(struct ibv_qp *ibvqp); + int (*modify_qp)(struct ibv_qp *ibv_qp, struct ibv_qp_attr *attr, + int attr_mask, uint32_t type, uint32_t value); + int (*qp_mem_alloc)(struct ibv_pd *ibvpd, + struct ibv_qp_init_attr *attr, + struct bnxt_re_dv_qp_mem_info *dv_qp_mem); + struct ibv_cq* (*create_cq)(struct ibv_context *ibvctx, + struct bnxt_re_dv_cq_init_attr *cq_attr); + int (*destroy_cq)(struct ibv_cq *ibv_cq); + void* (*cq_mem_alloc)(struct ibv_context *ibvctx, int num_cqe, + struct bnxt_re_dv_cq_attr *cq_attr); + void* (*umem_reg)(struct ibv_context *ibvctx, + struct bnxt_re_dv_umem_reg_attr *in); + int (*umem_dereg)(void *umem_handle); + int (*get_default_db_region)(struct ibv_context *ibvctx, + struct bnxt_re_dv_db_region_attr *out); +}; + #endif //LIBRARY_SRC_GDA_BNXT_GDA_PROVIDER_HPP_ diff --git a/projects/rocshmem/src/gda/ionic/CMakeLists.txt b/projects/rocshmem/src/gda/ionic/CMakeLists.txt new file mode 100644 index 0000000000..896fb2c4dd --- /dev/null +++ b/projects/rocshmem/src/gda/ionic/CMakeLists.txt @@ -0,0 +1,36 @@ +############################################################################### +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +############################################################################### +target_sources( + ${PROJECT_NAME} + PRIVATE + backend_gda_ionic.cpp +) + +if(GDA_IONIC) +target_sources( + ${PROJECT_NAME} + PRIVATE + queue_pair_ionic.cpp +) +endif() diff --git a/projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp b/projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp new file mode 100644 index 0000000000..ef62590f45 --- /dev/null +++ b/projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp @@ -0,0 +1,122 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "gda/backend_gda.hpp" +#include "util.hpp" + +namespace rocshmem { + +void GDABackend::ionic_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { + ionic_dv_ctx dvctx; + ionic_dv.get_ctx(&dvctx, context); + + int hip_dev_id{-1}; + CHECK_HIP(hipGetDevice(&hip_dev_id)); + + void* gpu_db_page = nullptr; + rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id); + + uint64_t *db_page_u64 = reinterpret_cast(dvctx.db_page); + uint64_t *gpu_db_page_u64 = reinterpret_cast(gpu_db_page); + + uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64]; + + gpu_db_page = gpu_db_page; + gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype]; + gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype]; + + uint8_t udma_idx = ionic_dv.qp_get_udma_idx(qps[conn_num]); + + ionic_dv_cq dvcq; + ionic_dv.get_cq(&dvcq, cqs[conn_num], udma_idx); + + gpu_qp->cq_dbreg = gpu_db_cq; + gpu_qp->cq_dbval = dvcq.q.db_val; + gpu_qp->cq_mask = dvcq.q.mask; + + gpu_qp->ionic_cq_buf = reinterpret_cast(dvcq.q.ptr); + + ionic_dv_qp dvqp; + ionic_dv.get_qp(&dvqp, qps[conn_num]); + + gpu_qp->sq_dbreg = gpu_db_sq; + gpu_qp->sq_dbval = dvqp.sq.db_val; + gpu_qp->sq_mask = dvqp.sq.mask; + gpu_qp->ionic_sq_buf = reinterpret_cast(dvqp.sq.ptr); + + strncpy(gpu_qp->dev_name, + qps[conn_num]->context->device->name, + sizeof(gpu_qp->dev_name)); + gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0; + + gpu_qp->qp_num = qps[conn_num]->qp_num; + gpu_qp->lkey = heap_mr->lkey; + gpu_qp->rkey = heap_rkey[conn_num % num_pes]; + gpu_qp->inline_threshold = 32; +} + +void GDABackend::ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* pattr) { + ionic_dv.pd_set_sqcmb(pd_parent, false, false, false); + ionic_dv.pd_set_rqcmb(pd_parent, false, false, false); + + for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) { + pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(context, pattr); + CHECK_NNULL(pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)"); + + ionic_dv.pd_set_sqcmb(pd_uxdma[uxdma_i], false, false, false); + ionic_dv.pd_set_rqcmb(pd_uxdma[uxdma_i], false, false, false); + ionic_dv.pd_set_udma_mask(pd_uxdma[uxdma_i], 1u << uxdma_i); + } +} + +void* GDABackend::ionic_dv_dlopen() { + void* dv_handle{nullptr}; + dv_handle = dlopen("libionic.so", RTLD_NOW); + if (!dv_handle) { + // Try hard-coded PATH + dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_NOW); + if (!dv_handle) { + DPRINTF("Could not open libionic.so. Returning\n"); + } + } + return dv_handle; +} + +int GDABackend::ionic_dv_dl_init() { + ionicdv_handle_ = ionic_dv_dlopen(); + if (!ionicdv_handle_) + return ROCSHMEM_ERROR; + + DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_ctx); + DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, qp_get_udma_idx); + DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_cq); + DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, get_qp); + DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_sqcmb); + DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_rqcmb); + DLSYM_HELPER(ionic_dv, ionic_dv_, ionicdv_handle_, pd_set_udma_mask); + + return ROCSHMEM_SUCCESS; +} + +} // namespace rocshmem diff --git a/projects/rocshmem/src/gda/ionic/ionic_dv.h b/projects/rocshmem/src/gda/ionic/ionic_dv.h new file mode 100644 index 0000000000..2b1821f1de --- /dev/null +++ b/projects/rocshmem/src/gda/ionic/ionic_dv.h @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + */ + +#ifndef IONIC_DV_H +#define IONIC_DV_H + +#include +#include + +struct ibv_cq; +struct ibv_qp; + +/** IONIC_PD_TAG - tag used for parent domain resource allocation. */ +#define IONIC_PD_TAG ((uint64_t)RDMA_DRIVER_IONIC << 32) +#define IONIC_PD_TAG_CQ (IONIC_PD_TAG | 1) +#define IONIC_PD_TAG_SQ (IONIC_PD_TAG | 2) +#define IONIC_PD_TAG_RQ (IONIC_PD_TAG | 3) +#define IONIC_PD_TAG_RCQ (IONIC_PD_TAG | 4) + +/* deprecated */ +#define IONIC_SQ_SIG_ALL 1 +/* deprecated */ +#define IONIC_SQ_SIG_HACK_HIGH 2 + +/** IONIC_UDMA_MASK_LOW - flag represents the udma0 pipeline in the udma mask. */ +#define IONIC_UDMA_MASK_LOW 1 +/** IONIC_UDMA_MASK_HIGH - flag represents the udma1 pipeline in the udma mask. */ +#define IONIC_UDMA_MASK_HIGH 2 + +#define IONIC_DV_PUEC_NPORTS_MAX 8 + +/** struct ionic_dv_ctx - Context information for gpu-initiated rdma. */ +struct ionic_dv_ctx { + void *db_page; + uint64_t *db_ptr; + uint8_t sq_qtype; + uint8_t rq_qtype; + uint8_t cq_qtype; +}; + +/** struct ionic_dv_ctx - Queue information for gpu-initiated rdma. */ +struct ionic_dv_queue { + void *ptr; + size_t size; + uint64_t db_val; + uint16_t mask; + uint8_t depth_log2; + uint8_t stride_log2; +}; + +/** struct ionic_dv_ctx - CQ information for gpu-initiated rdma. */ +struct ionic_dv_cq { + struct ionic_dv_queue q; +}; + +/** struct ionic_dv_ctx - QP information for gpu-initiated rdma. */ +struct ionic_dv_qp { + struct ionic_dv_queue rq; + struct ionic_dv_queue sq; +}; + +/** struct ionic_puec_route - Info needed to setup a PUEC plane route. */ +struct ionic_dv_puec_route { + union ibv_gid dgid; + union ibv_gid sgid; + uint32_t flow_label; + uint8_t hop_limit; + uint8_t traffic_class; + uint8_t sl; + uint8_t rsvd[5]; + uint32_t flags; +}; + +/** + * ionic_dv_is_ionic_ctx - Test if context belongs to ionic provider. + */ +bool ionic_dv_is_ionic_ctx(struct ibv_context *ibctx); + +/** + * ionic_dv_is_ionic_pd - Test if pd belongs to ionic provider. + */ +bool ionic_dv_is_ionic_pd(struct ibv_pd *ibpd); + +/** + * ionic_dv_is_ionic_cq - Test if cq belongs to ionic provider. + */ +bool ionic_dv_is_ionic_cq(struct ibv_cq *ibcq); + +/** + * ionic_dv_is_ionic_qp - Test if qp belongs to ionic provider. + */ +bool ionic_dv_is_ionic_qp(struct ibv_qp *ibqp); + + +/** + * ionic_dv_ctx_get_udma_count - Get number of udma pipelines. + */ +uint8_t ionic_dv_ctx_get_udma_count(struct ibv_context *ibctx); + +/** + * ionic_dv_ctx_get_udma_mask - Get mask of udma pipeline ids. + */ +uint8_t ionic_dv_ctx_get_udma_mask(struct ibv_context *ibctx); + +/** + * ionic_dv_pd_get_udma_mask - Get mask of udma pipeline ids of pd or parent domain. + */ +uint8_t ionic_dv_pd_get_udma_mask(struct ibv_pd *ibpd); + +/** + * ionic_dv_pd_set_udma_mask - Restrict pipeline ids of pd or parent domain. + * + * Queues associated with this pd will be restricted to one of the pipelines enabled by + * the mask at the time of queue creation. + * + * Recommended usage is to create a pd, then parent domains of that pd for each different + * udma mask. Set the desired udma mask on each parent domain. Then, create queues + * associated with the parent domain with the desired udma mask. + * + * Alternative usage is to create a pd, and set the desired udma mask prior to creating + * each queue. Changing the udma mask of the pd has no effect on previously created + * queues. + */ +int ionic_dv_pd_set_udma_mask(struct ibv_pd *ibpd, uint8_t udma_mask); + +/** + * ionic_dv_cq_get_udma_mask - Get mask of udma pipeline ids of completion queueue. + */ +uint8_t ionic_dv_cq_get_udma_mask(struct ibv_cq *ibcq); + +/** + * ionic_dv_qp_get_udma_idx - Get udma pipeline id of queueue pair. + */ +uint8_t ionic_dv_qp_get_udma_idx(struct ibv_qp *ibqp); + + +/** + * ionic_dv_pd_set_sqcmb - Specify send queue preference for controller memory bar. + * + * Send queues associated with this pd will use the controller memory bar according to + * this preference at the time of queue creation. + * + * @enable - Allow the use of the controller memory bar. + * @expdb - Allow the use of express doorbell optimizations. + * @require - Require preferences to be met, no fallback. + */ +int ionic_dv_pd_set_sqcmb(struct ibv_pd *ibpd, bool enable, bool expdb, bool require); + +/** + * ionic_dv_pd_set_rqcmb - Specify receive queue preference for controller memory bar. + * + * See ionic_dv_pd_set_sqcmb(). + */ +int ionic_dv_pd_set_rqcmb(struct ibv_pd *ibpd, bool enable, bool expdb, bool require); + + +/** + * ionic_dv_qp_set_gda - Enable or disable GPU-Direct Async (GDA) mode. + * + * In GDA mode, when the application calls ibv_post_send() or ibv_post_recv(), the + * provider writes WQEs in the descriptor ring without ringing the doorbell. + * + * To ring the doorbell, after posting the work the application should query to get the + * doorbell data, and later write that data to the memory mapped doorbell register. + * + * See also: ionic_dv_get_ctx() + * See also: ionic_dv_qp_get_send_dbell_data() + * See also: ionic_dv_qp_get_recv_dbell_data() + * + * @ibqp - Set GDA mode for this queue pair. + * @enable_send - Enable GDA mode for the send queue. + * @enable_recv - Enable GDA mode for the recv queue. + */ +int ionic_dv_qp_set_gda(struct ibv_qp *ibqp, bool enable_send, bool enable_recv); + +/** + * ionic_dv_qp_get_send_dbell_data - Get send queue doorbell data. + * + * In GDA mode, when the application calls ibv_post_send() the provider writes WQEs in + * the descriptor ring without ringing the doorbell. The application should query the + * doorbell data immediately after posting the work. The application requests the + * GPU to fill the source buffers of the data transfer with the result of computation. + * The application requests the GPU to write the doorbell data to the memory mapped + * doorbell register immediately when the computation is complete, triggering the data + * transfer. + * + * It is important that the GPU ring the doorbell in sequential order. If work requests + * are posted in batches A, B, and C, with respective doorbell data, the data path must + * not write B or C before A, and must not write C before B. It is ok to skip writing a + * doorbell, like writing only C, which will trigger the data transfer for all of the + * work up to that point in the sequence. + * + * @ibqp - Get send doorbell data for this queue pair. + * @dbdata - Output parameter for doorbell data. + */ +int ionic_dv_qp_get_send_dbell_data(struct ibv_qp *ibqp, uint64_t *dbdata); + +/** + * ionic_dv_qp_get_recv_dbell_data - Get recv queue doorbell data. + * + * In GDA mode, when the application calls ibv_post_recv() the provider writes WQEs in + * the descriptor ring without ringing the doorbell. After polling recv completions, the + * application can immediately re-post the receive buffers without ringing the doorbell. + * The application should query the doorbell data immediately after posting the buffers. + * The application requests the GPU consume the data from the receive buffers. The + * application requests the GPU to write the doorbell data to the memory mapped doorbell + * register immediately after the received data is consumed, making the buffers available + * for the next data transfer. + * + * It is important that the GPU ring the doorbell in sequential order. If work requests + * are posted in batches A, B, and C, with respective doorbell data, the data path must + * not write B or C before A, and must not write C before B. It is ok to skip writing a + * doorbell, like writing only C, which will make buffers available up to that point in + * the sequence. + * + * @ibqp - Get recv doorbell data for this queue pair. + * @dbdata - Output parameter for doorbell data. + */ +int ionic_dv_qp_get_recv_dbell_data(struct ibv_qp *ibqp, uint64_t *dbdata); + + +/** + * ionic_dv_get_ctx - Extract context information for gpu-initiated rdma. + */ +int ionic_dv_get_ctx(struct ionic_dv_ctx *dvctx, struct ibv_context *ibctx); + +/** + * ionic_dv_get_cq - Extract cq information for gpu-initiated rdma. + */ +int ionic_dv_get_cq(struct ionic_dv_cq *dvcq, struct ibv_cq *ibcq, uint8_t udma_idx); + +/** + * ionic_dv_get_qp - Extract qp information for gpu-initiated rdma. + */ +int ionic_dv_get_qp(struct ionic_dv_qp *dvqp, struct ibv_qp *ibqp); + +/** + * ionic_dv_qp_set_puec_plane_route - set route info for a PUEC plane. + */ +int ionic_dv_qp_set_puec_plane_route(struct ibv_qp *ibqp, uint8_t plane_idx, + struct ionic_dv_puec_route *ipr); + +#endif /* IONIC_DV_H */ diff --git a/projects/rocshmem/src/gda/ionic/ionic_fw.h b/projects/rocshmem/src/gda/ionic/ionic_fw.h new file mode 100644 index 0000000000..a7bd2fdae6 --- /dev/null +++ b/projects/rocshmem/src/gda/ionic/ionic_fw.h @@ -0,0 +1,534 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2018-2022 Pensando Systems, Inc. All rights reserved. + * 2022-2024 Advanced Micro Devices, Inc. All rights reserved. + */ + +#ifndef IONIC_FW_H +#define IONIC_FW_H + +#if !defined(__cplusplus) +#include +#else +#ifndef BIT +#define BIT(n) (1u << (n)) +#endif +#endif + +#define IONIC_EXP_DBELL_SZ 8 + +/* common to all versions */ + +/* wqe scatter gather element */ +struct ionic_sge { + __be64 va; + __be32 len; + __be32 lkey; +}; + +/* admin queue mr type */ +enum ionic_mr_flags { + /* bits that determine mr access */ + IONIC_MRF_LOCAL_WRITE = BIT(0), + IONIC_MRF_REMOTE_WRITE = BIT(1), + IONIC_MRF_REMOTE_READ = BIT(2), + IONIC_MRF_REMOTE_ATOMIC = BIT(3), + IONIC_MRF_MW_BIND = BIT(4), + IONIC_MRF_ZERO_BASED = BIT(5), + IONIC_MRF_ON_DEMAND = BIT(6), + IONIC_MRF_PB = BIT(7), + IONIC_MRF_ACCESS_MASK = BIT(12) - 1, + + /* bits that determine mr type */ + IONIC_MRF_IS_MW = BIT(14), + IONIC_MRF_INV_EN = BIT(15), + + /* base flags combinations for mr types */ + IONIC_MRF_USER_MR = 0, + IONIC_MRF_PHYS_MR = IONIC_MRF_INV_EN, + IONIC_MRF_MW_1 = IONIC_MRF_IS_MW, + IONIC_MRF_MW_2 = IONIC_MRF_IS_MW | IONIC_MRF_INV_EN, +}; + +static inline int to_ionic_mr_flags(int access) +{ + int flags = 0; + + if (access & IBV_ACCESS_LOCAL_WRITE) + flags |= IONIC_MRF_LOCAL_WRITE; + + if (access & IBV_ACCESS_REMOTE_READ) + flags |= IONIC_MRF_REMOTE_READ; + + if (access & IBV_ACCESS_REMOTE_WRITE) + flags |= IONIC_MRF_REMOTE_WRITE; + + if (access & IBV_ACCESS_REMOTE_ATOMIC) + flags |= IONIC_MRF_REMOTE_ATOMIC; + + if (access & IBV_ACCESS_MW_BIND) + flags |= IONIC_MRF_MW_BIND; + + if (access & IBV_ACCESS_ZERO_BASED) + flags |= IONIC_MRF_ZERO_BASED; + + return flags; +} + +/* cqe status indicated in status_length field when err bit is set */ +enum ionic_status { + IONIC_STS_OK, + IONIC_STS_LOCAL_LEN_ERR, + IONIC_STS_LOCAL_QP_OPER_ERR, + IONIC_STS_LOCAL_PROT_ERR, + IONIC_STS_WQE_FLUSHED_ERR, + IONIC_STS_MEM_MGMT_OPER_ERR, + IONIC_STS_BAD_RESP_ERR, + IONIC_STS_LOCAL_ACC_ERR, + IONIC_STS_REMOTE_INV_REQ_ERR, + IONIC_STS_REMOTE_ACC_ERR, + IONIC_STS_REMOTE_OPER_ERR, + IONIC_STS_RETRY_EXCEEDED, + IONIC_STS_RNR_RETRY_EXCEEDED, + IONIC_STS_XRC_VIO_ERR, +}; + +static inline int ionic_to_ibv_status(int sts) +{ + switch (sts) { + case IONIC_STS_OK: + return IBV_WC_SUCCESS; + case IONIC_STS_LOCAL_LEN_ERR: + return IBV_WC_LOC_LEN_ERR; + case IONIC_STS_LOCAL_QP_OPER_ERR: + return IBV_WC_LOC_QP_OP_ERR; + case IONIC_STS_LOCAL_PROT_ERR: + return IBV_WC_LOC_PROT_ERR; + case IONIC_STS_WQE_FLUSHED_ERR: + return IBV_WC_WR_FLUSH_ERR; + case IONIC_STS_MEM_MGMT_OPER_ERR: + return IBV_WC_MW_BIND_ERR; + case IONIC_STS_BAD_RESP_ERR: + return IBV_WC_BAD_RESP_ERR; + case IONIC_STS_LOCAL_ACC_ERR: + return IBV_WC_LOC_ACCESS_ERR; + case IONIC_STS_REMOTE_INV_REQ_ERR: + return IBV_WC_REM_INV_REQ_ERR; + case IONIC_STS_REMOTE_ACC_ERR: + return IBV_WC_REM_ACCESS_ERR; + case IONIC_STS_REMOTE_OPER_ERR: + return IBV_WC_REM_OP_ERR; + case IONIC_STS_RETRY_EXCEEDED: + return IBV_WC_RETRY_EXC_ERR; + case IONIC_STS_RNR_RETRY_EXCEEDED: + return IBV_WC_RNR_RETRY_EXC_ERR; + case IONIC_STS_XRC_VIO_ERR: + default: + return IBV_WC_GENERAL_ERR; + } +} + +/* fw abi v1 */ + +/* data payload part of v1 wqe */ +union ionic_v1_pld { + struct ionic_sge sgl[2]; + __be32 spec32[8]; + __be16 spec16[16]; + __u8 data[32]; +}; + +/* completion queue v1 cqe */ +struct ionic_v1_cqe { + union { + struct { + __le64 wqe_idx_timestamp; + __be32 src_qpn_op; + __u8 src_mac[6]; + __be16 vlan_tag; + __be32 imm_data_rkey; + } recv; + struct { + __u8 rsvd[4]; + __be32 msg_msn; + __u8 rsvd2[8]; + __le64 npg_wqe_idx_timestamp; + } send; + }; + __be32 status_length; + __be32 qid_type_flags; +}; + +/* bits for cqe wqe_idx and timestamp */ +enum ionic_v1_cqe_wqe_idx_timestamp_bits { + IONIC_V1_CQE_WQE_IDX_MASK = 0xffff, + IONIC_V1_CQE_TIMESTAMP_SHIFT = 16, +}; + +/* bits for cqe recv */ +enum ionic_v1_cqe_src_qpn_bits { + IONIC_V1_CQE_RECV_QPN_MASK = 0xffffff, + IONIC_V1_CQE_RECV_OP_SHIFT = 24, + + /* MASK could be 0x3, but need 0x1f for makeshift values: + * OP_TYPE_RDMA_OPER_WITH_IMM, OP_TYPE_SEND_RCVD + */ + IONIC_V1_CQE_RECV_OP_MASK = 0x1f, + IONIC_V1_CQE_RECV_OP_SEND = 0, + IONIC_V1_CQE_RECV_OP_SEND_INV = 1, + IONIC_V1_CQE_RECV_OP_SEND_IMM = 2, + IONIC_V1_CQE_RECV_OP_RDMA_IMM = 3, + + IONIC_V1_CQE_RECV_IS_IPV4 = BIT(7 + IONIC_V1_CQE_RECV_OP_SHIFT), + IONIC_V1_CQE_RECV_IS_VLAN = BIT(6 + IONIC_V1_CQE_RECV_OP_SHIFT), +}; + +/* bits for cqe qid_type_flags */ +enum ionic_v1_cqe_qtf_bits { + IONIC_V1_CQE_COLOR = BIT(0), + IONIC_V1_CQE_ERROR = BIT(1), + IONIC_V1_CQE_TYPE_SHIFT = 5, + IONIC_V1_CQE_TYPE_MASK = 0x7, + IONIC_V1_CQE_QID_SHIFT = 8, + + IONIC_V1_CQE_TYPE_RECV = 1, + IONIC_V1_CQE_TYPE_SEND_MSN = 2, + IONIC_V1_CQE_TYPE_SEND_NPG = 3, + IONIC_V1_CQE_TYPE_RECV_INDIR = 4, +}; + +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_HCC__) + +static inline bool ionic_v1_cqe_color(struct ionic_v1_cqe *cqe) +{ + return !!(cqe->qid_type_flags & htobe32(IONIC_V1_CQE_COLOR)); +} + +static inline bool ionic_v1_cqe_error(struct ionic_v1_cqe *cqe) +{ + return !!(cqe->qid_type_flags & htobe32(IONIC_V1_CQE_ERROR)); +} + +static inline bool ionic_v1_cqe_recv_is_ipv4(struct ionic_v1_cqe *cqe) +{ + return !!(cqe->recv.src_qpn_op & + htobe32(IONIC_V1_CQE_RECV_IS_IPV4)); +} + +static inline bool ionic_v1_cqe_recv_is_vlan(struct ionic_v1_cqe *cqe) +{ + return !!(cqe->recv.src_qpn_op & + htobe32(IONIC_V1_CQE_RECV_IS_VLAN)); +} + +static inline void ionic_v1_cqe_clean(struct ionic_v1_cqe *cqe) +{ + cqe->qid_type_flags |= htobe32(~0u << IONIC_V1_CQE_QID_SHIFT); +} + +static inline uint32_t ionic_v1_cqe_qtf(struct ionic_v1_cqe *cqe) +{ + return be32toh(cqe->qid_type_flags); +} + +#endif // !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_HCC__) + +static inline uint8_t ionic_v1_cqe_qtf_type(uint32_t qtf) +{ + return (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; +} + +static inline uint32_t ionic_v1_cqe_qtf_qid(uint32_t qtf) +{ + return qtf >> IONIC_V1_CQE_QID_SHIFT; +} + +/* v1 base wqe header */ +struct ionic_v1_base_hdr { + __le64 wqe_idx; + __u8 op; + __u8 num_sge_key; + __be16 flags; + __be32 imm_data_key; +}; + +/* v1 receive wqe body */ +struct ionic_v1_recv_bdy { + __u8 rsvd[16]; + union ionic_v1_pld pld; +}; + +/* v1 send/rdma wqe body (common, has sgl) */ +struct ionic_v1_common_bdy { + union { + struct { + __be32 ah_id; + __be32 dest_qpn; + __be32 dest_qkey; + } send; + struct { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + } rdma; + }; + __be32 length; + union ionic_v1_pld pld; +}; + +/* v1 atomic wqe body */ +struct ionic_v1_atomic_bdy { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + __be32 swap_add_high; + __be32 swap_add_low; + __be32 compare_high; + __be32 compare_low; + __u8 rsvd[4]; + struct ionic_sge sge; +}; + +/* v2 atomic wqe body */ +struct ionic_v2_atomic_bdy { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + __be32 swap_add_high; + __be32 swap_add_low; + __be32 compare_high; + __be32 compare_low; + __be32 lkey; + __be64 local_va; + __u8 rsvd_expdb[8]; +}; + +/* v1 bind mw wqe body */ +struct ionic_v1_bind_mw_bdy { + __be64 va; + __be64 length; + __be32 lkey; + __be16 flags; + __u8 rsvd[26]; +}; + +/* v1 send/recv wqe */ +struct ionic_v1_wqe { + struct ionic_v1_base_hdr base; + union { + struct ionic_v1_recv_bdy recv; + struct ionic_v1_common_bdy common; + struct ionic_v1_atomic_bdy atomic; + struct ionic_v2_atomic_bdy atomic_v2; + struct ionic_v1_bind_mw_bdy bind_mw; + }; +}; + +/* queue pair v1 send opcodes */ +enum ionic_v1_op { + IONIC_V1_OP_SEND, + IONIC_V1_OP_SEND_INV, + IONIC_V1_OP_SEND_IMM, + IONIC_V1_OP_RDMA_READ, + IONIC_V1_OP_RDMA_WRITE, + IONIC_V1_OP_RDMA_WRITE_IMM, + IONIC_V1_OP_ATOMIC_CS, + IONIC_V1_OP_ATOMIC_FA, + IONIC_V1_OP_REG_MR, + IONIC_V1_OP_LOCAL_INV, + IONIC_V1_OP_BIND_MW, + + /* flags */ + IONIC_V1_FLAG_FENCE = BIT(0), + IONIC_V1_FLAG_SOL = BIT(1), + IONIC_V1_FLAG_INL = BIT(2), + IONIC_V1_FLAG_SIG = BIT(3), + IONIC_V1_FLAG_COLOR = BIT(4), + + /* flags last four bits for sgl spec format */ + IONIC_V1_FLAG_SPEC32 = (1u << 12), + IONIC_V1_FLAG_SPEC16 = (2u << 12), + IONIC_V1_SPEC_FIRST_SGE = 2, +}; + +/* queue pair v2 send opcodes */ +enum ionic_v2_op { + IONIC_V2_OPSL_OUT = 0x20, + IONIC_V2_OPSL_IMM = 0x40, + IONIC_V2_OPSL_INV = 0x80, + + IONIC_V2_OP_SEND = 0x0 | IONIC_V2_OPSL_OUT, + IONIC_V2_OP_SEND_IMM = IONIC_V2_OP_SEND | IONIC_V2_OPSL_IMM, + IONIC_V2_OP_SEND_INV = IONIC_V2_OP_SEND | IONIC_V2_OPSL_INV, + + IONIC_V2_OP_RDMA_WRITE = 0x1 | IONIC_V2_OPSL_OUT, + IONIC_V2_OP_RDMA_WRITE_IMM = IONIC_V2_OP_RDMA_WRITE | IONIC_V2_OPSL_IMM, + + IONIC_V2_OP_RDMA_READ = 0x2, + + IONIC_V2_OP_ATOMIC_CS = 0x4, + IONIC_V2_OP_ATOMIC_FA = 0x5, + IONIC_V2_OP_REG_MR = 0x6, + IONIC_V2_OP_LOCAL_INV = 0x7, + IONIC_V2_OP_BIND_MW = 0x8, +}; + +#if !defined(__cplusplus) + +static inline size_t ionic_v1_send_wqe_min_size(int min_sge, int min_data, + int spec, bool expdb) +{ + size_t sz_wqe, sz_sgl, sz_data; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) { + min_sge += 1; + min_data += IONIC_EXP_DBELL_SZ; + } + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, common.pld.sgl[min_sge]); + sz_data = offsetof(struct ionic_v1_wqe, common.pld.data[min_data]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + if (sz_data > sz_wqe) + sz_wqe = sz_data; + + return roundup_pow_of_two(sz_wqe); +} + +static inline int ionic_v1_send_wqe_max_sge(uint8_t stride_log2, int spec, bool expdb) +{ + struct ionic_v1_wqe *wqe = (void *)0; + struct ionic_sge *sge = (void *)(1ull << stride_log2); + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->common.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_send_wqe_max_data(uint8_t stride_log2, bool expdb) +{ + struct ionic_v1_wqe *wqe = (void *)0; + __u8 *data = (void *)(1ull << stride_log2); + + if (expdb) + data -= IONIC_EXP_DBELL_SZ; + + return data - wqe->common.pld.data; +} + +static inline size_t ionic_v1_recv_wqe_min_size(int min_sge, int spec, bool expdb) +{ + size_t sz_wqe, sz_sgl; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) + min_sge += 1; + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, recv.pld.sgl[min_sge]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + return sz_wqe; +} + +static inline int ionic_v1_recv_wqe_max_sge(uint8_t stride_log2, int spec, bool expdb) +{ + struct ionic_v1_wqe *wqe = (void *)0; + struct ionic_sge *sge = (void *)(1ull << stride_log2); + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->recv.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_use_spec_sge(int min_sge, int spec) +{ + if (!spec || min_sge > spec) + return 0; + + if (min_sge <= IONIC_V1_SPEC_FIRST_SGE) + return IONIC_V1_SPEC_FIRST_SGE; + + return spec; +} + +#define IONIC_RCQ_SIZE 4096 +#define IONIC_RCQ_DEPTH 128 +#define IONIC_RCQ_DEPTH_LOG2 7 +#define IONIC_RCQ_STRIDE_LOG2 4 + +struct ionic_rcq_hdr { + uint8_t pad[60]; + uint32_t seq_pad; +}; + +struct ionic_rcqe { + uint32_t status_length; + uint32_t imm_data; + uint32_t seq_flags; + uint32_t rsvd; +}; + +enum ionic_rcqe_flag { + IONIC_RCQE_C = BIT(7), + IONIC_RCQE_I = BIT(6), +}; + +struct ionic_rcq { + struct ionic_rcq_hdr hdr; + struct ionic_rcqe ring[IONIC_RCQ_DEPTH]; +}; + +static inline uint32_t ionic_rcq_hdr_seq(struct ionic_rcq_hdr *hdr) +{ + return be32toh(hdr->seq_pad) >> 8; +} + +static inline uint32_t ionic_rcqe_seq(struct ionic_rcqe *rcqe) +{ + return be32toh(rcqe->seq_flags) >> 8; +} + +static inline bool ionic_rcqe_color(struct ionic_rcqe *rcqe) +{ + return !!(rcqe->seq_flags & htobe32(IONIC_RCQE_C)); +} + +static inline bool ionic_rcqe_imm(struct ionic_rcqe *rcqe) +{ + return !!(rcqe->seq_flags & htobe32(IONIC_RCQE_I)); +} + +#endif // !defined(__cplusplus) + +#endif /* IONIC_FW_H */ diff --git a/projects/rocshmem/src/gda/ionic/provider_gda_ionic.hpp b/projects/rocshmem/src/gda/ionic/provider_gda_ionic.hpp index adb21c9a2d..417ddfd876 100644 --- a/projects/rocshmem/src/gda/ionic/provider_gda_ionic.hpp +++ b/projects/rocshmem/src/gda/ionic/provider_gda_ionic.hpp @@ -25,11 +25,19 @@ #ifndef LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_ #define LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_ -#ifdef GDA_IONIC extern "C" { -#include -#include +#include "gda/ionic/ionic_dv.h" +#include "gda/ionic/ionic_fw.h" } -#endif + +struct ionicdv_funcs_t { + int (*get_ctx)(struct ionic_dv_ctx *dvctx, struct ibv_context *ibctx); + uint8_t (*qp_get_udma_idx)(struct ibv_qp *ibqp); + int (*get_cq)(struct ionic_dv_cq *dvcq, struct ibv_cq *ibcq, uint8_t udma_idx); + int (*get_qp)(struct ionic_dv_qp *dvqp, struct ibv_qp *ibqp); + int (*pd_set_sqcmb)(struct ibv_pd *ibpd, bool enable, bool expdb, bool require); + int (*pd_set_rqcmb)(struct ibv_pd *ibpd, bool enable, bool expdb, bool require); + int (*pd_set_udma_mask)(struct ibv_pd *ibpd, uint8_t udma_mask); +}; #endif //LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_ diff --git a/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp b/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp new file mode 100644 index 0000000000..59872a3d2e --- /dev/null +++ b/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp @@ -0,0 +1,312 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "gda/queue_pair.hpp" +#include "gda/endian.hpp" +#include "util.hpp" +#include "containers/free_list_impl.hpp" + +namespace rocshmem { + +__device__ uint64_t QueuePair::get_same_qp_lane_mask() { + uint64_t lane_mask = get_active_lane_mask(); + uintptr_t this_val = reinterpret_cast(this); + + // exclude threads operating on a different qp from this thread lane mask + #pragma unroll + for (int i = 0; i < 64; ++i) { + uint64_t bit_i = 1ull << i; + if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) { + lane_mask &= ~bit_i; + } + } + + return lane_mask; +} + +__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) { + uint32_t my_sq_prod = 0; + + // reserve space for wqes in sq + if (is_first_active_lane(activemask)) { + my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask)); + + // wait for that space to be available + ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask); + + return my_sq_prod; +} + +__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) { + uint32_t dbprod = my_sq_prod + num_wqes; + + spin_lock_acquire_shared(&sq_lock, activemask); + + if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) { + sq_dbprod = dbprod; + + ionic_ring_doorbell(dbprod); + } + + spin_lock_release_shared(&sq_lock, activemask); + + return dbprod; +} + +__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { + uint32_t my_logical_lane_id = get_active_lane_num(activemask); + uint32_t my_cq_pos = cq_pos + my_logical_lane_id; + + /* Look at the cqe at the current position in the cq buffer */ + struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask]; + + /* Determine expected color based on cq wrap count */ + uint32_t qtf_color_bit = swap_endian_val(IONIC_V1_CQE_COLOR); + uint32_t qtf_color_exp = qtf_color_bit; + if (my_cq_pos & (cq_mask + 1)) { + qtf_color_exp = 0; + } + + /* Check if my cqe color == expected color */ + uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags); + if ((qtf_be & qtf_color_bit) != qtf_color_exp) { + return; + } + + uint32_t msn = swap_endian_val(cqe->send.msg_msn); + + /* Report if the completion indicates an error. */ + if (!!(qtf_be & swap_endian_val(IONIC_V1_CQE_ERROR))) { +#if defined(DEBUG) + uint32_t qtf = swap_endian_val(qtf_be); + uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT; + uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; + uint32_t flag = qtf & 0xf; + uint32_t status = swap_endian_val(cqe->status_length); + uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK; + + printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n", + dev_name, qid, type, flag, status, msn, npg); +#endif + /* No other way to signal an error, so just crash. */ + abort(); + } + + /* Only proceed with the furthest ahead cqe to update the sq state */ + uint64_t my_lane_mask = 1ull << __lane_id(); + uint64_t lesser_lane_mask = my_lane_mask - 1; + if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) { + return; + } + + /* update position in the cq */ + cq_pos = my_cq_pos + 1; + + /* + * Ring cq doorbell frequently enough to avoid cq full. + * + * NB: IONIC_CQ_GRACE is 100 + */ + if (((cq_pos - cq_dbpos) & cq_mask) >= 100) { + cq_dbpos = cq_pos; + __atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed? + } + + sq_msn = msn; +} + +__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) { + uint32_t greed = 10; + + /* wait for sq_msn to catch up or pass cons. */ + /* 0x800000 - sign bit for 24-bit fields */ + while ((sq_msn - cons) & 0x800000) { + if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) { + continue; + } + + /* with lock acquired, this wave polls cqes until caught up */ + while ((sq_msn - cons) & 0x800000) { + uint32_t old_sq_msn = sq_msn; + + poll_wave_cqes(activemask); + + if (!((sq_msn - cons) & 0x800000)) { + if (sq_msn == old_sq_msn) { + break; + } + if (!greed) { + break; + } + --greed; + } + } + + spin_lock_release_shared(&cq_lock, activemask); + break; + } +} + +__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) { + // TODO When threads write at once to the same address, not all writes reach the bus. + for (int i = 0; i < 64; ++i) { + if (__lane_id() == i) { + __threadfence(); + __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST); + } + } + __threadfence(); +} + +__device__ void QueuePair::ionic_quiet() { + ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod); +} + +__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { + uint64_t activemask = get_same_qp_lane_mask(); + uint32_t num_wqes = get_active_lane_count(activemask); + uint32_t my_logical_lane_id = get_active_lane_num(activemask); + uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); + uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; + struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; + uint16_t wqe_flags = 0; + + if (!(my_sq_pos & (sq_mask + 1))) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); + } + + if (is_last_active_lane(activemask)) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); + } + + // TODO why is this needed? + if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) { + size = 1; + } + + wqe->base.wqe_idx = my_sq_pos; + wqe->base.op = opcode; + wqe->base.num_sge_key = size ? 1 : 0; + wqe->base.imm_data_key = swap_endian_val(0); + + wqe->common.rdma.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); + wqe->common.rdma.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); + wqe->common.rdma.remote_rkey = swap_endian_val(rkey); + wqe->common.length = swap_endian_val(size); + + if (size) { + if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_INL); + wqe->base.num_sge_key = 0; + if (!laddr) { + // TODO why is this needed? + wqe->common.pld.data[0] = 1; + } else { + memcpy(wqe->common.pld.data, laddr, size); + } + } else { + wqe->common.pld.sgl[0].va = swap_endian_val(reinterpret_cast(laddr)); + wqe->common.pld.sgl[0].len = swap_endian_val(size); + wqe->common.pld.sgl[0].lkey = swap_endian_val(lkey); + } + } + + __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); + + commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); +} + +__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { + uint64_t activemask = get_same_qp_lane_mask(); + uint32_t num_wqes = get_active_lane_count(activemask); + uint32_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); + uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; + struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; + uint16_t wqe_flags = 0; + uint32_t cons; + + uint64_t* wave_fetch_atomic{nullptr}; + if (fetching) { + if (is_leader) { + auto res = fetching_atomic_freelist->pop_front(); + while (!res.success) { + res = fetching_atomic_freelist->pop_front(); + } + wave_fetch_atomic = res.value; + } + wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); + } + + if (!(my_sq_pos & (sq_mask + 1))) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); + } + + if (is_last_active_lane(activemask)) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); + } + + wqe->base.wqe_idx = my_sq_pos; + wqe->base.op = opcode; + wqe->base.num_sge_key = 1; + wqe->base.imm_data_key = swap_endian_val(0); + + wqe->atomic_v2.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); + wqe->atomic_v2.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); + wqe->atomic_v2.remote_rkey = swap_endian_val(rkey); + wqe->atomic_v2.swap_add_high = swap_endian_val(atomic_data >> 32); + wqe->atomic_v2.swap_add_low = swap_endian_val(atomic_data); + wqe->atomic_v2.compare_high = swap_endian_val(atomic_cmp >> 32); + wqe->atomic_v2.compare_low = swap_endian_val(atomic_cmp); + + if (fetching) { + wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(wave_fetch_atomic + my_logical_lane_id)); + wqe->atomic_v2.lkey = swap_endian_val(fetching_atomic_lkey); + } else { + wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(nonfetching_atomic)); + wqe->atomic_v2.lkey = swap_endian_val(nonfetching_atomic_lkey); + } + + __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); + + cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); + + uint64_t ret{0}; + if (fetching) { + ionic_quiet_internal(activemask, cons); + ret = wave_fetch_atomic[my_logical_lane_id]; + __atomic_signal_fence(__ATOMIC_SEQ_CST); + if (is_leader) { + fetching_atomic_freelist->push_back(wave_fetch_atomic); + } + } + return ret; +} + +} // namespace rocshmem diff --git a/projects/rocshmem/src/gda/mlx5/CMakeLists.txt b/projects/rocshmem/src/gda/mlx5/CMakeLists.txt new file mode 100644 index 0000000000..2debbabeeb --- /dev/null +++ b/projects/rocshmem/src/gda/mlx5/CMakeLists.txt @@ -0,0 +1,37 @@ +############################################################################### +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +############################################################################### +target_sources( + ${PROJECT_NAME} + PRIVATE + backend_gda_mlx5.cpp +) + +if(GDA_MLX5) +target_sources( + ${PROJECT_NAME} + PRIVATE + queue_pair_mlx5.cpp + segment_builder.cpp +) +endif() diff --git a/projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp b/projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp new file mode 100644 index 0000000000..5052808162 --- /dev/null +++ b/projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp @@ -0,0 +1,122 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "gda/backend_gda.hpp" +#include "util.hpp" + +namespace rocshmem { + +void* GDABackend::mlx5_dv_dlopen() { + void* dv_handle{nullptr}; + dv_handle = dlopen("libmlx5.so", RTLD_NOW); + if (!dv_handle) { + DPRINTF("Could not open libmlx5.so. Returning\n"); + } + return dv_handle; +} + +int GDABackend::mlx5_dv_dl_init() { + mlx5dv_handle_ = mlx5_dv_dlopen(); + if (!mlx5dv_handle_) + return ROCSHMEM_ERROR; + + DLSYM_HELPER(mlx5dv, mlx5dv_, mlx5dv_handle_, init_obj); + return ROCSHMEM_SUCCESS; +} + +void GDABackend::mlx5_initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { + mlx5dv_cq cq_out; + mlx5dv_obj mlx_obj; + mlx_obj.cq.in = cqs[conn_num]; + mlx_obj.cq.out = &cq_out; + mlx5dv.init_obj(&mlx_obj, MLX5DV_OBJ_CQ); + dump_mlx5dv_cq(&cq_out, conn_num); + + /* + * struct mlx5dv_cq { + * void *buf; + * __be32 *dbrec; + * uint32_t cqe_cnt; + * uint32_t cqe_size; + * void *cq_uar; + * uint32_t cqn; + * uint64_t comp_mask; + * }; + */ + + gpu_qp->cq_buf = reinterpret_cast(cq_out.buf); + gpu_qp->cq_cnt = cq_out.cqe_cnt; + gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt); + gpu_qp->cq_dbrec = cq_out.dbrec; + + mlx5dv_qp qp_out; + mlx_obj.qp.in = qps[conn_num]; + mlx_obj.qp.out = &qp_out; + mlx5dv.init_obj(&mlx_obj, MLX5DV_OBJ_QP); + dump_mlx5dv_qp(&qp_out, conn_num); + + /* + * struct mlx5dv_qp { + * __be32 *dbrec; + * struct { + * void *buf; + * uint32_t wqe_cnt; + * uint32_t stride; + * } sq; + * struct { + * void *buf; + * uint32_t wqe_cnt; + * uint32_t stride; + * } rq; + * struct { + * void *reg; + * uint32_t size; + * } bf; + * uint64_t comp_mask; + * off_t uar_mmap_offset; + * uint32_t tirn; + * uint32_t tisn; + * uint32_t rqn; + * uint32_t sqn; + * uint64_t tir_icm_addr; + * }; + */ + + gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR + gpu_qp->sq_buf = reinterpret_cast(qp_out.sq.buf); + gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt; + gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]); + gpu_qp->lkey = htobe32(heap_mr->lkey); + gpu_qp->qp_num = qps[conn_num]->qp_num; + gpu_qp->inline_threshold = inline_threshold; + // The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers + + int hip_dev_id{-1}; + CHECK_HIP(hipGetDevice(&hip_dev_id)); + void* gpu_ptr{nullptr}; + rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id); + gpu_qp->db.ptr = reinterpret_cast(gpu_ptr); +} + +} // namespace rocshmem diff --git a/projects/rocshmem/src/gda/mlx5/provider_gda_mlx5.hpp b/projects/rocshmem/src/gda/mlx5/provider_gda_mlx5.hpp index 6c43f59c67..880d51d939 100644 --- a/projects/rocshmem/src/gda/mlx5/provider_gda_mlx5.hpp +++ b/projects/rocshmem/src/gda/mlx5/provider_gda_mlx5.hpp @@ -34,4 +34,8 @@ typedef union db_reg { uintptr_t uint; } db_reg_t; +struct mlx5dv_funcs_t { + int (*init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type); +}; + #endif //LIBRARY_SRC_GDA_MLX5_GDA_PROVIDER_HPP_ diff --git a/projects/rocshmem/src/gda/mlx5/queue_pair_mlx5.cpp b/projects/rocshmem/src/gda/mlx5/queue_pair_mlx5.cpp new file mode 100644 index 0000000000..dcbf3974f2 --- /dev/null +++ b/projects/rocshmem/src/gda/mlx5/queue_pair_mlx5.cpp @@ -0,0 +1,273 @@ +/****************************************************************************** + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "gda/queue_pair.hpp" +#include "util.hpp" +#include "containers/free_list_impl.hpp" +#include "gda/endian.hpp" +#include "segment_builder.hpp" + +namespace rocshmem { + +__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) { + swap_endian_store(const_cast(dbrec), (uint32_t)my_sq_counter); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + __hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); + uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + db_uint ^= 0x100; + __hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ void QueuePair::mlx5_quiet() { + constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE; + __shared__ uint64_t wqe_broadcast[BROADCAST_SIZE]; + uint8_t wavefront_id = get_flat_block_id() / WF_SIZE; + wqe_broadcast[wavefront_id] = 0; + + uint64_t activemask = get_active_lane_mask(); + uint8_t num_active_lanes = get_active_lane_count(activemask); + uint8_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + + while (true) { + bool done{false}; + uint64_t quiet_amount{0}; + uint64_t wave_cq_consumer{0}; + while (!done) { + uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + if (!(posted - completed)) { + return; + } + int64_t quiet_val = posted - active; + if (quiet_val <= 0) { + continue; + } + quiet_amount = min(num_active_lanes, quiet_val); + if (is_leader) { + done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + if (done) { + wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + } + done = __shfl(done, leader_phys_lane_id); + } + wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id); + uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id; + uint64_t my_cq_index = my_cq_consumer % cq_cnt; + + if (my_logical_lane_id < quiet_amount) { + volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index]; + uint16_t be_wqe_counter{0}; + uint8_t op_own{0}; + uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1; + bool vote_failed{true}; + + while (vote_failed) { + op_own = *((volatile uint8_t*)&cqe_entry->op_own); + bool my_ownership_vote = (op_own & 1) == owner_bit; + bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID; + uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote); + vote_failed = __popcll(votes) < quiet_amount; + if (!vote_failed) { + be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter); + } + } + + uint16_t wqe_counter; + swap_endian_store(const_cast(&wqe_counter), reinterpret_cast(be_wqe_counter)); + uint64_t wqe_id = outstanding_wqes[wqe_counter]; + __hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit; + *((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits; + __atomic_signal_fence(__ATOMIC_SEQ_CST); + } + if (is_leader) { + uint64_t completed {0}; + do { + completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (completed != wave_cq_consumer); + + swap_endian_store(const_cast(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount)); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id]; + __hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + } +} + +__device__ void QueuePair::mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { + uint64_t activemask = get_active_lane_mask(); + uint8_t num_active_lanes = get_active_lane_count(activemask); + uint8_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + uint8_t num_wqes{num_active_lanes}; + uint64_t wave_sq_counter{0}; + + if (is_leader) { + wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); + } + wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id); + uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id; + uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt; + + while (true) { + uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + int64_t num_active_sq_entries = db_touched - sunk; + if (num_active_sq_entries < 0) { + continue; + } + uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries; + uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched; + if (num_free_entries > num_entries_until_wave_last_entry) { + break; + } + mlx5_quiet(); + } + + outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; + + SegmentBuilder seg_build(my_sq_index, sq_buf); + seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0); + seg_build.update_raddr_seg(raddr, rkey); + + if (size <= inline_threshold && opcode == gda_op_rdma_write) { + seg_build.update_inl_data_seg(laddr, size); + } else { + seg_build.update_data_seg(laddr, size, lkey); + } + + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + if (is_leader) { + uint64_t db_touched {0}; + do { + db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (db_touched != wave_sq_counter); + + uint8_t *base_ptr = reinterpret_cast(sq_buf); + uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); + mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); + + __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } +} + +__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, + int64_t atomic_data, int64_t atomic_cmp, bool fetching) { + uint64_t activemask = get_active_lane_mask(); + uint8_t num_active_lanes = get_active_lane_count(activemask); + uint8_t my_logical_lane_id = get_active_lane_num(activemask); + bool is_leader{my_logical_lane_id == 0}; + const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); + uint8_t num_wqes{num_active_lanes}; + uint64_t wave_sq_counter{0}; + + if (is_leader) { + wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id); + uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id; + uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt; + + while (true) { + uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + int64_t num_active_sq_entries = db_touched - sunk; + if (num_active_sq_entries < 0) { + continue; + } + uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries; + uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched; + if (num_free_entries > num_entries_until_wave_last_entry) { + break; + } + mlx5_quiet(); + } + + uint64_t* wave_fetch_atomic{nullptr}; + if (fetching) { + if (is_leader) { + uint64_t db_touched {0}; + do { + db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (db_touched != wave_sq_counter); + + auto res = fetching_atomic_freelist->pop_front(); + while (!res.success) { + res = fetching_atomic_freelist->pop_front(); + } + wave_fetch_atomic = res.value; + } + wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); + } + + outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; + + SegmentBuilder seg_build(my_sq_index, sq_buf); + seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0); + seg_build.update_raddr_seg(raddr, rkey); + seg_build.update_atomic_seg(atomic_data, atomic_cmp); + if (fetching) { + seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey); + } else { + seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey); + } + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + if (is_leader) { + uint64_t db_touched {0}; + do { + db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } while (db_touched != wave_sq_counter); + + uint8_t *base_ptr = reinterpret_cast(sq_buf); + uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); + mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); + + __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + } + + uint64_t ret{0}; + if (fetching) { + mlx5_quiet(); + ret = wave_fetch_atomic[my_logical_lane_id]; + __atomic_signal_fence(__ATOMIC_SEQ_CST); + if (is_leader) { + fetching_atomic_freelist->push_back(wave_fetch_atomic); + } + } + return ret; +} + +} // namespace rocshmem diff --git a/projects/rocshmem/src/gda/segment_builder.cpp b/projects/rocshmem/src/gda/mlx5/segment_builder.cpp similarity index 99% rename from projects/rocshmem/src/gda/segment_builder.cpp rename to projects/rocshmem/src/gda/mlx5/segment_builder.cpp index f9f6c9046a..497474f7a6 100644 --- a/projects/rocshmem/src/gda/segment_builder.cpp +++ b/projects/rocshmem/src/gda/mlx5/segment_builder.cpp @@ -25,7 +25,7 @@ #include "segment_builder.hpp" #include "util.hpp" -#include "endian.hpp" +#include "gda/endian.hpp" namespace rocshmem { diff --git a/projects/rocshmem/src/gda/segment_builder.hpp b/projects/rocshmem/src/gda/mlx5/segment_builder.hpp similarity index 100% rename from projects/rocshmem/src/gda/segment_builder.hpp rename to projects/rocshmem/src/gda/mlx5/segment_builder.hpp diff --git a/projects/rocshmem/src/gda/queue_pair.cpp b/projects/rocshmem/src/gda/queue_pair.cpp index 9fed08998f..36b5c83cdc 100644 --- a/projects/rocshmem/src/gda/queue_pair.cpp +++ b/projects/rocshmem/src/gda/queue_pair.cpp @@ -27,8 +27,6 @@ #include #include "backend_gda.hpp" -#include "endian.hpp" -#include "segment_builder.hpp" #include "constants.hpp" namespace rocshmem { @@ -66,22 +64,33 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) { } /* Set Correct opcodes for each NIC */ + switch (gda_vendor) { #if defined(GDA_IONIC) - gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE; - gda_op_rdma_read = IONIC_V2_OP_RDMA_READ; - gda_op_atomic_fa = IONIC_V2_OP_ATOMIC_FA; - gda_op_atomic_cs = IONIC_V2_OP_ATOMIC_CS; -#endif - if (gda_vendor == GDAVendor::BNXT) { + case GDAVendor::IONIC: + gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE; + gda_op_rdma_read = IONIC_V2_OP_RDMA_READ; + gda_op_atomic_fa = IONIC_V2_OP_ATOMIC_FA; + gda_op_atomic_cs = IONIC_V2_OP_ATOMIC_CS; + break; +#endif //defined(GDA_IONIC) +#if defined(GDA_BNXT) + case GDAVendor::BNXT: gda_op_rdma_write = BNXT_RE_WR_OPCD_RDMA_WRITE; gda_op_rdma_read = BNXT_RE_WR_OPCD_RDMA_READ; gda_op_atomic_fa = BNXT_RE_WR_OPCD_ATOMIC_FA; gda_op_atomic_cs = BNXT_RE_WR_OPCD_ATOMIC_CS; - } else if (gda_vendor == GDAVendor::MLX5) { + break; +#endif //defined(GDA_BNXT) +#if defined(GDA_MLX5) + case GDAVendor::MLX5: gda_op_rdma_write = MLX5_OPCODE_RDMA_WRITE; gda_op_rdma_read = MLX5_OPCODE_RDMA_READ; gda_op_atomic_fa = MLX5_OPCODE_ATOMIC_FA; gda_op_atomic_cs = MLX5_OPCODE_ATOMIC_CS; + break; +#endif //defined(GDA_MLX5) + default: + assert(false /* invalid nic provider */); } gda_vendor_ = gda_vendor; } @@ -105,264 +114,6 @@ QueuePair::~QueuePair() { /****************************************************************************** ************************ PROVIDER-SPECIFIC HELPERS *************************** *****************************************************************************/ -#if defined(GDA_IONIC) -__device__ uint64_t QueuePair::get_same_qp_lane_mask() { - uint64_t lane_mask = get_active_lane_mask(); - uintptr_t this_val = reinterpret_cast(this); - - // exclude threads operating on a different qp from this thread lane mask - #pragma unroll - for (int i = 0; i < 64; ++i) { - uint64_t bit_i = 1ull << i; - if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) { - lane_mask &= ~bit_i; - } - } - - return lane_mask; -} - -__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) { - uint32_t my_sq_prod = 0; - - // reserve space for wqes in sq - if (is_first_active_lane(activemask)) { - my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } - my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask)); - - // wait for that space to be available - ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask); - - return my_sq_prod; -} - -__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) { - uint32_t dbprod = my_sq_prod + num_wqes; - - spin_lock_acquire_shared(&sq_lock, activemask); - - if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) { - sq_dbprod = dbprod; - - ionic_ring_doorbell(dbprod); - } - - spin_lock_release_shared(&sq_lock, activemask); - - return dbprod; -} - -__device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { - uint32_t my_logical_lane_id = get_active_lane_num(activemask); - uint32_t my_cq_pos = cq_pos + my_logical_lane_id; - - /* Look at the cqe at the current position in the cq buffer */ - struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask]; - - /* Determine expected color based on cq wrap count */ - uint32_t qtf_color_bit = swap_endian_val(IONIC_V1_CQE_COLOR); - uint32_t qtf_color_exp = qtf_color_bit; - if (my_cq_pos & (cq_mask + 1)) { - qtf_color_exp = 0; - } - - /* Check if my cqe color == expected color */ - uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags); - if ((qtf_be & qtf_color_bit) != qtf_color_exp) { - return; - } - - uint32_t msn = swap_endian_val(cqe->send.msg_msn); - - /* Report if the completion indicates an error. */ - if (!!(qtf_be & swap_endian_val(IONIC_V1_CQE_ERROR))) { -#if defined(DEBUG) - uint32_t qtf = swap_endian_val(qtf_be); - uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT; - uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; - uint32_t flag = qtf & 0xf; - uint32_t status = swap_endian_val(cqe->status_length); - uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK; - - printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n", - dev_name, qid, type, flag, status, msn, npg); -#endif - /* No other way to signal an error, so just crash. */ - abort(); - } - - /* Only proceed with the furthest ahead cqe to update the sq state */ - uint64_t my_lane_mask = 1ull << __lane_id(); - uint64_t lesser_lane_mask = my_lane_mask - 1; - if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) { - return; - } - - /* update position in the cq */ - cq_pos = my_cq_pos + 1; - - /* - * Ring cq doorbell frequently enough to avoid cq full. - * - * NB: IONIC_CQ_GRACE is 100 - */ - if (((cq_pos - cq_dbpos) & cq_mask) >= 100) { - cq_dbpos = cq_pos; - __atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed? - } - - sq_msn = msn; -} - -__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) { - uint32_t greed = 10; - - /* wait for sq_msn to catch up or pass cons. */ - /* 0x800000 - sign bit for 24-bit fields */ - while ((sq_msn - cons) & 0x800000) { - if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) { - continue; - } - - /* with lock acquired, this wave polls cqes until caught up */ - while ((sq_msn - cons) & 0x800000) { - uint32_t old_sq_msn = sq_msn; - - poll_wave_cqes(activemask); - - if (!((sq_msn - cons) & 0x800000)) { - if (sq_msn == old_sq_msn) { - break; - } - if (!greed) { - break; - } - --greed; - } - } - - spin_lock_release_shared(&cq_lock, activemask); - break; - } -} -#endif // GDA_IONIC - -#if defined(GDA_IONIC) -__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) { - // TODO When threads write at once to the same address, not all writes reach the bus. - for (int i = 0; i < 64; ++i) { - if (__lane_id() == i) { - __threadfence(); - __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST); - } - } - __threadfence(); -} -#endif - -#if defined(GDA_MLX5) -__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) { - swap_endian_store(const_cast(dbrec), (uint32_t)my_sq_counter); - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - __hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); - uint64_t db_uint = __hip_atomic_load(&db.uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - db_uint ^= 0x100; - __hip_atomic_store(&db.uint, db_uint, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); -} -#endif // GDA_MLX5 - -#if defined(GDA_IONIC) -__device__ void QueuePair::ionic_quiet() { - ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod); -} -#endif - -#if defined(GDA_MLX5) -__device__ void QueuePair::mlx5_quiet() { - constexpr size_t BROADCAST_SIZE = 1024 / WF_SIZE; - __shared__ uint64_t wqe_broadcast[BROADCAST_SIZE]; - uint8_t wavefront_id = get_flat_block_id() / WF_SIZE; - wqe_broadcast[wavefront_id] = 0; - - uint64_t activemask = get_active_lane_mask(); - uint8_t num_active_lanes = get_active_lane_count(activemask); - uint8_t my_logical_lane_id = get_active_lane_num(activemask); - bool is_leader{my_logical_lane_id == 0}; - const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); - - while (true) { - bool done{false}; - uint64_t quiet_amount{0}; - uint64_t wave_cq_consumer{0}; - while (!done) { - uint64_t active = __hip_atomic_load(&quiet_active, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - uint64_t posted = __hip_atomic_load(&quiet_posted, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - uint64_t completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - if (!(posted - completed)) { - return; - } - int64_t quiet_val = posted - active; - if (quiet_val <= 0) { - continue; - } - quiet_amount = min(num_active_lanes, quiet_val); - if (is_leader) { - done = __hip_atomic_compare_exchange_strong(&quiet_active, &active, active + quiet_amount, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - if (done) { - wave_cq_consumer = __hip_atomic_fetch_add(&cq_consumer, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } - } - done = __shfl(done, leader_phys_lane_id); - } - wave_cq_consumer = __shfl(wave_cq_consumer, leader_phys_lane_id); - uint64_t my_cq_consumer = wave_cq_consumer + my_logical_lane_id; - uint64_t my_cq_index = my_cq_consumer % cq_cnt; - - if (my_logical_lane_id < quiet_amount) { - volatile mlx5_cqe64 *cqe_entry = &cq_buf[my_cq_index]; - uint16_t be_wqe_counter{0}; - uint8_t op_own{0}; - uint8_t owner_bit = (my_cq_consumer >> cq_log_cnt) & 1; - bool vote_failed{true}; - - while (vote_failed) { - op_own = *((volatile uint8_t*)&cqe_entry->op_own); - bool my_ownership_vote = (op_own & 1) == owner_bit; - bool my_opcode_vote = (op_own >> 4) != MLX5_CQE_INVALID; - uint64_t votes = __ballot(my_ownership_vote && my_opcode_vote); - vote_failed = __popcll(votes) < quiet_amount; - if (!vote_failed) { - be_wqe_counter = *((volatile uint16_t*)&cqe_entry->wqe_counter); - } - } - - uint16_t wqe_counter; - swap_endian_store(const_cast(&wqe_counter), reinterpret_cast(be_wqe_counter)); - uint64_t wqe_id = outstanding_wqes[wqe_counter]; - __hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); - uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit; - *((volatile uint8_t*)&cqe_entry->op_own) = mlx5_invld_bits; - __atomic_signal_fence(__ATOMIC_SEQ_CST); - } - if (is_leader) { - uint64_t completed {0}; - do { - completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } while (completed != wave_cq_consumer); - - swap_endian_store(const_cast(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount)); - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id]; - __hip_atomic_fetch_max(&sq_sunk, sunk_wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - __hip_atomic_fetch_add(&quiet_completed, quiet_amount, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } - } -} -#endif // GDA_MLX5 - __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { switch (gda_vendor_) { #if defined(GDA_MLX5) @@ -428,286 +179,6 @@ __device__ void QueuePair::quiet() { } } -#if defined(GDA_IONIC) -__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { - uint64_t activemask = get_same_qp_lane_mask(); - uint32_t num_wqes = get_active_lane_count(activemask); - uint32_t my_logical_lane_id = get_active_lane_num(activemask); - uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); - uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; - struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; - uint16_t wqe_flags = 0; - - if (!(my_sq_pos & (sq_mask + 1))) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); - } - - if (is_last_active_lane(activemask)) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); - } - - // TODO why is this needed? - if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) { - size = 1; - } - - wqe->base.wqe_idx = my_sq_pos; - wqe->base.op = opcode; - wqe->base.num_sge_key = size ? 1 : 0; - wqe->base.imm_data_key = swap_endian_val(0); - - wqe->common.rdma.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); - wqe->common.rdma.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); - wqe->common.rdma.remote_rkey = swap_endian_val(rkey); - wqe->common.length = swap_endian_val(size); - - if (size) { - if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_INL); - wqe->base.num_sge_key = 0; - if (!laddr) { - // TODO why is this needed? - wqe->common.pld.data[0] = 1; - } else { - memcpy(wqe->common.pld.data, laddr, size); - } - } else { - wqe->common.pld.sgl[0].va = swap_endian_val(reinterpret_cast(laddr)); - wqe->common.pld.sgl[0].len = swap_endian_val(size); - wqe->common.pld.sgl[0].lkey = swap_endian_val(lkey); - } - } - - __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); - - commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); -} -#endif - -#if defined (GDA_MLX5) -__device__ void QueuePair::mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) { - uint64_t activemask = get_active_lane_mask(); - uint8_t num_active_lanes = get_active_lane_count(activemask); - uint8_t my_logical_lane_id = get_active_lane_num(activemask); - bool is_leader{my_logical_lane_id == 0}; - const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); - uint8_t num_wqes{num_active_lanes}; - uint64_t wave_sq_counter{0}; - - if (is_leader) { - wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT); - } - wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id); - uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id; - uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt; - - while (true) { - uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - int64_t num_active_sq_entries = db_touched - sunk; - if (num_active_sq_entries < 0) { - continue; - } - uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries; - uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched; - if (num_free_entries > num_entries_until_wave_last_entry) { - break; - } - mlx5_quiet(); - } - - outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; - - SegmentBuilder seg_build(my_sq_index, sq_buf); - seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0); - seg_build.update_raddr_seg(raddr, rkey); - - if (size <= inline_threshold && opcode == gda_op_rdma_write) { - seg_build.update_inl_data_seg(laddr, size); - } else { - seg_build.update_data_seg(laddr, size, lkey); - } - - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - if (is_leader) { - uint64_t db_touched {0}; - do { - db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } while (db_touched != wave_sq_counter); - - uint8_t *base_ptr = reinterpret_cast(sq_buf); - uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); - mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); - - __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } -} -#endif // GDA_MLX5 - -#if defined(GDA_IONIC) -__device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, - int64_t atomic_data, int64_t atomic_cmp, bool fetching) { - uint64_t activemask = get_same_qp_lane_mask(); - uint32_t num_wqes = get_active_lane_count(activemask); - uint32_t my_logical_lane_id = get_active_lane_num(activemask); - bool is_leader{my_logical_lane_id == 0}; - const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); - uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); - uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; - struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; - uint16_t wqe_flags = 0; - uint32_t cons; - - uint64_t* wave_fetch_atomic{nullptr}; - if (fetching) { - if (is_leader) { - auto res = fetching_atomic_freelist->pop_front(); - while (!res.success) { - res = fetching_atomic_freelist->pop_front(); - } - wave_fetch_atomic = res.value; - } - wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); - } - - if (!(my_sq_pos & (sq_mask + 1))) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); - } - - if (is_last_active_lane(activemask)) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); - } - - wqe->base.wqe_idx = my_sq_pos; - wqe->base.op = opcode; - wqe->base.num_sge_key = 1; - wqe->base.imm_data_key = swap_endian_val(0); - - wqe->atomic_v2.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); - wqe->atomic_v2.remote_va_low = swap_endian_val(reinterpret_cast(raddr)); - wqe->atomic_v2.remote_rkey = swap_endian_val(rkey); - wqe->atomic_v2.swap_add_high = swap_endian_val(atomic_data >> 32); - wqe->atomic_v2.swap_add_low = swap_endian_val(atomic_data); - wqe->atomic_v2.compare_high = swap_endian_val(atomic_cmp >> 32); - wqe->atomic_v2.compare_low = swap_endian_val(atomic_cmp); - - if (fetching) { - wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(wave_fetch_atomic + my_logical_lane_id)); - wqe->atomic_v2.lkey = swap_endian_val(fetching_atomic_lkey); - } else { - wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(nonfetching_atomic)); - wqe->atomic_v2.lkey = swap_endian_val(nonfetching_atomic_lkey); - } - - __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); - - cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); - - uint64_t ret{0}; - if (fetching) { - ionic_quiet_internal(activemask, cons); - ret = wave_fetch_atomic[my_logical_lane_id]; - __atomic_signal_fence(__ATOMIC_SEQ_CST); - if (is_leader) { - fetching_atomic_freelist->push_back(wave_fetch_atomic); - } - } - return ret; -} -#endif - -#if defined(GDA_MLX5) -__device__ uint64_t QueuePair::mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, - int64_t atomic_data, int64_t atomic_cmp, bool fetching) { - uint64_t activemask = get_active_lane_mask(); - uint8_t num_active_lanes = get_active_lane_count(activemask); - uint8_t my_logical_lane_id = get_active_lane_num(activemask); - bool is_leader{my_logical_lane_id == 0}; - const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); - uint8_t num_wqes{num_active_lanes}; - uint64_t wave_sq_counter{0}; - - if (is_leader) { - wave_sq_counter = __hip_atomic_fetch_add(&sq_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } - wave_sq_counter = __shfl(wave_sq_counter, leader_phys_lane_id); - uint64_t my_sq_counter = wave_sq_counter + my_logical_lane_id; - uint64_t my_sq_index = my_sq_counter % sq_wqe_cnt; - - while (true) { - uint64_t db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - uint64_t sunk = __hip_atomic_load(&sq_sunk, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - int64_t num_active_sq_entries = db_touched - sunk; - if (num_active_sq_entries < 0) { - continue; - } - uint64_t num_free_entries = min(sq_wqe_cnt, cq_cnt) - num_active_sq_entries; - uint64_t num_entries_until_wave_last_entry = wave_sq_counter + num_active_lanes - db_touched; - if (num_free_entries > num_entries_until_wave_last_entry) { - break; - } - mlx5_quiet(); - } - - uint64_t* wave_fetch_atomic{nullptr}; - if (fetching) { - if (is_leader) { - uint64_t db_touched {0}; - do { - db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } while (db_touched != wave_sq_counter); - - auto res = fetching_atomic_freelist->pop_front(); - while (!res.success) { - res = fetching_atomic_freelist->pop_front(); - } - wave_fetch_atomic = res.value; - } - wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); - } - - outstanding_wqes[my_sq_counter % OUTSTANDING_TABLE_SIZE] = my_sq_counter; - - SegmentBuilder seg_build(my_sq_index, sq_buf); - seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 4, 0, 0); - seg_build.update_raddr_seg(raddr, rkey); - seg_build.update_atomic_seg(atomic_data, atomic_cmp); - if (fetching) { - seg_build.update_data_seg(wave_fetch_atomic + my_logical_lane_id, 8, fetching_atomic_lkey); - } else { - seg_build.update_data_seg(nonfetching_atomic, 8, nonfetching_atomic_lkey); - } - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - if (is_leader) { - uint64_t db_touched {0}; - do { - db_touched = __hip_atomic_load(&sq_db_touched, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } while (db_touched != wave_sq_counter); - - uint8_t *base_ptr = reinterpret_cast(sq_buf); - uint64_t* ctrl_wqe_8B_for_db = reinterpret_cast(&base_ptr[64 * ((wave_sq_counter + num_wqes - 1) % sq_wqe_cnt)]); - mlx5_ring_doorbell(*ctrl_wqe_8B_for_db, wave_sq_counter + num_wqes); - - __hip_atomic_fetch_add(&quiet_posted, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - __hip_atomic_store(&sq_db_touched, wave_sq_counter + num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - } - - uint64_t ret{0}; - if (fetching) { - mlx5_quiet(); - ret = wave_fetch_atomic[my_logical_lane_id]; - __atomic_signal_fence(__ATOMIC_SEQ_CST); - if (is_leader) { - fetching_atomic_freelist->push_back(wave_fetch_atomic); - } - } - return ret; -} -#endif // GDA_MLX5 - /****************************************************************************** ****************************** SHMEM INTERFACE ******************************* *****************************************************************************/ diff --git a/projects/rocshmem/src/gda/queue_pair.hpp b/projects/rocshmem/src/gda/queue_pair.hpp index 1e0d79fda1..3bb17588f4 100644 --- a/projects/rocshmem/src/gda/queue_pair.hpp +++ b/projects/rocshmem/src/gda/queue_pair.hpp @@ -163,14 +163,17 @@ class QueuePair { #if defined(GDA_MLX5) __device__ uint64_t mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); __device__ void mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); + __device__ void mlx5_quiet(); #endif #if defined(GDA_BNXT) __device__ uint64_t bnxt_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); __device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); + __device__ void bnxt_quiet(); #endif #if defined(GDA_IONIC) __device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch); __device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode); + __device__ void ionic_quiet(); #endif /** @@ -189,65 +192,6 @@ class QueuePair { __device__ void ionic_ring_doorbell(uint32_t pos); #endif -#ifdef GDA_IONIC - __device__ uint64_t get_same_qp_lane_mask(); - - /** - * @brief Reserve space in the sq to post this many wqes. - * @param my_tid my logical thread id. - * @param num_wqes number of sq wqes to reserve for this wave. - * @return position of my_tid=0's wqe. - */ - __device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes); - - /** - * @brief Ring the sq doorbell maintaining order between waves. - * @param last this is the last wqe posted in this wave. - * @param my_sq_prod position of my_tid=0's wqe. - * @param num_wqes number of sq wqes posted in this wave. - * @param wqe this thread's wqe. - * @return doorbell producer index. - */ - __device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes); - - /** - * @brief Helper method to poll the next completion queue entry. - */ - __device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask); - - /** - * @brief Helper method to drain completion queue entries. - * @param cons wait for sq_msn to catch up to this position. - */ - __device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons); - - uint64_t *cq_dbreg{nullptr}; - uint64_t cq_dbval{0}; - uint64_t cq_mask{0}; - struct ionic_v1_cqe *ionic_cq_buf{nullptr}; - uint32_t cq_lock{SPIN_LOCK_UNLOCKED}; - uint32_t cq_pos{0}; - uint32_t cq_dbpos{0}; - - uint64_t *sq_dbreg{nullptr}; - uint64_t sq_dbval{0}; - uint64_t sq_mask{0}; - struct ionic_v1_wqe *ionic_sq_buf{nullptr}; - uint32_t sq_lock{SPIN_LOCK_UNLOCKED}; - uint32_t sq_dbprod{0}; - uint32_t sq_prod{0}; - uint32_t sq_msn{0}; -#endif - -#if defined(GDA_MLX5) - __device__ void mlx5_quiet(); -#endif -#if defined(GDA_BNXT) - __device__ void bnxt_quiet(); -#endif -#if defined(GDA_IONIC) - __device__ void ionic_quiet(); -#endif int gda_vendor_{0}; /* GDAVendor::BNXT START */ @@ -322,6 +266,58 @@ class QueuePair { /* GDAVendor::MLX5 END */ + /* GDAVendor::IONIC START */ + + uint64_t *cq_dbreg{nullptr}; + uint64_t cq_dbval{0}; + uint64_t cq_mask{0}; + struct ionic_v1_cqe *ionic_cq_buf{nullptr}; + uint32_t cq_lock{SPIN_LOCK_UNLOCKED}; + uint32_t cq_pos{0}; + uint32_t cq_dbpos{0}; + + uint64_t *sq_dbreg{nullptr}; + uint64_t sq_dbval{0}; + uint64_t sq_mask{0}; + struct ionic_v1_wqe *ionic_sq_buf{nullptr}; + uint32_t sq_lock{SPIN_LOCK_UNLOCKED}; + uint32_t sq_dbprod{0}; + uint32_t sq_prod{0}; + uint32_t sq_msn{0}; + + __device__ uint64_t get_same_qp_lane_mask(); + + /** + * @brief Reserve space in the sq to post this many wqes. + * @param my_tid my logical thread id. + * @param num_wqes number of sq wqes to reserve for this wave. + * @return position of my_tid=0's wqe. + */ + __device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes); + + /** + * @brief Ring the sq doorbell maintaining order between waves. + * @param last this is the last wqe posted in this wave. + * @param my_sq_prod position of my_tid=0's wqe. + * @param num_wqes number of sq wqes posted in this wave. + * @param wqe this thread's wqe. + * @return doorbell producer index. + */ + __device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes); + + /** + * @brief Helper method to poll the next completion queue entry. + */ + __device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask); + + /** + * @brief Helper method to drain completion queue entries. + * @param cons wait for sq_msn to catch up to this position. + */ + __device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons); + + /* GDAVendor::IONIC END */ + uint32_t inline_threshold{0}; char dev_name[24];