Dosyalar
rocm-systems/src/gda/backend_gda.cpp
T
Allen Hubbe c84bbc250b gda ionic: restore functionality of ionic gda in rocshmem (#269)
* Revamp findibverbs to find ionic again

* gda ionic: rename ionic_sq_buf ionic_cq_buf

Avoid duplicating member names used by mlx5 gda.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda: move spin lock to util.hpp

Move spin lock out of ionic gda to util.hpp.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: assume latest fwabi changes

There is no firmware abi compatibility in this ionic gda code yet, so
assume we are using the latest firmware abi as of now.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: allow doorbell with incomplete wqes

Use spin lock to ensure doorbell is only written with an increasing
producer index.  Ring the doorbell after this wave has initialized its
wqes.  Wqes of other waves might not be fully initialized, but firmware
will not process them until the phase/color flag is updated in the
respecitve wqes.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: poll cq for additional completions

Keep polling the cq for more than just the minimum number of completions
for this wave of threads to make progress, as long as the cq is not
empty.  A part of wave-optimized cq polling, at the expense of one wave
polling additional completions, it was observed that nearly all other
waves avoid taking the cq lock at all.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda: max_rd_atomic in rts transition

In modify_qp(RTS), specify max_rd_atomic, not max_dest_rd_atomic.

By not speicfying max_rd_atomic (rather, max_rd_atomic=zero), the local
nic may get stuck transmitting the first read or atomic request.  One
read or atomic request is greater than the initiator depth of zero.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: allow specifying traffic class

Allow specifying a traffic class.  The network might have a specific
traffic class configured as no-drop, for example.

Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>
Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: tweak uxdma assignment

The ideal arrangement will have an equal number of QPs active on each
uxdma pipeline.

Pre-rebase, the better arrangement for rocshmem funcitonal test
benchmarks was [0, 1], [1, 0], [0, 1], [1, 0], ...

Now, following changes that add 'ROCSHMEM_GDA_ALTERNATE_QP_PORTS=1' by
default, the better arrangement is [0, 1], [0, 1], [0, 1], [0, 1], ...

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

---------

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>
Co-authored-by: Aurelien Bouteiller <abouteil@amd.com>
Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>
2025-10-07 10:08:19 -04:00

1294 satır
38 KiB
C++

/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include <cstring>
#include <hip/hip_runtime.h>
#include <cstdlib>
#include <cassert>
#include "backend_gda.hpp"
#include "envvar.hpp"
#include "gda_team.hpp"
#include "mpi_instance.hpp"
#include "util.hpp"
#include "topology.hpp"
namespace rocshmem {
#define NET_CHECK(cmd) { \
if (cmd != MPI_SUCCESS) { \
fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \
abort(); \
} \
}
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
rocshmem_team_t get_external_team(GDATeam *team) {
return reinterpret_cast<rocshmem_team_t>(team);
}
int get_ls_non_zero_bit(char *bitmask, int mask_length) {
int position{-1};
for (int bit_i = 0; bit_i < mask_length; bit_i++) {
int byte_i = bit_i / CHAR_BIT;
if (bitmask[byte_i] & (1 << (bit_i % CHAR_BIT))) {
position = bit_i;
break;
}
}
return position;
}
GDABackend::GDABackend(MPI_Comm comm): Backend(comm) {
init();
}
GDABackend::GDABackend(TcpBootstrap *bootstrap): Backend(bootstrap) {
init();
}
void GDABackend::init() {
type = BackendType::GDA_BACKEND;
read_env();
//TODO setup_host_interface();
/* Initialize the host interface */
if (MPI_COMM_NULL != backend_comm)
host_interface = std::make_shared<HostInterface>(hdp_proxy_.get(), //TODO: need an hdp proxy?
backend_comm,
&heap);
else
host_interface = std::make_shared<HostInterface>(hdp_proxy_.get(), //TODO: need an hdp proxy?
backend_bootstr,
&heap);
setup_wrk_sync_buffer();
setup_fence_buffer();
setup_collectives();
setup_teams();
setup_team_world();
rte_barrier();
setup_ipc();
setup_ibv();
setup_heap_memory_rkey();
setup_gpu_qps();
setup_ctxs();
rte_barrier();
}
GDABackend::~GDABackend() {
cleanup_ctxs();
cleanup_teams();
auto *team_world{team_tracker.get_team_world()};
team_world->~Team();
CHECK_HIP(hipFree(team_world));
cleanup_wrk_sync_buffer();
cleanup_ipc();
cleanup_gpu_qps();
cleanup_heap_memory_rkey();
cleanup_ibv();
if (bnxtdv_handle_ != nullptr)
dlclose(bnxtdv_handle_);
if (mlx5dv_handle_ != nullptr)
dlclose(mlx5dv_handle_);
}
void GDABackend::read_env() {
if (!envvar::requested_dev.is_default()) {
requested_dev = envvar::requested_dev.get_value().c_str();
} else {
int gpu_dev = 0;
CHECK_HIP(hipGetDevice(&gpu_dev));
int nic_dev = rocshmem::GetClosestNicToGpu(gpu_dev, &requested_dev);
assert (nic_dev != -1);
}
}
void GDABackend::setup_ipc() {
const auto &heap_bases{heap.get_heap_bases()};
if (MPI_COMM_NULL != backend_comm)
ipcImpl.ipcHostInit(my_pe, heap_bases, backend_comm);
else
ipcImpl.ipcHostInit(my_pe, heap_bases, backend_bootstr);
}
void GDABackend::cleanup_ipc() {
ipcImpl.ipcHostStop();
}
void GDABackend::setup_host_ctx() {
default_host_ctx = std::make_unique<GDAHostContext>(this, 0);
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
}
void GDABackend::setup_default_ctx() {
TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;
default_context_proxy_ = GDADefaultContextProxyT(this, tinfo);
}
void GDABackend::setup_ctxs() {
setup_host_ctx();
setup_default_ctx();
CHECK_HIP(hipMalloc(&ctx_array, sizeof(GDAContext) * envvar::max_num_contexts));
// 0th context is default context
for (size_t i = 0; i < envvar::max_num_contexts; i++) {
new (&ctx_array[i]) GDAContext(this, i + 1);
ctx_free_list.get()->push_back(ctx_array + i);
}
}
void GDABackend::cleanup_ctxs() {
ctx_free_list.~FreeListProxy();
for (size_t i = 0; i < envvar::max_num_contexts; i++) {
ctx_array[i].~GDAContext();
}
CHECK_HIP(hipFree(ctx_array));
}
__device__ bool GDABackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) {
GDAContext *ctx_{nullptr};
auto pop_result = ctx_free_list.get()->pop_front();
if (!pop_result.success) {
return false;
}
ctx_ = pop_result.value;
ctx->ctx_opaque = ctx_;
ctx_->tinfo = reinterpret_cast<TeamInfo *>(ctx->team_opaque);
return true;
}
__device__ void GDABackend::destroy_ctx(rocshmem_ctx_t *ctx) {
ctx_free_list.get()->push_back(static_cast<GDAContext *>(ctx->ctx_opaque));
}
void GDABackend::setup_team_world() {
TeamInfo *team_info_wrt_parent, *team_info_wrt_world;
/**
* Allocate device-side memory for team_world and construct a
* GDA team in it.
*/
CHECK_HIP(hipMalloc(&team_info_wrt_parent, sizeof(TeamInfo)));
CHECK_HIP(hipMalloc(&team_info_wrt_world, sizeof(TeamInfo)));
new (team_info_wrt_parent) TeamInfo(nullptr, 0, 1, num_pes);
new (team_info_wrt_world) TeamInfo(nullptr, 0, 1, num_pes);
GDATeam *team_world{nullptr};
CHECK_HIP(hipMalloc(&team_world, sizeof(GDATeam)));
new (team_world) GDATeam(this, team_info_wrt_parent, team_info_wrt_world,
num_pes, my_pe, backend_comm, 0);
team_tracker.set_team_world(team_world);
/**
* Copy the address to ROCSHMEM_TEAM_WORLD.
*/
ROCSHMEM_TEAM_WORLD = reinterpret_cast<rocshmem_team_t>(team_world);
}
void GDABackend::team_destroy(rocshmem_team_t team) {
GDATeam *team_obj = get_internal_gda_team(team);
/* Mark the pool as available */
int bit = team_obj->pool_index_;
int byte_i = bit / CHAR_BIT;
team_pool_bitmask_[byte_i] |= 1 << (bit % CHAR_BIT);
team_obj->~GDATeam();
CHECK_HIP(hipFree(team_obj));
}
//TODO: factorize somewhere else maybe backend_bc
void GDABackend::Alltoall_char_inplace (char *inoutbuf, size_t num_bytes, rocshmem_team_t team) {
// Implement an Alltoall outside of MPI assuming in_place communication
GDATeam *team_obj = reinterpret_cast<GDATeam *>(team);
std::vector<int> pes_in_world;
for (int i = 0; i < num_pes; i++) {
pes_in_world.push_back(team_obj->get_pe_in_world(i));
}
backend_bootstr->groupAlltoall(inoutbuf, num_bytes, pes_in_world);
}
//TODO: factorize somewhere else, maybe backend_bc?
void GDABackend::Allreduce_char_BAND (char* inbuf, char *outbuf, size_t num_bytes,
Team *team) {
// Implement an Allreduce outside of MPI. This is specialized for the scenario
// required for the team creation, i.e. assuming bytes and using BAND operation.
// Implementation uses an Allgather operation followed a local reduction.
GDATeam *team_obj = reinterpret_cast<GDATeam *>(team);
int num_pes = team_obj->num_pes;
std::vector<int> pes_in_world;
char *tmp_buffer = new char[num_pes * num_bytes];
std::memset(tmp_buffer, 0, num_pes * num_bytes);
std::memcpy(&tmp_buffer[my_pe * num_bytes], inbuf, num_bytes);
for (int i = 0; i < num_pes; i++) {
pes_in_world.push_back(team_obj->get_pe_in_world(i));
}
backend_bootstr->groupAllGather(tmp_buffer, num_bytes, pes_in_world);
for (int i = 0; i < num_bytes; i++) {
outbuf[i] = tmp_buffer[i];
for (int j = 1; j < num_pes; j++) {
outbuf[i] &= tmp_buffer[j * num_bytes + i];
}
}
delete[] tmp_buffer;
}
void GDABackend::create_new_team([[maybe_unused]] Team *parent_team,
TeamInfo *team_info_wrt_parent,
TeamInfo *team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
rocshmem_team_t *new_team) {
/**
* Read the bit mask and find out a common index into
* the pool of available work arrays.
*/
if (team_comm != MPI_COMM_NULL) {
NET_CHECK(mpilib_ftable_.Allreduce(team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_,
MPI_CHAR, MPI_BAND, team_comm));
} else {
Allreduce_char_BAND (team_pool_bitmask_, team_reduced_bitmask_, team_bitmask_size_, parent_team);
}
/* Pick the least significant non-zero bit (logical layout) in the reduced
* bitmask */
auto max_num_teams{team_tracker.get_max_num_teams()};
int common_index = get_ls_non_zero_bit(team_reduced_bitmask_, max_num_teams);
if (common_index < 0) {
/* No team available */
printf("Could not create team, all bits in use. Aborting.\n");
abort();
}
/* Mark the team as taken (by unsetting the bit in the pool bitmask) */
int byte = common_index / CHAR_BIT;
team_pool_bitmask_[byte] &= ~(1 << (common_index % CHAR_BIT));
/**
* Allocate device-side memory for team_world and
* construct a GDA team in it
*/
GDATeam *new_team_obj;
CHECK_HIP(hipMalloc(&new_team_obj, sizeof(GDATeam)));
new (new_team_obj)
GDATeam(this, team_info_wrt_parent, team_info_wrt_world, num_pes,
my_pe_in_new_team, team_comm, common_index);
*new_team = get_external_team(new_team_obj);
}
void GDABackend::ctx_create(int64_t options, void **ctx) {
GDAHostContext *new_ctx{nullptr};
new_ctx = new GDAHostContext(this, options);
*ctx = new_ctx;
}
GDAHostContext *get_internal_gda_net_ctx(Context *ctx) {
return reinterpret_cast<GDAHostContext *>(ctx);
}
void GDABackend::ctx_destroy(Context *ctx) {
GDAHostContext *gda_host_ctx{get_internal_gda_net_ctx(ctx)};
delete gda_host_ctx;
}
void GDABackend::reset_backend_stats() {
assert(false);
}
void GDABackend::dump_backend_stats() {
assert(false);
}
__host__ void GDABackend::global_exit(int status) {
if (backend_comm != MPI_COMM_NULL)
mpilib_ftable_.Abort(backend_comm, status);
else
abort();
}
void GDABackend::cleanup_teams() {
free(team_pool_bitmask_);
free(team_reduced_bitmask_);
}
void GDABackend::setup_wrk_sync_buffer() {
/**
* compute work/sync buffer size
*/
auto max_num_teams{team_tracker.get_max_num_teams()};
/**
* size of barrier sync
*/
wrk_sync_pool_size_ += sizeof(*barrier_sync) * ROCSHMEM_BARRIER_SYNC_SIZE;
/**
* Size of sync arrays for the teams
*/
wrk_sync_pool_size_ += sizeof(long) * max_num_teams *
(ROCSHMEM_BARRIER_SYNC_SIZE +
ROCSHMEM_REDUCE_SYNC_SIZE +
ROCSHMEM_BCAST_SYNC_SIZE +
ROCSHMEM_ALLTOALL_SYNC_SIZE);
/**
* Size of work arrays for the teams
* Accommodate largest possible data type for pWrk
*/
wrk_sync_pool_size_ += sizeof(double) * max_num_teams *
ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE;
/**
* Size of fence array
*/
wrk_sync_pool_size_ += sizeof(int) * num_pes; //TODO: do we need a fence array?
/**
* Allocate a buffer of size wrk_sync_pool_size_, using heap memory
* (should be uncached fine-grained ideally)
*/
heap.malloc((void**)&wrk_sync_pool_, wrk_sync_pool_size_);
assert(wrk_sync_pool_);
wrk_sync_pool_top_ = wrk_sync_pool_;
}
void GDABackend::cleanup_wrk_sync_buffer() {
heap.free(wrk_sync_pool_);
}
void GDABackend::setup_fence_buffer() { //TODO is this used?
/*
* Reserve memory for fence
*/
fence_pool = reinterpret_cast<int *>(wrk_sync_pool_top_);
wrk_sync_pool_top_ += sizeof(int) * num_pes;
}
void GDABackend::setup_collectives() {
/*
* Allocate heap space for barrier_sync
*/
size_t one_sync_size_bytes {sizeof(*barrier_sync)};
size_t sync_size_bytes {one_sync_size_bytes * ROCSHMEM_BARRIER_SYNC_SIZE};
barrier_sync = reinterpret_cast<int64_t*>(wrk_sync_pool_top_);
wrk_sync_pool_top_ += sync_size_bytes;
/*
* Initialize the barrier synchronization array with default values.
*/
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
barrier_sync[i] = ROCSHMEM_SYNC_VALUE;
}
/*
* Make sure that all processing elements have done this before
* continuing.
*/
rte_barrier();
}
void GDABackend::setup_teams() {
/**
* Allocate pools for the teams sync and work arrary from the SHEAP.
*/
auto max_num_teams{team_tracker.get_max_num_teams()};
barrier_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BARRIER_SYNC_SIZE
* max_num_teams;
reduce_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_REDUCE_SYNC_SIZE
* max_num_teams;
bcast_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
* max_num_teams;
alltoall_pSync_pool = reinterpret_cast<long *>(wrk_sync_pool_top_);
wrk_sync_pool_top_ += sizeof(long) * ROCSHMEM_BCAST_SYNC_SIZE
* max_num_teams;
/* Accommodating for largest possible data type for pWrk */
pWrk_pool = reinterpret_cast<void *>(wrk_sync_pool_top_);
wrk_sync_pool_top_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
* max_num_teams;
/**
* Initialize the sync arrays in the pool with default values.
*/
long *barrier_pSync, *reduce_pSync, *bcast_pSync, *alltoall_pSync;
for (int team_i = 0; team_i < max_num_teams; team_i++) {
barrier_pSync = reinterpret_cast<long *>(
&barrier_pSync_pool[team_i * ROCSHMEM_BARRIER_SYNC_SIZE]);
reduce_pSync = reinterpret_cast<long *>(
&reduce_pSync_pool[team_i * ROCSHMEM_REDUCE_SYNC_SIZE]);
bcast_pSync = reinterpret_cast<long *>(
&bcast_pSync_pool[team_i * ROCSHMEM_BCAST_SYNC_SIZE]);
alltoall_pSync = reinterpret_cast<long *>(
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
for (size_t i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (size_t i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (size_t i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
for (size_t i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
}
}
/**
* Initialize bit mask
*
* Logical:
* MSB..........................................................................LSB
* Physical: MSB...1st least significant 8 bits...LSB MSB...2nd least
* signifant 8 bits...LSB
*
* Description shows only a 2-byte long mask but idea extends to any
* arbitrary size.
*/
team_bitmask_size_ = (max_num_teams % CHAR_BIT) ? (max_num_teams / CHAR_BIT + 1)
: (max_num_teams / CHAR_BIT);
team_pool_bitmask_ = reinterpret_cast<char *>(malloc(team_bitmask_size_));
team_reduced_bitmask_ = reinterpret_cast<char *>(malloc(team_bitmask_size_));
memset(team_pool_bitmask_, 0, team_bitmask_size_);
memset(team_reduced_bitmask_, 0, team_bitmask_size_);
/* Set all to available except the 0th one (reserved for TEAM_WORLD) */
for (int bit_i = 1; bit_i < max_num_teams; bit_i++) {
int byte_i = bit_i / CHAR_BIT;
team_pool_bitmask_[byte_i] |= 1 << (bit_i % CHAR_BIT);
}
/**
* Make sure that all processing elements have done this before
* continuing.
*/
rte_barrier();
}
void GDABackend::rte_barrier() {
if (backend_comm != MPI_COMM_NULL) {
NET_CHECK(mpilib_ftable_.Barrier(backend_comm));
} else {
backend_bootstr->barrier();
}
}
int GDABackend::mlx5_dv_dl_init () {
mlx5dv_handle_ = dlopen("libmlx5.so", RTLD_NOW);
if (!mlx5dv_handle_) {
DPRINTF("Could not open libmlx5.so. Returning\n");
return ROCSHMEM_ERROR;
}
DLSYM_HELPER(mlx5dv_ftable_, mlx5dv_, mlx5dv_handle_, init_obj);
return ROCSHMEM_SUCCESS;
}
void GDABackend::setup_ibv() {
autodetect_dv_libs();
open_ib_device();
create_queues();
exchange_qp_dest_info();
modify_qps_reset_to_init();
modify_qps_init_to_rtr();
modify_qps_rtr_to_rts();
rte_barrier();
}
void GDABackend::cleanup_ibv() {
int err;
if (gda_vendor == GDAVendor::BNXT) {
CHECK_HIP(hipHostUnregister(db_region_attr.dbr));
for (int i = 0; i < qps.size(); i++) {
err = bnxtdv_ftable_.destroy_qp(qps[i]);
CHECK_ZERO(err, "bnxt_re_dv_destroy_qp");
err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.rq_umem_handle);
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (RQ)");
err = bnxtdv_ftable_.umem_dereg(bnxt_qps[i].attr.sq_umem_handle);
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg (SQ)");
CHECK_HIP(hipFree(bnxt_qps[i].sq_buf));
CHECK_HIP(hipFree(bnxt_qps[i].rq_buf));
err = bnxtdv_ftable_.destroy_cq(cqs[i]);
CHECK_ZERO(err, "bnxt_re_dv_destroy_cq");
err = bnxtdv_ftable_.umem_dereg(bnxt_cqs[i].umem_handle);
CHECK_ZERO(err, "bnxt_re_dv_umem_dereg");
CHECK_HIP(hipFree(bnxt_cqs[i].buf));
}
} else {
for (int i = 0; i < qps.size(); i++) {
err = ibv_destroy_qp(qps[i]);
CHECK_ZERO(err, "ibv_destroy_qp");
err = ibv_destroy_cq(cqs[i]);
CHECK_ZERO(err, "ibv_destroy_cqs");
}
if (gda_vendor == GDAVendor::IONIC) {
err = ibv_dealloc_pd(pd_uxdma[0]);
CHECK_ZERO(err, "ibv_dealloc_pd (uxdma[0])");
err = ibv_dealloc_pd(pd_uxdma[1]);
CHECK_ZERO(err, "ibv_dealloc_pd (uxdma[1])");
}
err = ibv_dealloc_pd(pd_parent);
CHECK_ZERO(err, "ibv_dealloc_pd (pd_parent)");
}
err = ibv_dealloc_pd(pd_orig);
CHECK_ZERO(err, "ibv_dealloc_pd (pd_orig)");
err = ibv_close_device(context);
CHECK_ZERO(err, "ibv_close_device");
}
void GDABackend::autodetect_dv_libs() {
int ret;
#ifdef GDA_IONIC
gda_vendor = GDAVendor::IONIC;
#endif
if (gda_vendor == GDAVendor::NONE) {
ret = bnxt_dv_dl_init();
if (ret == ROCSHMEM_SUCCESS) {
gda_vendor = GDAVendor::BNXT;
} else {
DPRINTF("Initializing rocSHMEM BNXT GDA support failed\n");
}
}
if (gda_vendor == GDAVendor::NONE) {
ret = mlx5_dv_dl_init();
if (ret == ROCSHMEM_SUCCESS) {
gda_vendor = GDAVendor::MLX5;
} else {
DPRINTF("Initializing rocSHMEM MLX5 GDA support failed\n");
}
}
if (gda_vendor == GDAVendor::NONE) {
printf("Initializing rocSHMEM with IONIC, BNXT, or MLX5 GDA support failed\n");
abort();
}
}
void GDABackend::exchange_qp_dest_info() {
for (int i = 0; i < qps.size(); i++) {
dest_info[i].lid = portinfo.lid;
dest_info[i].qpn = qps[i]->qp_num;
dest_info[i].psn = 0;
dest_info[i].gid = gid;
}
for (size_t i = 0; i < envvar::max_num_contexts + 1; i++) {
if (backend_comm != MPI_COMM_NULL) {
mpilib_ftable_.Alltoall(MPI_IN_PLACE, sizeof(dest_info_t), MPI_CHAR, dest_info.data() + i * num_pes, sizeof(dest_info_t), MPI_CHAR, backend_comm);
} else {
Alltoall_char_inplace(reinterpret_cast<char*>(dest_info.data() + i * num_pes), sizeof(dest_info_t), ROCSHMEM_TEAM_WORLD);
}
}
}
void GDABackend::setup_heap_memory_rkey() {
auto *base_heap = heap.get_local_heap_base();
int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
heap_mr = ibv_reg_mr(pd_orig, base_heap, heap.get_size(), access);
CHECK_NNULL(heap_mr, "ibv_reg_mr");
const size_t rkeys_size = sizeof(uint32_t) * num_pes;
uint32_t *host_rkey_cpy = reinterpret_cast<uint32_t*>(malloc(rkeys_size));
if (!host_rkey_cpy) { abort(); }
CHECK_HIP(hipHostMalloc(&heap_rkey, sizeof(uint32_t) * num_pes));
heap_rkey[my_pe] = heap_mr->rkey;
hipStream_t stream;
CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
CHECK_HIP(hipMemcpyAsync(host_rkey_cpy, heap_rkey, rkeys_size, hipMemcpyDeviceToHost, stream));
CHECK_HIP(hipStreamSynchronize(stream));
if (backend_comm != MPI_COMM_NULL)
mpilib_ftable_.Allgather(MPI_IN_PLACE, sizeof(uint32_t), MPI_CHAR, host_rkey_cpy, sizeof(uint32_t), MPI_CHAR, backend_comm);
else
backend_bootstr->allGather(host_rkey_cpy, sizeof(uint32_t));
CHECK_HIP(hipMemcpyAsync(heap_rkey, host_rkey_cpy, rkeys_size, hipMemcpyHostToDevice, stream));
CHECK_HIP(hipStreamSynchronize(stream));
CHECK_HIP(hipStreamDestroy(stream));
free(host_rkey_cpy);
}
void GDABackend::cleanup_heap_memory_rkey() {
int ret = ibv_dereg_mr(heap_mr);
CHECK_ZERO(ret, "ibv_dereg_mr");
CHECK_HIP(hipHostFree(heap_rkey));
}
void GDABackend::setup_gpu_qps() {
size_t qp_objs_count;
size_t qp_objs_mem_size;
qp_objs_count = (envvar::max_num_contexts + 1) * num_pes;
qp_objs_mem_size = sizeof(QueuePair) * qp_objs_count;
CHECK_HIP(hipMalloc(&gpu_qps, qp_objs_mem_size));
host_qps = (QueuePair*) malloc(qp_objs_mem_size);
CHECK_NNULL(host_qps, "malloc (host_qps)");
for (size_t i = 0; i < qp_objs_count; i++) {
new (&host_qps[i]) QueuePair(pd_orig, gda_vendor);
CHECK_HIP(hipMemcpy(&gpu_qps[i], &host_qps[i], sizeof(QueuePair), hipMemcpyDefault));
if (gda_vendor == GDAVendor::BNXT) {
bnxt_initialize_gpu_qp(&gpu_qps[i], i);
} else {
initialize_gpu_qp(&gpu_qps[i], i);
}
}
}
void GDABackend::cleanup_gpu_qps() {
size_t qp_objs_count;
qp_objs_count = (envvar::max_num_contexts + 1) * num_pes;
for (size_t i = 0; i < qp_objs_count; i++) {
host_qps[i].~QueuePair();
}
free(host_qps);
CHECK_HIP(hipFree(gpu_qps));
gpu_qps = nullptr;
}
//TODO this ifdef sequence should go in a nic-specific file, like it is for bnxt, maybe whats above too?
void GDABackend::open_ib_device() {
struct ibv_device **device_list = nullptr;
struct ibv_device *device = nullptr;
int num_devices = 0;
int err;
device_list = ibv_get_device_list(&num_devices);
CHECK_NNULL(device_list, "ibv_get_device_list");
device = device_list[0]; //TODO default to HIP selected device?
if (requested_dev) {
for (int i = 0; i < num_devices; i++) {
const char *select_device = ibv_get_device_name(device_list[i]);
CHECK_NNULL(select_device, "ibv_get_device_name");
if (strstr(select_device, requested_dev)) {
device = device_list[i];
break;
}
}
}
context = ibv_open_device(device);
CHECK_NNULL(context, "ib open device");
dump_ibv_context(context);
dump_ibv_device(context->device);
pd_orig = ibv_alloc_pd(context);
CHECK_NNULL(pd_orig, "ib allocate pd");
dump_ibv_pd(pd_orig);
if (gda_vendor == GDAVendor::IONIC || gda_vendor == GDAVendor::MLX5) {
create_parent_domain();
}
err = ibv_query_port(context, port, &portinfo);
CHECK_ZERO(err, "ibv_query_port");
dump_ibv_port_attr(&portinfo);
/* Must init after querying port */
select_gid_index();
ibv_free_device_list(device_list);
}
void GDABackend::modify_qps_reset_to_init() {
int err;
struct ibv_qp_attr attr;
int attr_mask;
memset(&attr, 0, sizeof(struct ibv_qp_attr));
attr.qp_state = IBV_QPS_INIT;
attr.pkey_index = 0;
attr.port_num = port;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE
| IBV_ACCESS_LOCAL_WRITE
| IBV_ACCESS_REMOTE_READ
| IBV_ACCESS_REMOTE_ATOMIC;
attr_mask = IBV_QP_STATE
| IBV_QP_PKEY_INDEX
| IBV_QP_PORT
| IBV_QP_ACCESS_FLAGS;
for (int i =0; i < qps.size() ; i++) {
if (gda_vendor == GDAVendor::BNXT) {
err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
} else {
err = ibv_modify_qp(qps[i], &attr, attr_mask);
}
CHECK_ZERO(err, "modify_qp (INIT)");
}
}
void GDABackend::modify_qps_init_to_rtr() {
struct ibv_qp_attr attr;
int attr_mask;
int err;
memset(&attr, 0, sizeof(struct ibv_qp_attr));
attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = portinfo.active_mtu;
attr.min_rnr_timer = 12;
attr.ah_attr.port_num = port;
if (gda_vendor == GDAVendor::IONIC) {
attr.max_dest_rd_atomic = 15;
} else {
attr.max_dest_rd_atomic = 1;
}
if (portinfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
attr.ah_attr.grh.sgid_index = gid_index;
attr.ah_attr.is_global = 1;
attr.ah_attr.grh.hop_limit = 1;
attr.ah_attr.sl = 1;
attr.ah_attr.grh.traffic_class = envvar::gda::traffic_class;
}
attr_mask = IBV_QP_STATE
| IBV_QP_PATH_MTU
| IBV_QP_RQ_PSN
| IBV_QP_DEST_QPN
| IBV_QP_AV
| IBV_QP_MAX_DEST_RD_ATOMIC
| IBV_QP_MIN_RNR_TIMER;
for (int i = 0; i < qps.size(); i++) {
attr.rq_psn = dest_info[i].psn;
attr.dest_qp_num = dest_info[i].qpn;
if (portinfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
memcpy(&attr.ah_attr.grh.dgid, &dest_info[i].gid, 16);
} else {
attr.ah_attr.dlid = dest_info[i].lid;
}
if (gda_vendor == GDAVendor::BNXT) {
err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
} else {
err = ibv_modify_qp(qps[i], &attr, attr_mask);
}
CHECK_ZERO(err, "modify_qp (RTR)");
}
}
void GDABackend::modify_qps_rtr_to_rts() {
struct ibv_qp_attr attr;
int attr_mask;
int err;
memset(&attr, 0, sizeof(struct ibv_qp_attr));
attr.qp_state = IBV_QPS_RTS;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.rnr_retry = 7;
if (gda_vendor == GDAVendor::IONIC) {
attr.max_rd_atomic = 15;
} else {
attr.max_rd_atomic = 1;
}
attr_mask = IBV_QP_STATE
| IBV_QP_SQ_PSN
| IBV_QP_MAX_QP_RD_ATOMIC
| IBV_QP_TIMEOUT
| IBV_QP_RETRY_CNT
| IBV_QP_RNR_RETRY;
for (int i = 0; i < qps.size(); i++) {
attr.sq_psn = dest_info[i].psn;
if (gda_vendor == GDAVendor::BNXT) {
err = bnxtdv_ftable_.modify_qp(qps[i], &attr, attr_mask, 0, 0);
} else {
err = ibv_modify_qp(qps[i], &attr, attr_mask);
}
CHECK_ZERO(err, "modify_qp (RTS)");
}
}
void GDABackend::create_queues() {
int ncqes;
size_t resize_length;
if (gda_vendor == GDAVendor::IONIC) {
ncqes = envvar::sq_size << 1;
} else {
ncqes = envvar::sq_size;
}
resize_length = (envvar::max_num_contexts + 1) * num_pes;
dest_info.resize(resize_length);
cqs.resize(resize_length);
qps.resize(resize_length);
bnxt_cqs.resize(resize_length);
bnxt_qps.resize(resize_length);
if (gda_vendor == GDAVendor::BNXT) {
bnxt_create_cqs(ncqes);
bnxt_create_qps(envvar::sq_size);
} else {
create_cqs(ncqes);
create_qps(envvar::sq_size);
}
alternate_qp_ports();
}
void GDABackend::alternate_qp_ports() {
size_t cur_qp_idx;
size_t new_qp_idx;
/* We can't remap anything */
if (envvar::max_num_contexts == 1) {
return;
}
if (envvar::gda::alternate_qp_ports) {
/* If we assume two PEs and a default context and two user context,
* initially QPs are in the following port order:
*
* Labels :| DCTX PE0 | DCTX PE1 | CTX0 PE0 | CTX0 PE1 | CTX1 PE0 | CTX1 PE1 |
* QPs :| QP0 | QP1 | QP2 | QP3 | QP4 | QP5 |
* Port :| 0 | 1 | 0 | 1 | 0 | 1 |
*
* This creates the pattern where PE1 is always mapped to port 0 but we want it
* to use both ports to maximize throughput/bandwidth.
*
* So we reorder our QPs
*
* Labels :| DCTX PE0 | DCTX PE1 | CTX0 PE0 | CTX0 PE1 | CTX1 PE0 | CTX1 PE1 |
* QPs :| QP0 | QP1 | QP2 | QP4 | QP3 | QP5 |
* Port :| 0 | 1 | 1 | 0 | 0 | 1 |
*
* We alternate the ports [0,1] and [1,0] for each context.
* Therefore, when we use two contexts we use both ports
*
*/
/* Re-Map each context */
for (size_t i = 1; i < (envvar::max_num_contexts + 1); i += 2) {
for (size_t p = 0; p < num_pes; p += 2) {
cur_qp_idx = (i * num_pes) + p;
new_qp_idx = cur_qp_idx + 1;
if (new_qp_idx < qps.size()) {
// Swap QPs
std::swap(cqs[cur_qp_idx], cqs[new_qp_idx]);
std::swap(qps[cur_qp_idx], qps[new_qp_idx]);
std::swap(bnxt_cqs[cur_qp_idx], bnxt_cqs[new_qp_idx]);
std::swap(bnxt_qps[cur_qp_idx], bnxt_qps[new_qp_idx]);
}
}
}
}
}
void* GDABackend::pd_alloc_device_uncached(struct ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type) {
void* dev_ptr{nullptr};
CHECK_HIP(hipExtMallocWithFlags(reinterpret_cast<void**>(&dev_ptr), size, hipDeviceMallocUncached));
memset(dev_ptr, 0, size);
return dev_ptr;
}
void* GDABackend::pd_alloc_host(struct ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type) {
void* dev_ptr{nullptr};
CHECK_HIP(hipHostMalloc(reinterpret_cast<void**>(&dev_ptr), size, hipHostMallocDefault));
memset(dev_ptr, 0, size);
return dev_ptr;
}
void GDABackend::pd_release(struct ibv_pd* pd, void* pd_context, void* ptr, uint64_t resource_type) {
CHECK_HIP(hipFree(ptr));
}
void GDABackend::create_parent_domain() {
struct ibv_parent_domain_init_attr pattr;
memset(&pattr, 0, sizeof(struct ibv_parent_domain_init_attr));
pattr.pd = pd_orig;
pattr.td = nullptr,
pattr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS;
pattr.free = GDABackend::pd_release;
pattr.pd_context = nullptr;
if (gda_vendor == GDAVendor::IONIC) {
pattr.alloc = GDABackend::pd_alloc_device_uncached;
} else {
pattr.alloc = GDABackend::pd_alloc_host;
}
pd_parent = ibv_alloc_parent_domain(context, &pattr);
CHECK_NNULL(pd_parent, "ibv_alloc_parent_domain");
dump_ibv_pd(pd_parent);
#ifdef GDA_IONIC
ionic_dv_pd_set_sqcmb(pd_parent, false, false, false);
ionic_dv_pd_set_rqcmb(pd_parent, false, false, false);
for (int uxdma_i = 0; uxdma_i < 2; ++uxdma_i) {
pd_uxdma[uxdma_i] = ibv_alloc_parent_domain(context, &pattr);
CHECK_NNULL(pd_uxdma[uxdma_i], "ibv_alloc_parent_domain (uxdma)");
ionic_dv_pd_set_sqcmb(pd_uxdma[uxdma_i], false, false, false);
ionic_dv_pd_set_rqcmb(pd_uxdma[uxdma_i], false, false, false);
ionic_dv_pd_set_udma_mask(pd_uxdma[uxdma_i], 1u << uxdma_i);
}
#endif /* GDA_IONIC */
}
void GDABackend::create_cqs(int cqe) {
struct ibv_cq_init_attr_ex cq_attr;
struct ibv_cq_ex *cq_ex;
memset(&cq_attr, 0, sizeof(struct ibv_cq_init_attr_ex));
cq_attr.cqe = cqe;
cq_attr.cq_context = nullptr;
cq_attr.channel = nullptr;
cq_attr.comp_vector = 0;
cq_attr.flags = 0;
cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_PD;
cq_attr.parent_domain = pd_parent;
for (int i = 0; i < qps.size(); i++) {
if (gda_vendor == GDAVendor::IONIC) {
cq_attr.parent_domain = pd_uxdma[i & 1];
}
cq_ex = ibv_create_cq_ex(context, &cq_attr);
CHECK_NNULL(cq_ex, "ibv_create_cq_ex");
cqs[i] = ibv_cq_ex_to_cq(cq_ex);
CHECK_NNULL(cqs[i], "ibv_cq_ex_to_cq");
}
}
void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
int hip_dev_id{-1};
CHECK_HIP(hipGetDevice(&hip_dev_id));
#ifdef GDA_IONIC
ionic_dv_ctx dvctx;
ionic_dv_get_ctx(&dvctx, context);
void* gpu_db_page = nullptr;
rocm_memory_lock_to_fine_grain(dvctx.db_page, 0x1000, &gpu_db_page, hip_dev_id);
uint64_t *db_page_u64 = reinterpret_cast<uint64_t*>(dvctx.db_page);
uint64_t *gpu_db_page_u64 = reinterpret_cast<uint64_t*>(gpu_db_page);
uint64_t *gpu_db_ptr = &gpu_db_page_u64[dvctx.db_ptr - db_page_u64];
gpu_db_page = gpu_db_page;
gpu_db_cq = &gpu_db_ptr[dvctx.cq_qtype];
gpu_db_sq = &gpu_db_ptr[dvctx.sq_qtype];
uint8_t udma_idx = ionic_dv_qp_get_udma_idx(qps[conn_num]);
ionic_dv_cq dvcq;
ionic_dv_get_cq(&dvcq, cqs[conn_num], udma_idx);
gpu_qp->cq_dbreg = gpu_db_cq;
gpu_qp->cq_dbval = dvcq.q.db_val;
gpu_qp->cq_mask = dvcq.q.mask;
gpu_qp->ionic_cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
ionic_dv_qp dvqp;
ionic_dv_get_qp(&dvqp, qps[conn_num]);
gpu_qp->sq_dbreg = gpu_db_sq;
gpu_qp->sq_dbval = dvqp.sq.db_val;
gpu_qp->sq_mask = dvqp.sq.mask;
gpu_qp->ionic_sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
strncpy(gpu_qp->dev_name,
qps[conn_num]->context->device->name,
sizeof(gpu_qp->dev_name));
gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0;
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->lkey = heap_mr->lkey;
gpu_qp->rkey = heap_rkey[conn_num % num_pes];
gpu_qp->inline_threshold = 32;
#endif /* GDA_IONIC */
if (gda_vendor == GDAVendor::MLX5) {
mlx5dv_cq cq_out;
mlx5dv_obj mlx_obj;
mlx_obj.cq.in = cqs[conn_num];
mlx_obj.cq.out = &cq_out;
mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_CQ);
dump_mlx5dv_cq(&cq_out, conn_num);
/*
* struct mlx5dv_cq {
* void *buf;
* __be32 *dbrec;
* uint32_t cqe_cnt;
* uint32_t cqe_size;
* void *cq_uar;
* uint32_t cqn;
* uint64_t comp_mask;
* };
*/
gpu_qp->cq_buf = reinterpret_cast<mlx5_cqe64*>(cq_out.buf);
gpu_qp->cq_cnt = cq_out.cqe_cnt;
gpu_qp->cq_log_cnt = log2(cq_out.cqe_cnt);
gpu_qp->cq_dbrec = cq_out.dbrec;
mlx5dv_qp qp_out;
mlx_obj.qp.in = qps[conn_num];
mlx_obj.qp.out = &qp_out;
mlx5dv_ftable_.init_obj(&mlx_obj, MLX5DV_OBJ_QP);
dump_mlx5dv_qp(&qp_out, conn_num);
/*
* struct mlx5dv_qp {
* __be32 *dbrec;
* struct {
* void *buf;
* uint32_t wqe_cnt;
* uint32_t stride;
* } sq;
* struct {
* void *buf;
* uint32_t wqe_cnt;
* uint32_t stride;
* } rq;
* struct {
* void *reg;
* uint32_t size;
* } bf;
* uint64_t comp_mask;
* off_t uar_mmap_offset;
* uint32_t tirn;
* uint32_t tisn;
* uint32_t rqn;
* uint32_t sqn;
* uint64_t tir_icm_addr;
* };
*/
gpu_qp->dbrec = &qp_out.dbrec[1]; // points to two pointers: 0 -> MLX5_REC_DBR, 1 -> MLX5_SND_DBR
gpu_qp->sq_buf = reinterpret_cast<uint64_t*>(qp_out.sq.buf);
gpu_qp->sq_wqe_cnt = qp_out.sq.wqe_cnt;
gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
gpu_qp->lkey = htobe32(heap_mr->lkey);
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->inline_threshold = inline_threshold;
// The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
void* gpu_ptr{nullptr};
rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
gpu_qp->db.ptr = reinterpret_cast<uint64_t*>(gpu_ptr);
}
}
void GDABackend::create_qps(int sq_length) {
struct ibv_qp_init_attr_ex attr;
memset(&attr, 0, sizeof(struct ibv_qp_init_attr_ex));
attr.cap.max_send_wr = sq_length;
attr.cap.max_send_sge = 1;
attr.cap.max_inline_data = inline_threshold;
attr.sq_sig_all = 0;
attr.qp_type = IBV_QPT_RC;
attr.comp_mask = IBV_QP_INIT_ATTR_PD;
attr.pd = pd_parent;
if (gda_vendor == GDAVendor::IONIC) {
attr.cap.max_recv_sge = 1; // TODO allow zero sges in the driver
}
for (int i = 0; i < qps.size(); i++) {
if (gda_vendor == GDAVendor::IONIC) {
attr.pd = pd_uxdma[i & 1];
}
attr.send_cq = cqs[i];
attr.recv_cq = cqs[i];
qps[i] = ibv_create_qp_ex(context, &attr);
CHECK_NNULL(qps[i], "ibv_create_qp_ex");
}
}
void GDABackend::select_gid_index() {
struct ibv_gid_entry *gid_entries;
struct ibv_gid_entry *gid_entry;
union ibv_gid current_gid;
union ibv_gid selected_gid;
uint32_t gid_type;
int err;
const uint8_t local_gid_prefix[2] = {0xFE, 0x80};
uint32_t selected_gid_type = IBV_GID_TYPE_ROCE_V1;
int selected_gid_index = -1;
ssize_t gid_tbl_entries = 0;
int gid_tbl_len = portinfo.gid_tbl_len;
gid_entries = (struct ibv_gid_entry*) calloc(gid_tbl_len, sizeof(struct ibv_gid_entry));
gid_tbl_entries = ibv_query_gid_table(context, gid_entries, gid_tbl_len, 0);
if (gid_tbl_entries < 0) {
fprintf(stderr, "[Warning] ibv_query_gid_table failed. No available GIDs\n");
free(gid_entries);
return;
}
for (int i = 0; i < gid_tbl_entries; i++) {
gid_type = gid_entries[i].gid_type;
/* rocSHMEM does not use GIDs for IB mode */
if (gid_type == IBV_GID_TYPE_IB) {
break;
}
current_gid = gid_entries[i].gid;
err = ibv_query_gid(context, port, i, &current_gid);
CHECK_ZERO(err, "ibv_query_gid");
/* We don't want local GIDs */
if (memcmp(current_gid.raw, &local_gid_prefix, 2) == 0) {
continue;
}
/* Initialize using first available GID */
if (selected_gid_index == -1) {
selected_gid_index = i;
selected_gid_type = gid_type;
selected_gid = current_gid;
}
/* Choose RoCEv2 over RoCEv1 */
else if (gid_type > selected_gid_type) {
selected_gid_index = i;
selected_gid_type = gid_type;
selected_gid = current_gid;
}
}
gid_index = selected_gid_index;
gid = selected_gid;
free(gid_entries);
}
int GDABackend::ibv_mtu_to_int(enum ibv_mtu mtu) {
switch (mtu) {
case IBV_MTU_256: return 256;
case IBV_MTU_512: return 512;
case IBV_MTU_1024: return 1024;
case IBV_MTU_2048: return 2048;
case IBV_MTU_4096: return 4096;
default: {
fprintf(stderr, "[ERROR] Invalid ibv_mtu\n");
return 0;
}
}
}
} // namespace rocshmem