gda ionic: restore functionality of ionic gda in rocshmem (#269)

* Revamp findibverbs to find ionic again

* gda ionic: rename ionic_sq_buf ionic_cq_buf

Avoid duplicating member names used by mlx5 gda.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda: move spin lock to util.hpp

Move spin lock out of ionic gda to util.hpp.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: assume latest fwabi changes

There is no firmware abi compatibility in this ionic gda code yet, so
assume we are using the latest firmware abi as of now.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: allow doorbell with incomplete wqes

Use spin lock to ensure doorbell is only written with an increasing
producer index.  Ring the doorbell after this wave has initialized its
wqes.  Wqes of other waves might not be fully initialized, but firmware
will not process them until the phase/color flag is updated in the
respective wqes.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: poll cq for additional completions

Keep polling the cq for more than just the minimum number of completions
for this wave of threads to make progress, as long as the cq is not
empty.  This is part of wave-optimized cq polling: at the expense of one
wave polling additional completions, it was observed that nearly all
other waves avoid taking the cq lock at all.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda: max_rd_atomic in rts transition

In modify_qp(RTS), specify max_rd_atomic, not max_dest_rd_atomic.

By not specifying max_rd_atomic (rather, max_rd_atomic=zero), the local
nic may get stuck transmitting the first read or atomic request.  One
read or atomic request is greater than the initiator depth of zero.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: allow specifying traffic class

Allow specifying a traffic class.  The network might have a specific
traffic class configured as no-drop, for example.

Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>
Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: tweak uxdma assignment

The ideal arrangement will have an equal number of QPs active on each
uxdma pipeline.

Pre-rebase, the better arrangement for rocshmem functional test
benchmarks was [0, 1], [1, 0], [0, 1], [1, 0], ...

Now, following changes that add 'ROCSHMEM_GDA_ALTERNATE_QP_PORTS=1' by
default, the better arrangement is [0, 1], [0, 1], [0, 1], [0, 1], ...

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

---------

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>
Co-authored-by: Aurelien Bouteiller <abouteil@amd.com>
Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>
This commit is contained in:
Allen Hubbe
2025-10-07 10:08:19 -04:00
committed by GitHub
vanhempi a0fcbf8d35
commit c84bbc250b
9 muutettua tiedostoa jossa 173 lisäystä ja 89 poistoa
+18 -14
Näytä tiedosto
@@ -43,29 +43,31 @@ find_library(IBVerbs_LIBRARY
)
if (GDA_IONIC)
find_library(IBVerbs_PROVIDER_LIBRARY
list(APPEND provider_vars IBVerbs_IONIC_LIBRARY IBVerbs_IONIC_INCLUDE_DIR)
find_path(IBVerbs_IONIC_INCLUDE_DIR infiniband/ionic_dv.h
HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS}
PATH_SUFFIXES include
)
find_library(IBVerbs_IONIC_LIBRARY
NAMES ionic libionic
HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS}
PATH_SUFFIXES lib lib64
)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG
IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY
add_library(IBVerbs::verbs_ionic UNKNOWN IMPORTED)
set_target_properties(IBVerbs::verbs_ionic PROPERTIES
IMPORTED_LOCATION "${IBVerbs_IONIC_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_IONIC_INCLUDE_DIR}"
)
mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY)
add_library(IBVerbs::verbs_provider UNKNOWN IMPORTED)
set_target_properties(IBVerbs::verbs_provider PROPERTIES
IMPORTED_LOCATION "${IBVerbs_PROVIDER_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_PROVIDER_INCLUDE_DIR}"
)
target_link_libraries(IBVerbs::verbs IBVerbs::verbs_provider)
endif()
find_package_handle_standard_args(IBVerbs DEFAULT_MSG
IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR
IBVerbs_LIBRARY
IBVerbs_INCLUDE_DIR
${provider_vars}
)
mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR)
mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR ${provider_vars})
if (IBVerbs_FOUND)
add_library(IBVerbs::verbs UNKNOWN IMPORTED)
@@ -75,6 +77,8 @@ set_target_properties(IBVerbs::verbs PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}"
)
target_link_libraries(IBVerbs::verbs INTERFACE)
target_link_libraries(IBVerbs::verbs INTERFACE
$<TARGET_NAME_IF_EXISTS:IBVerbs::verbs_ionic>
)
endif()
+1
Näytä tiedosto
@@ -63,6 +63,7 @@ namespace envvar {
namespace gda {
const var<bool> alternate_qp_ports("ALTERNATE_QP_PORTS", "", true);
const var<uint8_t> traffic_class("TRAFFIC_CLASS", "", 0);
} // namespace gda
namespace _detail {
+5 -1
Näytä tiedosto
@@ -106,9 +106,12 @@ namespace envvar {
};
using var_types = unique_type_sequence_t<bool,
uint8_t,
int32_t,
uint32_t,
size_t,
int64_t,
uint32_t,
uint64_t,
useconds_t,
std::string,
types::socket_family,
@@ -443,6 +446,7 @@ namespace envvar {
namespace gda {
template <typename T> using var = var<T, category::tag::GDA>;
extern const var<bool> alternate_qp_ports;
extern const var<uint8_t> traffic_class;
} // namespace gda
} // namespace envvar
} // namespace rocshmem
+12 -6
Näytä tiedosto
@@ -841,6 +841,7 @@ void GDABackend::modify_qps_init_to_rtr() {
attr.ah_attr.is_global = 1;
attr.ah_attr.grh.hop_limit = 1;
attr.ah_attr.sl = 1;
attr.ah_attr.grh.traffic_class = envvar::gda::traffic_class;
}
attr_mask = IBV_QP_STATE
@@ -882,9 +883,9 @@ void GDABackend::modify_qps_rtr_to_rts() {
attr.rnr_retry = 7;
if (gda_vendor == GDAVendor::IONIC) {
attr.max_dest_rd_atomic = 15;
attr.max_rd_atomic = 15;
} else {
attr.max_dest_rd_atomic = 1;
attr.max_rd_atomic = 1;
}
attr_mask = IBV_QP_STATE
@@ -1053,7 +1054,7 @@ void GDABackend::create_cqs(int cqe) {
for (int i = 0; i < qps.size(); i++) {
if (gda_vendor == GDAVendor::IONIC) {
cq_attr.parent_domain = pd_uxdma[((i + 1) / 2) & 1];
cq_attr.parent_domain = pd_uxdma[i & 1];
}
cq_ex = ibv_create_cq_ex(context, &cq_attr);
@@ -1093,7 +1094,7 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
gpu_qp->cq_dbval = dvcq.q.db_val;
gpu_qp->cq_mask = dvcq.q.mask;
gpu_qp->cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
gpu_qp->ionic_cq_buf = reinterpret_cast<ionic_v1_cqe*>(dvcq.q.ptr);
ionic_dv_qp dvqp;
ionic_dv_get_qp(&dvqp, qps[conn_num]);
@@ -1101,7 +1102,12 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
gpu_qp->sq_dbreg = gpu_db_sq;
gpu_qp->sq_dbval = dvqp.sq.db_val;
gpu_qp->sq_mask = dvqp.sq.mask;
gpu_qp->sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
gpu_qp->ionic_sq_buf = reinterpret_cast<ionic_v1_wqe *>(dvqp.sq.ptr);
strncpy(gpu_qp->dev_name,
qps[conn_num]->context->device->name,
sizeof(gpu_qp->dev_name));
gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0;
gpu_qp->qp_num = qps[conn_num]->qp_num;
gpu_qp->lkey = heap_mr->lkey;
@@ -1198,7 +1204,7 @@ void GDABackend::create_qps(int sq_length) {
for (int i = 0; i < qps.size(); i++) {
if (gda_vendor == GDAVendor::IONIC) {
attr.pd = pd_uxdma[((i + 1) / 2) & 1];
attr.pd = pd_uxdma[i & 1];
}
attr.send_cq = cqs[i];
attr.recv_cq = cqs[i];
+1 -1
Näytä tiedosto
@@ -97,7 +97,7 @@ class GDABackend : public Backend {
struct ibv_port_attr portinfo;
union ibv_gid gid;
int port = 1;
int gid_index;
int gid_index = 0;
uint32_t *heap_rkey = nullptr;
struct ibv_mr *heap_mr = nullptr;
@@ -32,8 +32,4 @@ extern "C" {
}
#endif
#define SPIN_LOCK_INVALID 0xdead
#define SPIN_LOCK_UNLOCKED 0x1234
#define SPIN_LOCK_LOCKED 0xabcd
#endif //LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_
+60 -57
Näytä tiedosto
@@ -29,7 +29,6 @@
#include "backend_gda.hpp"
#include "endian.hpp"
#include "segment_builder.hpp"
#include "util.hpp"
#include "constants.hpp"
namespace rocshmem {
@@ -69,6 +68,7 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) {
/* Set Correct opcodes for each NIC */
#if defined(GDA_IONIC)
gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE;
gda_op_rdma_read = IONIC_V2_OP_RDMA_READ;
gda_op_atomic_fa = IONIC_V2_OP_ATOMIC_FA;
gda_op_atomic_cs = IONIC_V2_OP_ATOMIC_CS;
#endif
@@ -122,25 +122,6 @@ __device__ uint64_t QueuePair::get_same_qp_lane_mask() {
return lane_mask;
}
/*
 * Make a single, non-blocking attempt to take cq_lock on behalf of all lanes
 * in activemask.  Only the first active lane touches the lock word; the value
 * it observed is then shared with the other lanes so that every lane returns
 * the same answer.
 *
 * Returns true if the lock was changed from unlocked to locked.
 */
__device__ bool QueuePair::cq_lock_try_acquire(uint64_t activemask) {
  /* Lanes that do not attempt the exchange keep the invalid placeholder. */
  uint32_t observed = SPIN_LOCK_INVALID;

  if (is_first_active_lane(activemask)) {
    /* On failure the compare-exchange overwrites observed with the value found. */
    observed = SPIN_LOCK_UNLOCKED;
    __hip_atomic_compare_exchange_strong(&cq_lock, &observed, SPIN_LOCK_LOCKED,
                                         __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
                                         __HIP_MEMORY_SCOPE_AGENT);
  }

  /* Every lane takes the value seen by the first active lane. */
  observed = __shfl(observed, get_first_active_lane_id(activemask));

  return SPIN_LOCK_UNLOCKED == observed;
}
/*
 * Release cq_lock on behalf of all lanes in activemask.  Only the first
 * active lane writes the lock word.
 */
__device__ void QueuePair::cq_lock_release(uint64_t activemask) {
  /* Lanes other than the first active one have nothing to do. */
  if (!is_first_active_lane(activemask)) {
    return;
  }

  __hip_atomic_store(&cq_lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE,
                     __HIP_MEMORY_SCOPE_AGENT);
}
__device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) {
uint32_t my_sq_prod = 0;
@@ -156,22 +137,19 @@ __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes
return my_sq_prod;
}
__device__ uint32_t QueuePair::commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe) {
__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) {
uint32_t dbprod = my_sq_prod + num_wqes;
if (last) {
// signal last wqe before the doorbell
wqe->base.flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
spin_lock_acquire_shared(&sq_lock, activemask);
while (__hip_atomic_load(&sq_dbprod, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT) != my_sq_prod) {
// spin
}
if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) {
sq_dbprod = dbprod;
ionic_ring_doorbell(dbprod);
__hip_atomic_exchange(&sq_dbprod, dbprod, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
}
spin_lock_release_shared(&sq_lock, activemask);
return dbprod;
}
@@ -180,7 +158,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
uint32_t my_cq_pos = cq_pos + my_logical_lane_id;
/* Look at the cqe at the current position in the cq buffer */
struct ionic_v1_cqe *cqe = &cq_buf[my_cq_pos & cq_mask];
struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask];
/* Determine expected color based on cq wrap count */
uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
@@ -189,18 +167,9 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
qtf_color_exp = 0;
}
/* Wait for at least one thread cqe color == expected color */
uint32_t qtf_be;
bool ready;
uint64_t ballot_ready;
do {
qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
ready = (qtf_be & qtf_color_bit) == qtf_color_exp;
ballot_ready = __ballot(ready);
} while (!ballot_ready);
/* Other threads saw a ready cqe, but not this thread */
if (!ready) {
/* Check if my cqe color == expected color */
uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags);
if ((qtf_be & qtf_color_bit) != qtf_color_exp) {
return;
}
@@ -214,10 +183,10 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
uint32_t flag = qtf & 0xf;
uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
uint64_t npg = swap_endian_val<uint64_t>(cqe->send.npg_wqe_id);
uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK;
printf("QUIET ERROR: qid %u type %u flag %#x status %u msn %u npg %lu\n",
qid, type, flag, status, msn, npg);
printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n",
dev_name, qid, type, flag, status, msn, npg);
#endif
/* No other way to signal an error, so just crash. */
abort();
@@ -226,7 +195,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
/* Only proceed with the furthest ahead cqe to update the sq state */
uint64_t my_lane_mask = 1ull << __lane_id();
uint64_t lesser_lane_mask = my_lane_mask - 1;
if (my_lane_mask != (ballot_ready & ~lesser_lane_mask)) {
if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) {
return;
}
@@ -247,19 +216,33 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
}
__device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) {
uint32_t greed = 10;
/* wait for sq_msn to catch up or pass cons. */
/* 0x800000 - sign bit for 24-bit fields */
while ((sq_msn - cons) & 0x800000) {
if (!cq_lock_try_acquire(activemask)) {
if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) {
continue;
}
/* with lock acquired, this wave polls cqes until caught up */
while ((sq_msn - cons) & 0x800000) {
uint32_t old_sq_msn = sq_msn;
poll_wave_cqes(activemask);
if (!((sq_msn - cons) & 0x800000)) {
if (sq_msn == old_sq_msn) {
break;
}
if (!greed) {
break;
}
--greed;
}
}
cq_lock_release(activemask);
spin_lock_release_shared(&cq_lock, activemask);
break;
}
}
@@ -452,17 +435,25 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask];
struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
uint16_t wqe_flags = 0;
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
}
// TODO why is this needed?
if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) {
size = 1;
}
wqe->base.wqe_id = my_sq_pos;
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = size ? 1 : 0;
wqe->base.flags = swap_endian_val<uint16_t>(0);
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
@@ -472,7 +463,7 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
if (size) {
if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
wqe->base.flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
wqe->base.num_sge_key = 0;
if (!laddr) {
// TODO why is this needed?
@@ -487,7 +478,9 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l
}
}
commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe);
__hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
}
#endif
@@ -563,7 +556,8 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask);
uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask];
struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
uint16_t wqe_flags = 0;
uint32_t cons;
uint64_t* wave_fetch_atomic{nullptr};
@@ -578,10 +572,17 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id);
}
wqe->base.wqe_id = my_sq_pos;
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
}
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = 1;
wqe->base.flags = swap_endian_val<uint16_t>(0);
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(reinterpret_cast<uint64_t>(raddr) >> 32);
@@ -600,7 +601,9 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
}
cons = commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe);
__hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes);
uint64_t ret{0};
if (fetching) {
+6 -6
Näytä tiedosto
@@ -37,6 +37,7 @@
#include "rocshmem_config.h"
#include "endian.h"
#include "constants.hpp"
#include "util.hpp"
#include "gda/ionic/provider_gda_ionic.hpp"
#include "gda/mlx5/provider_gda_mlx5.hpp"
@@ -191,9 +192,6 @@ class QueuePair {
#ifdef GDA_IONIC
__device__ uint64_t get_same_qp_lane_mask();
__device__ bool cq_lock_try_acquire(uint64_t active_lane_mask);
__device__ void cq_lock_release(uint64_t active_lane_mask);
/**
* @brief Reserve space in the sq to post this many wqes.
* @param my_tid my logical thread id.
@@ -210,7 +208,7 @@ class QueuePair {
* @param wqe this thread's wqe.
* @return doorbell producer index.
*/
__device__ uint32_t commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe);
__device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes);
/**
* @brief Helper method to poll the next completion queue entry.
@@ -226,7 +224,7 @@ class QueuePair {
uint64_t *cq_dbreg{nullptr};
uint64_t cq_dbval{0};
uint64_t cq_mask{0};
struct ionic_v1_cqe *cq_buf{nullptr};
struct ionic_v1_cqe *ionic_cq_buf{nullptr};
uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
uint32_t cq_pos{0};
uint32_t cq_dbpos{0};
@@ -234,7 +232,8 @@ class QueuePair {
uint64_t *sq_dbreg{nullptr};
uint64_t sq_dbval{0};
uint64_t sq_mask{0};
struct ionic_v1_wqe *sq_buf{nullptr};
struct ionic_v1_wqe *ionic_sq_buf{nullptr};
uint32_t sq_lock{SPIN_LOCK_UNLOCKED};
uint32_t sq_dbprod{0};
uint32_t sq_prod{0};
uint32_t sq_msn{0};
@@ -325,6 +324,7 @@ class QueuePair {
uint32_t inline_threshold{0};
char dev_name[24];
uint32_t qp_num{0};
uint32_t rkey{0};
uint32_t lkey{0};
+70
Näytä tiedosto
@@ -265,6 +265,76 @@ __device__ __forceinline__ bool is_last_active_lane() {
return is_last_active_lane(get_active_lane_mask());
}
/*
 * Values used with the 32-bit spin lock words handled by the helpers below.
 */
/* Placeholder for a lane's local copy when that lane did not attempt the compare-exchange. */
#define SPIN_LOCK_INVALID 0xdead
/* Lock word value while the lock is free; also the value stored on release. */
#define SPIN_LOCK_UNLOCKED 0x1234
/* Lock word value while the lock is held. */
#define SPIN_LOCK_LOCKED 0xabcd
/*
 * Single, non-blocking attempt to take a lock.
 * Each thread in the wave tries to acquire a different lock.
 *
 * Returns true if this thread changed the lock from unlocked to locked.
 */
__device__ __forceinline__ bool spin_lock_try_acquire_unique(uint32_t *lock) {
  /* On failure the compare-exchange overwrites seen with the value found. */
  uint32_t seen = SPIN_LOCK_UNLOCKED;

  __hip_atomic_compare_exchange_strong(lock, &seen, SPIN_LOCK_LOCKED,
                                       __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
                                       __HIP_MEMORY_SCOPE_AGENT);

  const bool acquired = (SPIN_LOCK_UNLOCKED == seen);
  return acquired;
}
/*
 * Blocking acquire: retry until the lock is taken.
 * Each thread in the wave acquires a different lock.
 * (deadlock if locks are not different)
 */
__device__ __forceinline__ void spin_lock_acquire_unique(uint32_t *lock) {
  bool acquired;
  do {
    /* keep retrying until the attempt succeeds */
    acquired = spin_lock_try_acquire_unique(lock);
  } while (!acquired);
}
/*
 * Mark the lock as unlocked again, using a release-ordered store.
 * Each thread in the wave releases a different lock.
 */
__device__ __forceinline__ void spin_lock_release_unique(uint32_t *lock) {
  const uint32_t unlocked = SPIN_LOCK_UNLOCKED;

  __hip_atomic_store(lock, unlocked, __ATOMIC_RELEASE,
                     __HIP_MEMORY_SCOPE_AGENT);
}
/*
 * Single, non-blocking attempt to take one lock for a group of threads.
 * Threads in activemask together try to acquire the same lock: only the first
 * active lane touches the lock word, and the value it observed is shared with
 * the other lanes so that all of them return the same answer.
 *
 * Returns true if the lock was changed from unlocked to locked.
 */
__device__ __forceinline__ bool spin_lock_try_acquire_shared(uint32_t *lock, uint64_t activemask) {
  /* Lanes that do not attempt the exchange keep the invalid placeholder. */
  uint32_t seen = SPIN_LOCK_INVALID;

  if (is_first_active_lane(activemask)) {
    /* On failure the compare-exchange overwrites seen with the value found. */
    seen = SPIN_LOCK_UNLOCKED;
    __hip_atomic_compare_exchange_strong(lock, &seen, SPIN_LOCK_LOCKED,
                                         __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
                                         __HIP_MEMORY_SCOPE_AGENT);
  }

  /* Every lane takes the value seen by the first active lane. */
  seen = __shfl(seen, get_first_active_lane_id(activemask));

  return SPIN_LOCK_UNLOCKED == seen;
}
/*
 * Blocking acquire: retry until the lock is taken.
 * Threads in activemask together acquire the same lock.
 */
__device__ __forceinline__ void spin_lock_acquire_shared(uint32_t *lock, uint64_t activemask) {
  bool acquired;
  do {
    /* keep retrying until the attempt succeeds */
    acquired = spin_lock_try_acquire_shared(lock, activemask);
  } while (!acquired);
}
/*
 * Mark the lock as unlocked again, using a release-ordered store.
 * Threads in activemask together release the same lock; only the first
 * active lane writes the lock word.
 */
__device__ __forceinline__ void spin_lock_release_shared(uint32_t *lock, uint64_t activemask) {
  /* Lanes other than the first active one have nothing to do. */
  if (!is_first_active_lane(activemask)) {
    return;
  }

  __hip_atomic_store(lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE,
                     __HIP_MEMORY_SCOPE_AGENT);
}
extern __constant__ int* print_lock;