diff --git a/cmake/FindIBVerbs.cmake b/cmake/FindIBVerbs.cmake index c295443a25..258e91687c 100644 --- a/cmake/FindIBVerbs.cmake +++ b/cmake/FindIBVerbs.cmake @@ -43,29 +43,31 @@ find_library(IBVerbs_LIBRARY ) if (GDA_IONIC) -find_library(IBVerbs_PROVIDER_LIBRARY +list(APPEND provider_vars IBVerbs_IONIC_LIBRARY IBVerbs_IONIC_INCLUDE_DIR) +find_path(IBVerbs_IONIC_INCLUDE_DIR infiniband/ionic_dv.h + HINTS ${PC_IBVerbs_INCLUDEDIR} ${PC_IBVerbs_INCLUDE_DIRS} + PATH_SUFFIXES include +) + +find_library(IBVerbs_IONIC_LIBRARY NAMES ionic libionic HINTS ${PC_IBVerbs_LIBDIR} ${PC_IBVerbs_LIBRARY_DIRS} PATH_SUFFIXES lib lib64 ) -find_package_handle_standard_args(IBVerbs DEFAULT_MSG - IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY +add_library(IBVerbs::verbs_ionic UNKNOWN IMPORTED) +set_target_properties(IBVerbs::verbs_ionic PROPERTIES + IMPORTED_LOCATION "${IBVerbs_IONIC_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_IONIC_INCLUDE_DIR}" ) -mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR IBVerbs_PROVIDER_LIBRARY) - -add_library(IBVerbs::verbs_provider UNKNOWN IMPORTED) -set_target_properties(IBVerbs::verbs_provider PROPERTIES - IMPORTED_LOCATION "${IBVerbs_PROVIDER_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_PROVIDER_INCLUDE_DIR}" -) -target_link_libraries(IBVerbs::verbs IBVerbs::verbs_provider) endif() find_package_handle_standard_args(IBVerbs DEFAULT_MSG - IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR + IBVerbs_LIBRARY + IBVerbs_INCLUDE_DIR + ${provider_vars} ) -mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR) +mark_as_advanced(IBVerbs_LIBRARY IBVerbs_INCLUDE_DIR ${provider_vars}) if (IBVerbs_FOUND) add_library(IBVerbs::verbs UNKNOWN IMPORTED) @@ -75,6 +77,8 @@ set_target_properties(IBVerbs::verbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVerbs_INCLUDE_DIR}" ) -target_link_libraries(IBVerbs::verbs INTERFACE) +target_link_libraries(IBVerbs::verbs INTERFACE + $ +) endif() diff --git a/src/envvar.cpp b/src/envvar.cpp index 158b45ca06..bdaa53fbe9 100644 --- a/src/envvar.cpp +++ b/src/envvar.cpp @@ -63,6 +63,7 @@ namespace envvar { namespace gda { const var alternate_qp_ports("ALTERNATE_QP_PORTS", "", true); + const var traffic_class("TRAFFIC_CLASS", "", 0); } // namespace gda namespace _detail { diff --git a/src/envvar.hpp b/src/envvar.hpp index b22748e4de..14b587ace3 100644 --- a/src/envvar.hpp +++ b/src/envvar.hpp @@ -106,9 +106,12 @@ namespace envvar { }; using var_types = unique_type_sequence_t using var = var; extern const var alternate_qp_ports; + extern const var traffic_class; } // namespace gda } // namespace envvar } // namespace rocshmem diff --git a/src/gda/backend_gda.cpp b/src/gda/backend_gda.cpp index 979dd8daa0..6565fd7b66 100644 --- a/src/gda/backend_gda.cpp +++ b/src/gda/backend_gda.cpp @@ -841,6 +841,7 @@ void GDABackend::modify_qps_init_to_rtr() { attr.ah_attr.is_global = 1; attr.ah_attr.grh.hop_limit = 1; attr.ah_attr.sl = 1; + attr.ah_attr.grh.traffic_class = envvar::gda::traffic_class; } attr_mask = IBV_QP_STATE @@ -882,9 +883,9 @@ void GDABackend::modify_qps_rtr_to_rts() { attr.rnr_retry = 7; if (gda_vendor == GDAVendor::IONIC) { - attr.max_dest_rd_atomic = 15; + attr.max_rd_atomic = 15; } else { - attr.max_dest_rd_atomic = 1; + attr.max_rd_atomic = 1; } attr_mask = IBV_QP_STATE @@ -1053,7 +1054,7 @@ void GDABackend::create_cqs(int cqe) { for (int i = 0; i < qps.size(); i++) { if (gda_vendor == GDAVendor::IONIC) { - cq_attr.parent_domain = pd_uxdma[((i + 1) / 2) & 1]; + cq_attr.parent_domain = pd_uxdma[i & 1]; } cq_ex = ibv_create_cq_ex(context, &cq_attr); @@ -1093,7 +1094,7 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { gpu_qp->cq_dbval = dvcq.q.db_val; gpu_qp->cq_mask = dvcq.q.mask; - gpu_qp->cq_buf = reinterpret_cast(dvcq.q.ptr); + gpu_qp->ionic_cq_buf = reinterpret_cast(dvcq.q.ptr); ionic_dv_qp dvqp; ionic_dv_get_qp(&dvqp, qps[conn_num]); @@ -1101,7 +1102,12 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) { gpu_qp->sq_dbreg = gpu_db_sq; gpu_qp->sq_dbval = dvqp.sq.db_val; gpu_qp->sq_mask = dvqp.sq.mask; - gpu_qp->sq_buf = reinterpret_cast(dvqp.sq.ptr); + gpu_qp->ionic_sq_buf = reinterpret_cast(dvqp.sq.ptr); + + strncpy(gpu_qp->dev_name, + qps[conn_num]->context->device->name, + sizeof(gpu_qp->dev_name)); + gpu_qp->dev_name[sizeof(gpu_qp->dev_name) - 1] = 0; gpu_qp->qp_num = qps[conn_num]->qp_num; gpu_qp->lkey = heap_mr->lkey; @@ -1198,7 +1204,7 @@ void GDABackend::create_qps(int sq_length) { for (int i = 0; i < qps.size(); i++) { if (gda_vendor == GDAVendor::IONIC) { - attr.pd = pd_uxdma[((i + 1) / 2) & 1]; + attr.pd = pd_uxdma[i & 1]; } attr.send_cq = cqs[i]; attr.recv_cq = cqs[i]; diff --git a/src/gda/backend_gda.hpp b/src/gda/backend_gda.hpp index d2fe108cf3..74cba5974d 100644 --- a/src/gda/backend_gda.hpp +++ b/src/gda/backend_gda.hpp @@ -97,7 +97,7 @@ class GDABackend : public Backend { struct ibv_port_attr portinfo; union ibv_gid gid; int port = 1; - int gid_index; + int gid_index = 0; uint32_t *heap_rkey = nullptr; struct ibv_mr *heap_mr = nullptr; diff --git a/src/gda/ionic/provider_gda_ionic.hpp b/src/gda/ionic/provider_gda_ionic.hpp index 73397a9464..adb21c9a2d 100644 --- a/src/gda/ionic/provider_gda_ionic.hpp +++ b/src/gda/ionic/provider_gda_ionic.hpp @@ -32,8 +32,4 @@ extern "C" { } #endif -#define SPIN_LOCK_INVALID 0xdead -#define SPIN_LOCK_UNLOCKED 0x1234 -#define SPIN_LOCK_LOCKED 0xabcd - #endif //LIBRARY_SRC_GDA_IONIC_GDA_PROVIDER_HPP_ diff --git a/src/gda/queue_pair.cpp b/src/gda/queue_pair.cpp index 0d60b3c544..9fed08998f 100644 --- a/src/gda/queue_pair.cpp +++ b/src/gda/queue_pair.cpp @@ -29,7 +29,6 @@ #include "backend_gda.hpp" #include "endian.hpp" #include "segment_builder.hpp" -#include "util.hpp" #include "constants.hpp" namespace rocshmem { @@ -69,6 +68,7 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_vendor) { /* Set Correct opcodes for each NIC */ #if defined(GDA_IONIC) gda_op_rdma_write = IONIC_V2_OP_RDMA_WRITE; + gda_op_rdma_read = IONIC_V2_OP_RDMA_READ; gda_op_atomic_fa = IONIC_V2_OP_ATOMIC_FA; gda_op_atomic_cs = IONIC_V2_OP_ATOMIC_CS; #endif @@ -122,25 +122,6 @@ __device__ uint64_t QueuePair::get_same_qp_lane_mask() { return lane_mask; } -__device__ bool QueuePair::cq_lock_try_acquire(uint64_t activemask) { - uint32_t cq_lock_val = SPIN_LOCK_INVALID; - - if (is_first_active_lane(activemask)) { - cq_lock_val = SPIN_LOCK_UNLOCKED; - __hip_atomic_compare_exchange_strong(&cq_lock, &cq_lock_val, SPIN_LOCK_LOCKED, - __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT); - } - cq_lock_val = __shfl(cq_lock_val, get_first_active_lane_id(activemask)); - - return (cq_lock_val == SPIN_LOCK_UNLOCKED); -} - -__device__ void QueuePair::cq_lock_release(uint64_t activemask) { - if (is_first_active_lane(activemask)) { - __hip_atomic_store(&cq_lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); - } -} - __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) { uint32_t my_sq_prod = 0; @@ -156,22 +137,19 @@ __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes return my_sq_prod; } -__device__ uint32_t QueuePair::commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe) { +__device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) { uint32_t dbprod = my_sq_prod + num_wqes; - if (last) { - // signal last wqe before the doorbell - wqe->base.flags |= swap_endian_val(IONIC_V1_FLAG_SIG); + spin_lock_acquire_shared(&sq_lock, activemask); - while (__hip_atomic_load(&sq_dbprod, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT) != my_sq_prod) { - // spin - } + if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) { + sq_dbprod = dbprod; ionic_ring_doorbell(dbprod); - - __hip_atomic_exchange(&sq_dbprod, dbprod, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); } + spin_lock_release_shared(&sq_lock, activemask); + return dbprod; } @@ -180,7 +158,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { uint32_t my_cq_pos = cq_pos + my_logical_lane_id; /* Look at the cqe at the current position in the cq buffer */ - struct ionic_v1_cqe *cqe = &cq_buf[my_cq_pos & cq_mask]; + struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask]; /* Determine expected color based on cq wrap count */ uint32_t qtf_color_bit = swap_endian_val(IONIC_V1_CQE_COLOR); @@ -189,18 +167,9 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { qtf_color_exp = 0; } - /* Wait for at least one thread cqe color == expected color */ - uint32_t qtf_be; - bool ready; - uint64_t ballot_ready; - do { - qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags); - ready = (qtf_be & qtf_color_bit) == qtf_color_exp; - ballot_ready = __ballot(ready); - } while (!ballot_ready); - - /* Other threads saw a ready cqe, but not this thread */ - if (!ready) { + /* Check if my cqe color == expected color */ + uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags); + if ((qtf_be & qtf_color_bit) != qtf_color_exp) { return; } @@ -214,10 +183,10 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; uint32_t flag = qtf & 0xf; uint32_t status = swap_endian_val(cqe->status_length); - uint64_t npg = swap_endian_val(cqe->send.npg_wqe_id); + uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK; - printf("QUIET ERROR: qid %u type %u flag %#x status %u msn %u npg %lu\n", - qid, type, flag, status, msn, npg); + printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n", + dev_name, qid, type, flag, status, msn, npg); #endif /* No other way to signal an error, so just crash. */ abort(); @@ -226,7 +195,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { /* Only proceed with the furthest ahead cqe to update the sq state */ uint64_t my_lane_mask = 1ull << __lane_id(); uint64_t lesser_lane_mask = my_lane_mask - 1; - if (my_lane_mask != (ballot_ready & ~lesser_lane_mask)) { + if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) { return; } @@ -247,19 +216,33 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { } __device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) { + uint32_t greed = 10; + /* wait for sq_msn to catch up or pass cons. */ /* 0x800000 - sign bit for 24-bit fields */ while ((sq_msn - cons) & 0x800000) { - if (!cq_lock_try_acquire(activemask)) { + if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) { continue; } /* with lock acquired, this wave polls cqes until caught up */ while ((sq_msn - cons) & 0x800000) { + uint32_t old_sq_msn = sq_msn; + poll_wave_cqes(activemask); + + if (!((sq_msn - cons) & 0x800000)) { + if (sq_msn == old_sq_msn) { + break; + } + if (!greed) { + break; + } + --greed; + } } - cq_lock_release(activemask); + spin_lock_release_shared(&cq_lock, activemask); break; } } @@ -452,17 +435,25 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l uint32_t my_logical_lane_id = get_active_lane_num(activemask); uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; - struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask]; + struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; + uint16_t wqe_flags = 0; + + if (!(my_sq_pos & (sq_mask + 1))) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); + } + + if (is_last_active_lane(activemask)) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); + } // TODO why is this needed? if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) { size = 1; } - wqe->base.wqe_id = my_sq_pos; + wqe->base.wqe_idx = my_sq_pos; wqe->base.op = opcode; wqe->base.num_sge_key = size ? 1 : 0; - wqe->base.flags = swap_endian_val(0); wqe->base.imm_data_key = swap_endian_val(0); wqe->common.rdma.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); @@ -472,7 +463,7 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l if (size) { if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) { - wqe->base.flags |= swap_endian_val(IONIC_V1_FLAG_INL); + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_INL); wqe->base.num_sge_key = 0; if (!laddr) { // TODO why is this needed? @@ -487,7 +478,9 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *l } } - commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe); + __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); + + commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); } #endif @@ -563,7 +556,8 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_ const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; - struct ionic_v1_wqe *wqe = &sq_buf[my_sq_pos & sq_mask]; + struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; + uint16_t wqe_flags = 0; uint32_t cons; uint64_t* wave_fetch_atomic{nullptr}; @@ -578,10 +572,17 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_ wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); } - wqe->base.wqe_id = my_sq_pos; + if (!(my_sq_pos & (sq_mask + 1))) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); + } + + if (is_last_active_lane(activemask)) { + wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); + } + + wqe->base.wqe_idx = my_sq_pos; wqe->base.op = opcode; wqe->base.num_sge_key = 1; - wqe->base.flags = swap_endian_val(0); wqe->base.imm_data_key = swap_endian_val(0); wqe->atomic_v2.remote_va_high = swap_endian_val(reinterpret_cast(raddr) >> 32); @@ -600,7 +601,9 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_ wqe->atomic_v2.lkey = swap_endian_val(nonfetching_atomic_lkey); } - cons = commit_sq(is_last_active_lane(activemask), my_sq_prod, num_wqes, wqe); + __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); + + cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); uint64_t ret{0}; if (fetching) { diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp index e0987ea133..1e0d79fda1 100644 --- a/src/gda/queue_pair.hpp +++ b/src/gda/queue_pair.hpp @@ -37,6 +37,7 @@ #include "rocshmem_config.h" #include "endian.h" #include "constants.hpp" +#include "util.hpp" #include "gda/ionic/provider_gda_ionic.hpp" #include "gda/mlx5/provider_gda_mlx5.hpp" @@ -191,9 +192,6 @@ class QueuePair { #ifdef GDA_IONIC __device__ uint64_t get_same_qp_lane_mask(); - __device__ bool cq_lock_try_acquire(uint64_t active_lane_mask); - __device__ void cq_lock_release(uint64_t active_lane_mask); - /** * @brief Reserve space in the sq to post this many wqes. * @param my_tid my logical thread id. @@ -210,7 +208,7 @@ class QueuePair { * @param wqe this thread's wqe. * @return doorbell producer index. */ - __device__ uint32_t commit_sq(bool last, uint32_t my_sq_prod, uint32_t num_wqes, struct ionic_v1_wqe *wqe); + __device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes); /** * @brief Helper method to poll the next completion queue entry. @@ -226,7 +224,7 @@ class QueuePair { uint64_t *cq_dbreg{nullptr}; uint64_t cq_dbval{0}; uint64_t cq_mask{0}; - struct ionic_v1_cqe *cq_buf{nullptr}; + struct ionic_v1_cqe *ionic_cq_buf{nullptr}; uint32_t cq_lock{SPIN_LOCK_UNLOCKED}; uint32_t cq_pos{0}; uint32_t cq_dbpos{0}; @@ -234,7 +232,8 @@ class QueuePair { uint64_t *sq_dbreg{nullptr}; uint64_t sq_dbval{0}; uint64_t sq_mask{0}; - struct ionic_v1_wqe *sq_buf{nullptr}; + struct ionic_v1_wqe *ionic_sq_buf{nullptr}; + uint32_t sq_lock{SPIN_LOCK_UNLOCKED}; uint32_t sq_dbprod{0}; uint32_t sq_prod{0}; uint32_t sq_msn{0}; @@ -325,6 +324,7 @@ class QueuePair { uint32_t inline_threshold{0}; + char dev_name[24]; uint32_t qp_num{0}; uint32_t rkey{0}; uint32_t lkey{0}; diff --git a/src/util.hpp b/src/util.hpp index 6942ec895b..ca4191660b 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -265,6 +265,76 @@ __device__ __forceinline__ bool is_last_active_lane() { return is_last_active_lane(get_active_lane_mask()); } +#define SPIN_LOCK_INVALID 0xdead +#define SPIN_LOCK_UNLOCKED 0x1234 +#define SPIN_LOCK_LOCKED 0xabcd + +/* + * Each thread in wave tries to acquire a different lock. + */ +__device__ __forceinline__ bool spin_lock_try_acquire_unique(uint32_t *lock) { + uint32_t lock_val = SPIN_LOCK_UNLOCKED; + + __hip_atomic_compare_exchange_strong(lock, &lock_val, SPIN_LOCK_LOCKED, + __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, + __HIP_MEMORY_SCOPE_AGENT); + + return lock_val == SPIN_LOCK_UNLOCKED; +} + +/* + * Each thread in wave acquires a different lock. + * (deadlock if locks are not different) + */ +__device__ __forceinline__ void spin_lock_acquire_unique(uint32_t *lock) { + while (!spin_lock_try_acquire_unique(lock)) { + // spin + } +} + +/* + * Each thread in wave releases a different lock. + */ +__device__ __forceinline__ void spin_lock_release_unique(uint32_t *lock) { + __hip_atomic_store(lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE, + __HIP_MEMORY_SCOPE_AGENT); +} + +/* + * Threads in activemask together try to acquire the same lock. + */ +__device__ __forceinline__ bool spin_lock_try_acquire_shared(uint32_t *lock, uint64_t activemask) { + uint32_t lock_val = SPIN_LOCK_INVALID; + + if (is_first_active_lane(activemask)) { + lock_val = SPIN_LOCK_UNLOCKED; + __hip_atomic_compare_exchange_strong(lock, &lock_val, SPIN_LOCK_LOCKED, + __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, + __HIP_MEMORY_SCOPE_AGENT); + } + lock_val = __shfl(lock_val, get_first_active_lane_id(activemask)); + + return lock_val == SPIN_LOCK_UNLOCKED; +} + +/* + * Threads in activemask together acquire the same lock. + */ +__device__ __forceinline__ void spin_lock_acquire_shared(uint32_t *lock, uint64_t activemask) { + while (!spin_lock_try_acquire_shared(lock, activemask)) { + // spin + } +} + +/* + * Threads in activemask together release the same lock. + */ +__device__ __forceinline__ void spin_lock_release_shared(uint32_t *lock, uint64_t activemask) { + if (is_first_active_lane(activemask)) { + __hip_atomic_store(lock, SPIN_LOCK_UNLOCKED, __ATOMIC_RELEASE, + __HIP_MEMORY_SCOPE_AGENT); + } +} extern __constant__ int* print_lock;