diff --git a/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp b/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp index c827bdf42f..789d47ec8b 100644 --- a/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp +++ b/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp @@ -30,19 +30,11 @@ namespace rocshmem { __device__ uint64_t QueuePair::get_same_qp_lane_mask() { - uint64_t lane_mask = get_active_lane_mask(); - uintptr_t this_val = reinterpret_cast(this); - - // exclude threads operating on a different qp from this thread lane mask - #pragma unroll - for (int i = 0; i < 64; ++i) { - uint64_t bit_i = 1ull << i; - if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) { - lane_mask &= ~bit_i; - } - } - - return lane_mask; + uint64_t active = get_active_lane_mask(); + uintptr_t this_qp = reinterpret_cast(this); + // Bitmask of lanes in this warp whose value == this_qp + uint64_t same_qp_mask = __match_any_sync(active, this_qp); + return same_qp_mask; } __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) { @@ -173,7 +165,7 @@ __device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t co __device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) { // When threads write at once to the same address, not all writes reach the bus. // Take turns and insert a thread fence between writes to the same address. - for (int i = 0; i < 64; ++i) { + for (int i = 0; i < WF_SIZE; ++i) { if (__lane_id() == i) { __threadfence(); __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST);