/****************************************************************************** * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. *****************************************************************************/ #include "gda/queue_pair.hpp" #include "gda/endian.hpp" #include "util.hpp" #include "containers/free_list_impl.hpp" namespace rocshmem { __device__ uint64_t QueuePair::get_same_qp_lane_mask() { uint64_t lane_mask = get_active_lane_mask(); uintptr_t this_val = reinterpret_cast(this); // exclude threads operating on a different qp from this thread lane mask #pragma unroll for (int i = 0; i < 64; ++i) { uint64_t bit_i = 1ull << i; if ((lane_mask & bit_i) && __shfl(this_val, i) != this_val) { lane_mask &= ~bit_i; } } return lane_mask; } __device__ uint32_t QueuePair::reserve_sq(uint64_t activemask, uint32_t num_wqes) { uint32_t my_sq_prod = 0; // reserve space for wqes in sq if (is_first_active_lane(activemask)) { my_sq_prod = __hip_atomic_fetch_add(&sq_prod, num_wqes, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } my_sq_prod = __shfl(my_sq_prod, get_first_active_lane_id(activemask)); // wait for that space to be available ionic_quiet_internal(activemask, my_sq_prod + num_wqes - sq_mask); return my_sq_prod; } __device__ uint32_t QueuePair::commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes) { uint32_t dbprod = my_sq_prod + num_wqes; spin_lock_acquire_shared(&sq_lock, activemask); if (is_first_active_lane(activemask) && ((sq_dbprod - dbprod) & (1u << 31))) { sq_dbprod = dbprod; ionic_ring_doorbell(dbprod); } spin_lock_release_shared(&sq_lock, activemask); return dbprod; } __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { uint32_t my_logical_lane_id = get_active_lane_num(activemask); uint32_t my_cq_pos = cq_pos + my_logical_lane_id; /* Look at the cqe at the current position in the cq buffer */ struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask]; /* Determine expected color based on cq wrap count */ uint32_t qtf_color_bit = swap_endian_val(IONIC_V1_CQE_COLOR); uint32_t qtf_color_exp = qtf_color_bit; if (my_cq_pos & (cq_mask + 1)) { qtf_color_exp = 0; } /* Check if my cqe color == expected color */ uint32_t qtf_be = *(volatile uint32_t *)(&cqe->qid_type_flags); if ((qtf_be & qtf_color_bit) != qtf_color_exp) { return; } uint32_t msn = swap_endian_val(cqe->send.msg_msn); /* Report if the completion indicates an error. */ if (!!(qtf_be & swap_endian_val(IONIC_V1_CQE_ERROR))) { #if defined(DEBUG) uint32_t qtf = swap_endian_val(qtf_be); uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT; uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; uint32_t flag = qtf & 0xf; uint32_t status = swap_endian_val(cqe->status_length); uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK; printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n", dev_name, qid, type, flag, status, msn, npg); #endif /* No other way to signal an error, so just crash. */ abort(); } /* Only proceed with the furthest ahead cqe to update the sq state */ uint64_t my_lane_mask = 1ull << __lane_id(); uint64_t lesser_lane_mask = my_lane_mask - 1; if (my_lane_mask != (__ballot(true) & activemask & ~lesser_lane_mask)) { return; } /* update position in the cq */ cq_pos = my_cq_pos + 1; /* * Ring cq doorbell frequently enough to avoid cq full. * * NB: IONIC_CQ_GRACE is 100 */ if (((cq_pos - cq_dbpos) & cq_mask) >= 100) { cq_dbpos = cq_pos; __atomic_store_n(cq_dbreg, cq_dbval | (cq_mask & cq_dbpos), __ATOMIC_SEQ_CST); //TODO:maybe relaxed? } sq_msn = msn; } __device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t cons) { uint32_t greed = 10; /* wait for sq_msn to catch up or pass cons. */ /* 0x800000 - sign bit for 24-bit fields */ while ((sq_msn - cons) & 0x800000) { if (!spin_lock_try_acquire_shared(&cq_lock, activemask)) { continue; } /* with lock acquired, this wave polls cqes until caught up */ while ((sq_msn - cons) & 0x800000) { uint32_t old_sq_msn = sq_msn; poll_wave_cqes(activemask); if (!((sq_msn - cons) & 0x800000)) { if (sq_msn == old_sq_msn) { break; } if (!greed) { break; } --greed; } } spin_lock_release_shared(&cq_lock, activemask); break; } } __device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) { // When threads write at once to the same address, not all writes reach the bus. // Take turns and insert a thread fence between writes to the same address. for (int i = 0; i < 64; ++i) { if (__lane_id() == i) { __threadfence(); __atomic_store_n(sq_dbreg, sq_dbval | (sq_mask & pos), __ATOMIC_SEQ_CST); } } __threadfence(); } __device__ void QueuePair::ionic_quiet() { ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod); } __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t laddr, uintptr_t raddr, uint8_t opcode, Collectivity cy) { uint64_t activemask = get_same_qp_lane_mask(); uint32_t my_logical_lane_id = get_active_lane_num(activemask); uint32_t num_wqes = 1; if (cy == THREAD) { num_wqes = get_active_lane_count(activemask); } uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); if (cy == WAVE) { if (!is_first_active_lane(activemask)) { return; } activemask &= activemask ^ (activemask - 1); } uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; uint16_t wqe_flags = 0; if (!(my_sq_pos & (sq_mask + 1))) { wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); } if (is_last_active_lane(activemask)) { wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); } // TODO why is this needed? if (size && !laddr && opcode == IONIC_V2_OP_RDMA_WRITE) { size = 1; } wqe->base.wqe_idx = my_sq_pos; wqe->base.op = opcode; wqe->base.num_sge_key = size ? 1 : 0; wqe->base.imm_data_key = swap_endian_val(0); wqe->common.rdma.remote_va_high = swap_endian_val(raddr >> 32); wqe->common.rdma.remote_va_low = swap_endian_val(raddr); wqe->common.rdma.remote_rkey = swap_endian_val(rkey); wqe->common.length = swap_endian_val(size); if (size) { if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) { wqe_flags |= swap_endian_val(IONIC_V1_FLAG_INL); wqe->base.num_sge_key = 0; if (!laddr) { // TODO why is this needed? wqe->common.pld.data[0] = 1; } else { memcpy(wqe->common.pld.data, reinterpret_cast(laddr), size); } } else { wqe->common.pld.sgl[0].va = swap_endian_val(laddr); wqe->common.pld.sgl[0].len = swap_endian_val(size); wqe->common.pld.sgl[0].lkey = swap_endian_val(lkey); } } __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); } __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_t raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetching) { uint64_t activemask = get_same_qp_lane_mask(); uint32_t num_wqes = get_active_lane_count(activemask); uint32_t my_logical_lane_id = get_active_lane_num(activemask); bool is_leader{my_logical_lane_id == 0}; const uint64_t leader_phys_lane_id = get_first_active_lane_id(activemask); uint32_t my_sq_prod = reserve_sq(activemask, num_wqes); uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id; struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask]; uint16_t wqe_flags = 0; uint32_t cons; uint64_t* wave_fetch_atomic{nullptr}; if (fetching) { if (is_leader) { auto res = fetching_atomic_freelist->pop_front(); while (!res.success) { res = fetching_atomic_freelist->pop_front(); } wave_fetch_atomic = res.value; } wave_fetch_atomic = (uint64_t*)__shfl((uint64_t)wave_fetch_atomic, leader_phys_lane_id); } if (!(my_sq_pos & (sq_mask + 1))) { wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); } if (is_last_active_lane(activemask)) { wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); } wqe->base.wqe_idx = my_sq_pos; wqe->base.op = opcode; wqe->base.num_sge_key = 1; wqe->base.imm_data_key = swap_endian_val(0); wqe->atomic_v2.remote_va_high = swap_endian_val(raddr >> 32); wqe->atomic_v2.remote_va_low = swap_endian_val(raddr); wqe->atomic_v2.remote_rkey = swap_endian_val(rkey); wqe->atomic_v2.swap_add_high = swap_endian_val(atomic_data >> 32); wqe->atomic_v2.swap_add_low = swap_endian_val(atomic_data); wqe->atomic_v2.compare_high = swap_endian_val(atomic_cmp >> 32); wqe->atomic_v2.compare_low = swap_endian_val(atomic_cmp); if (fetching) { wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(wave_fetch_atomic + my_logical_lane_id)); wqe->atomic_v2.lkey = swap_endian_val(fetching_atomic_lkey); } else { wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(nonfetching_atomic)); wqe->atomic_v2.lkey = swap_endian_val(nonfetching_atomic_lkey); } __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); cons = commit_sq(activemask, my_sq_prod, my_sq_pos, num_wqes); uint64_t ret{0}; if (fetching) { ionic_quiet_internal(activemask, cons); ret = wave_fetch_atomic[my_logical_lane_id]; __atomic_signal_fence(__ATOMIC_SEQ_CST); if (is_leader) { fetching_atomic_freelist->push_back(wave_fetch_atomic); } } return ret; } } // namespace rocshmem