Files
rocm-systems/src/gda/queue_pair.hpp
T
Allen Hubbe c84bbc250b gda ionic: restore functionality of ionic gda in rocshmem (#269)
* Revamp findibverbs to find ionic again

* gda ionic: rename ionic_sq_buf ionic_cq_buf

Avoid duplicating member names used by mlx5 gda.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda: move spin lock to util.hpp

Move spin lock out of ionic gda to util.hpp.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: assume latest fwabi changes

There is no firmware abi compatibility in this ionic gda code yet, so
assume we are using the latest firmware abi as of now.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: allow doorbell with incomplete wqes

Use spin lock to ensure doorbell is only written with an increasing
producer index.  Ring the doorbell after this wave has initialized its
wqes.  Wqes of other waves might not be fully initialized, but firmware
will not process them until the phase/color flag is updated in the
respective wqes.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: poll cq for additional completions

Keep polling the cq for more than just the minimum number of completions
for this wave of threads to make progress, as long as the cq is not
empty.  A part of wave-optimized cq polling, at the expense of one wave
polling additional completions, it was observed that nearly all other
waves avoid taking the cq lock at all.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda: max_rd_atomic in rts transition

In modify_qp(RTS), specify max_rd_atomic, not max_dest_rd_atomic.

By not specifying max_rd_atomic (rather, max_rd_atomic=zero), the local
nic may get stuck transmitting the first read or atomic request.  One
read or atomic request is greater than the initiator depth of zero.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: allow specifying traffic class

Allow specifying a traffic class.  The network might have a specific
traffic class configured as no-drop, for example.

Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>
Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

* gda ionic: tweak uxdma assignment

The ideal arrangement will have an equal number of QPs active on each
uxdma pipeline.

Pre-rebase, the better arrangement for rocshmem functional test
benchmarks was [0, 1], [1, 0], [0, 1], [1, 0], ...

Now, following changes that add 'ROCSHMEM_GDA_ALTERNATE_QP_PORTS=1' by
default, the better arrangement is [0, 1], [0, 1], [0, 1], [0, 1], ...

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>

---------

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>
Co-authored-by: Aurelien Bouteiller <abouteil@amd.com>
Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>
2025-10-07 10:08:19 -04:00

357 lines
11 KiB
C++

/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_
#define LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_
/**
* @file queue_pair.hpp
*
* @section DESCRIPTION
* An IB QueuePair (SQ and CQ) that the device can use to perform network
* operations. Most important rocSHMEM operations are performed by this
* class.
*/
#include "rocshmem_config.h"
#include "endian.h"
#include "constants.hpp"
#include "util.hpp"
#include "gda/ionic/provider_gda_ionic.hpp"
#include "gda/mlx5/provider_gda_mlx5.hpp"
#include "gda/bnxt/provider_gda_bnxt.hpp"
#include "containers/free_list.hpp"
#include "memory/hip_allocator.hpp"
namespace rocshmem {
// Forward declaration: QueuePair names GDABackend as a friend, so only the
// class name is needed here, not its definition.
class GDABackend;
/**
 * @brief Device-usable queue pair (send queue plus completion queue).
 *
 * The public interface exposes the RMA and atomic operations; the private
 * section holds per-vendor (MLX5, BNXT, IONIC) posting helpers and the
 * queue state they operate on. GDABackend is a friend and may access the
 * private state directly.
 *
 * Every data member carries a default member initializer so that no field
 * holds an indeterminate value if the constructor does not assign it.
 */
class QueuePair {
 public:
  friend GDABackend;

  /**
   * @brief Constructor.
   *
   * @param[in] pd Verbs protection domain handle.
   * @param[in] gda_vendor Vendor selector (presumably recorded in
   *            gda_vendor_ -- confirm against the implementation).
   */
  explicit QueuePair(struct ibv_pd* pd, int gda_vendor);

  /**
   * @brief Destructor.
   */
  virtual ~QueuePair();

  /**
   * @brief Create and enqueue a non-blocking put work queue entry (wqe).
   *
   * @param[in] dest Destination address for data transmission.
   * @param[in] source Source address for data transmission.
   * @param[in] nelems Size in bytes of data transmission.
   * @param[in] pe Destination processing element of data transmission.
   */
  __device__ void put_nbi(void *dest, const void *source, size_t nelems, int pe);

  /**
   * @brief Create and enqueue a non-blocking get work queue entry (wqe).
   *
   * @param[in] dest Destination address for data transmission.
   * @param[in] source Source address for data transmission.
   * @param[in] nelems Size in bytes of data transmission.
   * @param[in] pe Destination processing element of data transmission.
   */
  __device__ void get_nbi(void *dest, const void *source, size_t nelems, int pe);

  /**
   * @brief Empty all completions from the completion queue.
   */
  __device__ void quiet();

  /**
   * @brief Create and enqueue an atomic fetch work queue entry (wqe).
   *
   * @param[in] dest Destination address for data transmission.
   * @param[in] value Data value for the atomic operation.
   * @param[in] cond Used in atomic comparisons.
   * @param[in] pe Destination processing element of data transmission.
   *
   * @return An atomic value
   */
  __device__ int64_t atomic_fetch(void *dest, int64_t value, int64_t cond, int pe);

  /**
   * @brief Create and enqueue a non-fetching atomic work queue entry (wqe).
   *
   * @param[in] dest Destination address for data transmission.
   * @param[in] value Data value for the atomic operation.
   * @param[in] cond Used in atomic comparisons.
   * @param[in] pe Destination processing element of data transmission.
   */
  __device__ void atomic_nofetch(void *dest, int64_t value, int64_t cond, int pe);

  /**
   * @brief Create and enqueue an atomic cas work queue entry (wqe).
   *
   * @param[in] dest Destination address for data transmission.
   * @param[in] atomic_data Data value for the atomic operation.
   * @param[in] atomic_cmp Used in atomic comparisons.
   * @param[in] pe Destination processing element of data transmission.
   *
   * @return An atomic value
   */
  __device__ int64_t atomic_cas(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe);

  /**
   * @brief Create and enqueue a non-fetching atomic cas work queue entry
   * (wqe).
   *
   * @param[in] dest Destination address for data transmission.
   * @param[in] atomic_data Data value for the atomic operation.
   * @param[in] atomic_cmp Used in atomic comparisons.
   * @param[in] pe Destination processing element of data transmission.
   *
   * NOTE(review): unlike atomic_nofetch(), this is declared to return
   * int64_t. The meaning of the returned value is not visible from this
   * header -- confirm against the implementation before relying on it.
   */
  __device__ int64_t atomic_cas_nofetch(void *dest, int64_t atomic_data, int64_t atomic_cmp, int pe);

  char *const *base_heap{nullptr};

 private:
  /**
   * @brief Helper method to build atomic work requests for the send queue.
   *
   * @param[in] pe Destination processing element of data transmission.
   * @param[in] size Size in bytes of data transmission.
   * @param[in] raddr Remote address.
   * @param[in] opcode Operation to be performed.
   * @param[in] atomic_data An atomic data value to be used.
   * @param[in] atomic_cmp An atomic comparison operation to be performed.
   * @param[in] fetch True if the operation returns a value.
   */
  __device__ __attribute__((noinline)) uint64_t post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);

  /**
   * @brief Helper method to build RMA work requests for the send queue.
   *
   * @param[in] pe Destination processing element of data transmission.
   * @param[in] size Size in bytes of data transmission.
   * @param[in] laddr Local address.
   * @param[in] raddr Remote address.
   * @param[in] opcode Operation to be performed.
   */
  __device__ __attribute__((noinline)) void post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);

  // Vendor-specific counterparts of post_wqe_amo() / post_wqe_rma(); each
  // pair takes the same parameters as the generic helper it mirrors.
#if defined(GDA_MLX5)
  __device__ uint64_t mlx5_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
  __device__ void mlx5_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
#endif
#if defined(GDA_BNXT)
  __device__ uint64_t bnxt_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
  __device__ void bnxt_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
#endif
#if defined(GDA_IONIC)
  __device__ uint64_t ionic_post_wqe_amo(int pe, int32_t size, uintptr_t *raddr, uint8_t opcode, int64_t atomic_data, int64_t atomic_cmp, bool fetch);
  __device__ void ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode);
#endif

  /**
   * @brief Helper methods to ring the doorbell, one set per vendor.
   *
   * For the MLX5 variant:
   * @param[in] db_val Doorbell value is written by method.
   *
   * The remaining parameters (my_sq_counter, slot_idx, pos) are not
   * described in this header; see the vendor implementations.
   */
#if defined(GDA_MLX5)
  __device__ void mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter);
#endif
#if defined(GDA_BNXT)
  __device__ void ring_sq_doorbell(uint32_t slot_idx);
  __device__ void ring_cq_doorbell(uint32_t slot_idx);
#endif
#if defined(GDA_IONIC)
  __device__ void ionic_ring_doorbell(uint32_t pos);
#endif

#if defined(GDA_IONIC)
  __device__ uint64_t get_same_qp_lane_mask();

  /**
   * @brief Reserve space in the sq to post this many wqes.
   *
   * @param active_lane_mask NOTE(review): the previous documentation
   *        described a "my_tid" logical thread id here; presumably this is
   *        the mask of participating lanes -- confirm against the
   *        implementation.
   * @param num_wqes number of sq wqes to reserve for this wave.
   * @return position of my_tid=0's wqe.
   */
  __device__ uint32_t reserve_sq(uint64_t active_lane_mask, uint32_t num_wqes);

  /**
   * @brief Ring the sq doorbell maintaining order between waves.
   *
   * @param activemask NOTE(review): presumably the mask of participating
   *        lanes -- confirm against the implementation.
   * @param my_sq_prod position of my_tid=0's wqe.
   * @param my_sq_pos NOTE(review): presumably the position of this
   *        thread's own wqe -- confirm against the implementation.
   * @param num_wqes number of sq wqes posted in this wave.
   * @return doorbell producer index.
   */
  __device__ uint32_t commit_sq(uint64_t activemask, uint32_t my_sq_prod, uint32_t my_sq_pos, uint32_t num_wqes);

  /**
   * @brief Helper method to poll the next completion queue entry.
   *
   * @param active_lane_mask NOTE(review): presumably the mask of
   *        participating lanes -- confirm against the implementation.
   */
  __device__ __attribute__((noinline)) void poll_wave_cqes(uint64_t active_lane_mask);

  /**
   * @brief Helper method to drain completion queue entries.
   *
   * @param active_lane_mask NOTE(review): presumably the mask of
   *        participating lanes -- confirm against the implementation.
   * @param cons wait for sq_msn to catch up to this position.
   */
  __device__ __attribute__((noinline)) void ionic_quiet_internal(uint64_t active_lane_mask, uint32_t cons);

  // IONIC completion queue state.
  uint64_t *cq_dbreg{nullptr};
  uint64_t cq_dbval{0};
  uint64_t cq_mask{0};
  struct ionic_v1_cqe *ionic_cq_buf{nullptr};
  uint32_t cq_lock{SPIN_LOCK_UNLOCKED};
  uint32_t cq_pos{0};
  uint32_t cq_dbpos{0};

  // IONIC send queue state.
  uint64_t *sq_dbreg{nullptr};
  uint64_t sq_dbval{0};
  uint64_t sq_mask{0};
  struct ionic_v1_wqe *ionic_sq_buf{nullptr};
  uint32_t sq_lock{SPIN_LOCK_UNLOCKED};
  uint32_t sq_dbprod{0};
  uint32_t sq_prod{0};
  uint32_t sq_msn{0};
#endif

  // Vendor-specific implementations backing quiet().
#if defined(GDA_MLX5)
  __device__ void mlx5_quiet();
#endif
#if defined(GDA_BNXT)
  __device__ void bnxt_quiet();
#endif
#if defined(GDA_IONIC)
  __device__ void ionic_quiet();
#endif

  int gda_vendor_{0};

  /* GDAVendor::BNXT START */
  // Value-initialised so these never hold indeterminate contents.
  uint64_t *dbr{nullptr};
  struct bnxt_device_cq cq{};
  struct bnxt_device_sq sq{};
  __device__ int poll_cq();
  /* GDAVendor::BNXT END */

  /* GDAVendor::MLX5 START */
  db_reg_t db{};
  uint64_t cq_consumer{0};
  uint64_t quiet_posted{0};
  uint64_t quiet_active{0};
  uint64_t quiet_completed{0};

  /*
   * struct mlx5dv_cq {
   *   void *buf;
   *   __be32 *dbrec;
   *   uint32_t cqe_cnt;
   *   uint32_t cqe_size;
   *   void *cq_uar;
   *   uint32_t cqn;
   *   uint64_t comp_mask;
   * };
   */
  mlx5_cqe64 *cq_buf{nullptr};
  volatile uint32_t *cq_dbrec{nullptr};
  uint32_t cq_cnt{0};
  uint32_t cq_log_cnt{0};

  /*
   * struct mlx5dv_qp {
   *   __be32 *dbrec;
   *   struct {
   *     void *buf;
   *     uint32_t wqe_cnt;
   *     uint32_t stride;
   *   } sq;
   *   struct {
   *     void *buf;
   *     uint32_t wqe_cnt;
   *     uint32_t stride;
   *   } rq;
   *   struct {
   *     void *reg;
   *     uint32_t size;
   *   } bf;
   *   uint64_t comp_mask;
   *   off_t uar_mmap_offset;
   *   uint32_t tirn;
   *   uint32_t tisn;
   *   uint32_t rqn;
   *   uint32_t sqn;
   *   uint64_t tir_icm_addr;
   * };
   */
  volatile uint32_t *dbrec{nullptr};
  uint64_t *sq_buf{nullptr};
  uint16_t sq_wqe_cnt{0};
  uint64_t sq_posted{0};
  uint64_t sq_db_touched{0};
  uint64_t sq_sunk{0};
  static constexpr size_t OUTSTANDING_TABLE_SIZE = 65536;
  uint64_t outstanding_wqes[OUTSTANDING_TABLE_SIZE]{0};
  /* GDAVendor::MLX5 END */

  uint32_t inline_threshold{0};
  // Zero-filled so the buffer is a valid empty C string until it is set.
  char dev_name[24]{};
  uint32_t qp_num{0};
  uint32_t rkey{0};
  uint32_t lkey{0};

  uint64_t* nonfetching_atomic{nullptr};
  uint32_t nonfetching_atomic_lkey{0};
  // nullptr until a memory region is assigned, so an unset handle is
  // distinguishable from a live one.
  struct ibv_mr *mr_nonfetching_atomic{nullptr};

  uint64_t* fetching_atomic{nullptr};
  uint32_t fetching_atomic_lkey{0};
  uint32_t fetching_atomic_idx{0};
  struct ibv_mr *mr_fetching_atomic{nullptr};

  static const uint32_t FETCHING_ATOMIC_CNT{1024};
  static_assert(FETCHING_ATOMIC_CNT % WF_SIZE == 0);

  using FreeListT = FreeList<uint64_t*, HIPAllocator>;
  FreeListT* fetching_atomic_freelist{nullptr};
  HIPAllocator allocator{};

  // Opcode values used for each operation class; zero until assigned.
  uint8_t gda_op_rdma_write{0};
  uint8_t gda_op_rdma_read{0};
  uint8_t gda_op_atomic_fa{0};
  uint8_t gda_op_atomic_cs{0};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_GDA_QUEUE_PAIR_HPP_