rocm-systems/src/gpu_ib/queue_pair.cpp

/******************************************************************************
 * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *****************************************************************************/

#include "queue_pair.hpp"

#include <hip/hip_runtime.h>

#include "rocshmem_config.h"  // NOLINT(build/include_subdir)
#include "backend_ib.hpp"
#include "endian.hpp"
#include "segment_builder.hpp"
#include "../util.hpp"

namespace rocshmem {

QueuePair::QueuePair(GPUIBBackend *backend)
    : hdp_policy(backend->hdp_policy),
      connection_policy(*backend->networkImpl.connection_policy) {
  hdp_rkey = backend->networkImpl.hdp_rkey;
  hdp_address = backend->networkImpl.hdp_address;

  atomic_ret.atomic_lkey = backend->networkImpl.atomic_ret->atomic_lkey;
  atomic_ret.atomic_counter = 0;
}

__device__ QueuePair::~QueuePair() {
  uint64_t start = profiler.startTimer();

  global_qp->sq_counter = sq_counter;
  global_qp->local_sq_cnt = local_sq_cnt;
  global_qp->cq_consumer_counter = cq_consumer_counter;
  global_qp->current_sq = current_sq;
  global_qp->current_cq_q = current_cq_q;
  global_qp->sq_overflow = sq_overflow;
  global_qp->quiet_counter = quiet_counter;
  profiler.endTimer(start, FINALIZE);

  global_qp->profiler.accumulateStats(profiler);

  __syncthreads();
}

__device__ uint8_t QueuePair::get_cq_error_syndrome(mlx5_cqe64 *cqe_entry) {
  mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
  return cqe_err->syndrome;
}

__device__ void QueuePair::ring_doorbell(uint64_t db_val) {
  swap_endian_store(const_cast<uint32_t *>(dbrec_send),
                    reinterpret_cast<uint32_t>(sq_counter));
  STORE(db.ptr, db_val);
  db.uint ^= 256;
}

__device__ void QueuePair::set_completion_flag_on_wqe(int num_wqes) {
  uint64_t *wqe = &current_sq[8 * ((sq_counter - num_wqes) % max_nwqe)];
  uint8_t *wqe_ce = reinterpret_cast<uint8_t *>(wqe) + 11;
  *wqe_ce = 8;
}

template <>
__device__ void QueuePair::update_wqe_ce_single<false>(int num_wqes) {
  if (sq_counter % max_nwqe == (max_nwqe - 2)) {
    set_completion_flag_on_wqe(num_wqes);
    quiet_counter++;
  }
}

template <>
__device__ void QueuePair::update_wqe_ce_single<true>(int num_wqes) {
  set_completion_flag_on_wqe(num_wqes);
  quiet_counter++;
}

template <>
__device__ void QueuePair::update_wqe_ce_thread<false>(int num_wqes) {}

template <>
__device__ void QueuePair::update_wqe_ce_thread<true>(int num_wqes) {
  set_completion_flag_on_wqe(num_wqes);
  atomicAdd(&quiet_counter, 1);
}

__device__ void QueuePair::compute_db_val_opcode(uint64_t *db_val,
                                                 uint16_t dbrec_val,
                                                 uint8_t opcode) {
  uint64_t opcode64 = opcode;
  opcode64 = opcode64 << 24 & 0x000000FFFF000000;

  uint64_t dbrec = dbrec_val << 8;
  dbrec = dbrec & 0x0000000000FFFF00;

  uint64_t val = *db_val;
  val = val & 0xFFFFFFFFFF0000FF;

  *db_val = val | dbrec | opcode64;
}

template <class level>
__device__ void QueuePair::quiet_internal() {
  /*
   * If there are nothing to quiet, just return early.
   */
  uint32_t quiet_val = quiet_counter;
  if (!quiet_val) {
    return;
  }

  profiler.incStat(QUIET_COUNT);
  uint64_t start = profiler.startTimer();

  /*
   * Generate a pointer to the completion queue entry.
   */
  cq_consumer_counter = cq_consumer_counter + quiet_val - 1;
  uint32_t indx = (cq_consumer_counter % cq_size);
  mlx5_cqe64 *cqe_entry = &current_cq_q[indx];

  /*
   * Access the op_own value in the completion queue entry.
   */
  int val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
  uint8_t val_op_own = val_ld;

  /*
   * If the completion queue entry is not valid, wait for it to become so.
   */
  while (!((val_op_own & 0x1) == ((cq_consumer_counter >> cq_log_size) & 1)) ||
         ((val_op_own) >> 4) == 0xF) {
    val_ld = uncached_load_ubyte(&(cqe_entry->op_own));
    val_op_own = val_ld;
  }

  /*
   * Grab the opcode from the op_own field and report if it is an error.
   */
  uint8_t opcode = val_op_own >> 4;
  if (opcode != 0) {
    uint8_t syndrome = get_cq_error_syndrome(cqe_entry);
    mlx5_err_cqe *cqe_err = reinterpret_cast<mlx5_err_cqe *>(cqe_entry);
    GPU_DPRINTF("QUIET ERROR: signature %d opcode_qpn %llx wqe_cnt %llx \n",
                syndrome, cqe_err->s_wqe_opcode_qpn, cqe_err->wqe_counter);
  }

  /*
   * Decrement the quiet count by the amount determined at the beginning
   * of this method.
   *
   * bpotter - There are two areas of concern in this method for me.
   * 1) In multithreaded builds, we may need to make this method a critical
   * section to prevent data races on these variables.
   *
   * 2) Is there a data race in the API if a one remote process calls quiet
   * while another process continues adding events? Is it ever possible for
   * a quiet to complete, but the quiet_counter decrement here is not set
   * to zero?
   */
  level L;
  L.decQuietCounter(&quiet_counter, quiet_val);

  profiler.endTimer(start, POLL_CQ);
  start = profiler.startTimer();

  /*
   * Increment the trailing index counter which tracks our spot in the
   * completion queue.
   */
  cq_consumer_counter++;
  swap_endian_store(const_cast<uint32_t *>(dbrec_cq), cq_consumer_counter);

  profiler.endTimer(start, NEXT_CQ);
}

template <class level>
__device__ void QueuePair::quiet_single() {
  level L;
  L.quiet(this);
}

template <class level>
__device__ void QueuePair::quiet_single_heavy(int pe) {
  level L;
  L.quiet_heavy(this, pe);
}

template <class level, bool cqe>
__device__ void QueuePair::update_posted_wqe_generic(
    int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode,
    int64_t atomic_data, int64_t atomic_cmp, bool ring_db,
    uint64_t atomic_ret_pos, bool zero_byte_rd) {
  uint64_t start = profiler.startTimer();

  level L;
  L.postLock(this, pe);
  uint32_t num_wqes = connection_policy.getNumWqes(opcode);

  // Get the index for my thread's put in the SQ.
  uint64_t my_sq_counter = L.threadAtomicAdd(&sq_counter, num_wqes);
  uint64_t my_sq_index = my_sq_counter % max_nwqe;

  // 16-bit little endian version of the SQ index needed to build the cntrl
  // segment in the WQE.
  uint16_t le_sq_counter;
  uint16_t sq_counter_u16 = my_sq_counter;
  swap_endian_store(&le_sq_counter, sq_counter_u16);

  bool flag = sq_overflow;
  uint32_t lkey_in_stack_frame = lkey;
  uint32_t rkey_in_stack_frame = rkey;
  uint32_t ctrl_qp_sq_in_stack_frame = ctrl_qp_sq;
  uint64_t ctrl_sig_in_stack_frame = ctrl_sig;

  connection_policy.setRkey(&rkey_in_stack_frame, pe);

  if (opcode == MLX5_OPCODE_RDMA_WRITE && !size) {
    rkey_in_stack_frame = hdp_rkey[pe];
    size = 4;
  }

  /*
   * Build out all the segments required for my WQE(s) based on the
   * operation, starting at my_sq_index into the SQ. SegmentBuilder will
   * keep track of placing the segments in the correct location.
   */
  SegmentBuilder seg_build(my_sq_index, current_sq);
  seg_build.update_cntrl_seg(opcode, le_sq_counter, ctrl_qp_sq_in_stack_frame,
                             ctrl_sig_in_stack_frame, &connection_policy,
                             zero_byte_rd);
  seg_build.update_connection_seg(pe, &connection_policy);
  seg_build.update_rdma_seg(raddr, rkey_in_stack_frame);

  if (opcode == MLX5_OPCODE_ATOMIC_FA || opcode == MLX5_OPCODE_ATOMIC_CS) {
    seg_build.update_atomic_data_seg(atomic_data, atomic_cmp);
    size = 8;
    lkey_in_stack_frame = atomic_ret.atomic_lkey;
    laddr = &atomic_ret.atomic_base_ptr[atomic_ret_pos];
  }

  if (size <= inline_threshold && opcode == MLX5_OPCODE_RDMA_WRITE) {
    seg_build.update_inl_data_seg(laddr, size);
  } else {
    seg_build.update_data_seg(laddr, size, lkey_in_stack_frame);
  }

  profiler.incStat(WQE_COUNT);
  profiler.endTimer(start, UPDATE_WQE);
  start = profiler.startTimer();

  L.template finishPost<cqe>(this, ring_db, num_wqes, pe, le_sq_counter,
                             opcode);

  profiler.incStat(DB_COUNT);
  profiler.endTimer(start, RING_SQ_DB);
}

/******************************************************************************
 ****************************** SHMEM INTERFACE *******************************
 *****************************************************************************/
template <class level>
__device__ void QueuePair::put_nbi(void *dest, const void *source,
                                   size_t nelems, int pe, bool db_ring) {
  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);

  update_posted_wqe_generic<level, false>(
      pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
}

template <class level>
__device__ void QueuePair::put_nbi_cqe(void *dest, const void *source,
                                       size_t nelems, int pe, bool db_ring) {
  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);

  update_posted_wqe_generic<level, true>(
      pe, nelems, src, dst, MLX5_OPCODE_RDMA_WRITE, 0, 0, db_ring, 0);
}

template <class level>
__device__ void QueuePair::get_nbi(void *dest, const void *source,
                                   size_t nelems, int pe, bool db_ring) {
  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);

  update_posted_wqe_generic<level, false>(
      pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
}

template <class level>
__device__ void QueuePair::get_nbi_cqe(void *dest, const void *source,
                                       size_t nelems, int pe, bool db_ring) {
  uintptr_t *src = reinterpret_cast<uintptr_t *>(const_cast<void *>(source));
  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);

  update_posted_wqe_generic<level, true>(
      pe, nelems, src, dst, MLX5_OPCODE_RDMA_READ, 0, 0, db_ring, 0);
}

template <class level>
__device__ void QueuePair::zero_b_rd(int pe) {
  uintptr_t *dst = reinterpret_cast<uintptr_t *>(base_heap[pe]);

  update_posted_wqe_generic<level, true>(pe, 0, nullptr, dst,
                                         MLX5_OPCODE_RDMA_READ, 0, 0, true, 0,
                                         true);  // enable 0_byte read op
}

__device__ int64_t QueuePair::atomic_fetch(void *dest, int64_t value,
                                           int64_t cond, int pe, bool db_ring,
                                           uint8_t atomic_op) {
  THREAD TH;
  uint64_t pos = TH.threadAtomicAdd(
      reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
                                             &atomic_ret.atomic_counter));

  pos = pos % max_nb_atomic;

  int64_t *atomic_base_ptr =
      reinterpret_cast<int64_t *>(atomic_ret.atomic_base_ptr);

  int64_t *load_address = &atomic_base_ptr[pos];

  *load_address = -100;

  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);

  update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
                                          atomic_op, value, cond, db_ring, pos);
  quiet_single<THREAD>();

  while (uncached_load(load_address) == -100) {
  }

  int64_t ret = *load_address;

  __threadfence();

  return ret;
}

__device__ void QueuePair::atomic_nofetch(void *dest, int64_t value,
                                          int64_t cond, int pe, bool db_ring,
                                          uint8_t atomic_op) {
  THREAD TH;
  uint64_t pos = TH.threadAtomicAdd(
      reinterpret_cast<unsigned long long *>(/* NOLINT(runtime/int) */
                                             &atomic_ret.atomic_counter));
  pos = pos % max_nb_atomic;
  uintptr_t *dst = reinterpret_cast<uintptr_t *>(dest);

  update_posted_wqe_generic<THREAD, true>(pe, sizeof(int64_t), nullptr, dst,
                                          atomic_op, value, cond, db_ring, pos);

  quiet_single<THREAD>();
}

__device__ void QueuePair::fence(int pe) {
  // TODO(khamidou): should this be replaced by a zero_byte_rd?
  // FIXME: the relaxed ordering requires an intervening read to order
  // prior operations.
  auto remote_hdp_uncast = hdp_address[pe];
  uintptr_t *remote_hdp = reinterpret_cast<uintptr_t *>(remote_hdp_uncast);
  update_posted_wqe_generic<THREAD, true>(
      pe, 0, nullptr, remote_hdp, MLX5_OPCODE_RDMA_WRITE, 0, 0, true, 0);
}

__device__ void QueuePair::waitCQSpace(int num_msgs) {
  // We cannot post more outstanding requests than the completion queue
  // size.  Force a quiet if we are out of space.
  if ((quiet_counter + num_msgs) >= cq_size) {
    GPU_DPRINTF(
        "*** inside post_cq forcing flush: outstanding %d "
        "adding %d cq_size %d\n",
        quiet_counter, num_msgs, cq_size);

    // TODO(khamidou): More targeted flush would be better here.
    quiet_single<THREAD>();
  }
}

__device__ void QueuePair::waitSQSpace(int num_msgs) {
  // We cannot post more outstanding requests than the Send queue
  // size.  Force a quiet if we are out of space.
  local_sq_cnt += num_msgs;
  int div = local_sq_cnt / max_nwqe;

  if (div > 0) {
    GPU_DPRINTF(
        "*** inside waitSQSpace forcing flush to overrun the SQ"
        " sq_counter %d  adding %d quiet_conter %d \n",
        sq_counter, num_msgs, max_nwqe, quiet_counter);

    quiet_single<THREAD>();
    local_sq_cnt = local_sq_cnt % max_nwqe;
  }
}

void QueuePair::setDBval(uint64_t val) { db_val = val; }

#define THREAD_LEVEL_GEN(T)                                                 \
  template __device__ void QueuePair::put_nbi<T>(                           \
      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
  template __device__ void QueuePair::put_nbi_cqe<T>(                       \
      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
  template __device__ void QueuePair::get_nbi<T>(                           \
      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
  template __device__ void QueuePair::get_nbi_cqe<T>(                       \
      void *dest, const void *source, size_t nelems, int pe, bool db_ring); \
  template __device__ void QueuePair::zero_b_rd<T>(int pe);                 \
  template __device__ void QueuePair::quiet_single<T>();                    \
  template __device__ void QueuePair::quiet_single_heavy<T>(int pe);        \
  template __device__ void QueuePair::quiet_internal<T>();

THREAD_LEVEL_GEN(THREAD)
THREAD_LEVEL_GEN(WG)
THREAD_LEVEL_GEN(WAVE)

}  // namespace rocshmem