diff --git a/projects/rocshmem/src/gda/CMakeLists.txt b/projects/rocshmem/src/gda/CMakeLists.txt index ff6cd546d4..f39dcee8b9 100644 --- a/projects/rocshmem/src/gda/CMakeLists.txt +++ b/projects/rocshmem/src/gda/CMakeLists.txt @@ -36,7 +36,6 @@ target_sources( gda_team.cpp queue_pair.cpp numa_wrapper.cpp - endian.cpp topology.cpp ) diff --git a/projects/rocshmem/src/gda/endian.cpp b/projects/rocshmem/src/gda/endian.cpp deleted file mode 100644 index c8baeb18d5..0000000000 --- a/projects/rocshmem/src/gda/endian.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/****************************************************************************** - * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#include "endian.hpp" - -namespace rocshmem { - -template -__device__ void swap_endian_store(T *dst, const T val) { - typedef union U { - T val; - uint8_t bytes[sizeof(T)]; - } union_type; - union_type src; - union_type dst_tmp; - - src.val = val; - std::reverse_copy(src.bytes, src.bytes + sizeof(T), dst_tmp.bytes); - *dst = dst_tmp.val; -} - -template <> -__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) { - uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) | - ((val >> 8) & 0x00FF00FF00FF00FFULL); - - new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) | - ((new_val >> 16) & 0x0000FFFF0000FFFFULL); - - *dst = (new_val << 32) | (new_val >> 32); -} - -template <> -__device__ void swap_endian_store(int64_t *dst, const int64_t val) { - swap_endian_store(reinterpret_cast(dst), (const uint64_t)val); -} - -template <> -__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) { - uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); - - *dst = (new_val << 16) | (new_val >> 16); -} - -template <> -__device__ void swap_endian_store(int32_t *dst, const int32_t val) { - swap_endian_store(reinterpret_cast(dst), (const uint32_t)val); -} - -template <> -__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) { - *dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF); -} - -template <> -__device__ void swap_endian_store(int16_t *dst, const int16_t val) { - swap_endian_store(reinterpret_cast(dst), (const uint16_t)val); -} - -} // namespace rocshmem diff --git a/projects/rocshmem/src/gda/endian.hpp b/projects/rocshmem/src/gda/endian.hpp index 61663cdbcb..ce19b954fe 100644 --- a/projects/rocshmem/src/gda/endian.hpp +++ b/projects/rocshmem/src/gda/endian.hpp @@ -25,36 +25,73 @@ #ifndef LIBRARY_SRC_GDA_ENDIAN_HPP_ #define LIBRARY_SRC_GDA_ENDIAN_HPP_ +#include #include namespace rocshmem { -template -__device__ void swap_endian_store(T *dst, const T val); +// this is essentially std::byteswap from C++23 +template , bool> = true> +constexpr inline __host__ __device__ T byteswap(T val) { + if constexpr (sizeof(T) == 1) { + return val; + } else if constexpr (sizeof(T) == 2) { + return __builtin_bswap16(val); + } else if constexpr (sizeof(T) == 4) { + return __builtin_bswap32(val); + } else if constexpr (sizeof(T) == 8) { + return __builtin_bswap64(val); + } else { + // sizeof(T) to force this to be instantiation-dependent + static_assert(sizeof(T) == 0, "byteswap not implemented for this type"); + } +} -template <> -__device__ void swap_endian_store(uint64_t *dst, const uint64_t val); +namespace endian { + enum class Order { + Big = __ORDER_BIG_ENDIAN__, + Little = __ORDER_LITTLE_ENDIAN__, + Native = __BYTE_ORDER__ + }; -template <> -__device__ void swap_endian_store(int64_t *dst, const int64_t val); + template , bool> = true> + __host__ __device__ constexpr inline T convert(T val) { + if constexpr (To == From) { + return val; + } else { + return byteswap(val); + } + } -template <> -__device__ void swap_endian_store(uint32_t *dst, const uint32_t val); + template + __host__ __device__ constexpr inline T to_native(T val) { + return convert(val); + } -template <> -__device__ void swap_endian_store(int32_t *dst, const int32_t val); + template + __host__ __device__ constexpr inline T from_native(T val) { + return convert(val); + } -template <> -__device__ void swap_endian_store(uint16_t *dst, const uint16_t val); + template + __host__ __device__ constexpr inline T to_be(T val) { + return convert(val); + } -template <> -__device__ void swap_endian_store(int16_t *dst, const int16_t val); + template + __host__ __device__ constexpr inline T from_be(T val) { + return convert(val); + } -template -__device__ T swap_endian_val(const T val) { - T dst; - swap_endian_store(&dst, val); - return dst; + template + __host__ __device__ constexpr inline T to_le(T val) { + return convert(val); + } + + template + __host__ __device__ constexpr inline T from_le(T val) { + return convert(val); + } } } // namespace rocshmem diff --git a/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp b/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp index e5f0caa2a6..9a9cea5b7c 100644 --- a/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp +++ b/projects/rocshmem/src/gda/ionic/queue_pair_ionic.cpp @@ -109,7 +109,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask]; /* Determine expected color based on cq wrap count */ - uint32_t qtf_color_bit = swap_endian_val(IONIC_V1_CQE_COLOR); + uint32_t qtf_color_bit = byteswap(IONIC_V1_CQE_COLOR); uint32_t qtf_color_exp = qtf_color_bit; if (my_cq_pos & (cq_mask + 1)) { qtf_color_exp = 0; @@ -121,16 +121,16 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) { return; } - uint32_t msn = swap_endian_val(cqe->send.msg_msn); + uint32_t msn = byteswap(cqe->send.msg_msn); /* Report if the completion indicates an error. */ - if (!!(qtf_be & swap_endian_val(IONIC_V1_CQE_ERROR))) { + if (!!(qtf_be & byteswap(IONIC_V1_CQE_ERROR))) { #if defined(DEBUG) - uint32_t qtf = swap_endian_val(qtf_be); + uint32_t qtf = byteswap(qtf_be); uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT; uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; uint32_t flag = qtf & 0xf; - uint32_t status = swap_endian_val(cqe->status_length); + uint32_t status = byteswap(cqe->status_length); uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK; printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n", @@ -249,11 +249,11 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t la uint16_t wqe_flags = 0; if (!(my_sq_pos & (sq_mask + 1))) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); + wqe_flags |= byteswap(IONIC_V1_FLAG_COLOR); } if (is_last_active_lane(activemask)) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); + wqe_flags |= byteswap(IONIC_V1_FLAG_SIG); } // TODO why is this needed? @@ -264,16 +264,16 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t la wqe->base.wqe_idx = my_sq_pos; wqe->base.op = opcode; wqe->base.num_sge_key = size ? 1 : 0; - wqe->base.imm_data_key = swap_endian_val(0); + wqe->base.imm_data_key = byteswap(0); - wqe->common.rdma.remote_va_high = swap_endian_val(raddr >> 32); - wqe->common.rdma.remote_va_low = swap_endian_val(raddr); - wqe->common.rdma.remote_rkey = swap_endian_val(rkey); - wqe->common.length = swap_endian_val(size); + wqe->common.rdma.remote_va_high = byteswap(raddr >> 32); + wqe->common.rdma.remote_va_low = byteswap(raddr); + wqe->common.rdma.remote_rkey = byteswap(rkey); + wqe->common.length = byteswap(size); if (size) { if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_INL); + wqe_flags |= byteswap(IONIC_V1_FLAG_INL); wqe->base.num_sge_key = 0; if (!laddr) { // TODO why is this needed? @@ -282,9 +282,9 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t la memcpy(wqe->common.pld.data, reinterpret_cast(laddr), size); } } else { - wqe->common.pld.sgl[0].va = swap_endian_val(laddr); - wqe->common.pld.sgl[0].len = swap_endian_val(size); - wqe->common.pld.sgl[0].lkey = swap_endian_val(lkey); + wqe->common.pld.sgl[0].va = byteswap(laddr); + wqe->common.pld.sgl[0].len = byteswap(size); + wqe->common.pld.sgl[0].lkey = byteswap(lkey); } } @@ -319,32 +319,32 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_ } if (!(my_sq_pos & (sq_mask + 1))) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_COLOR); + wqe_flags |= byteswap(IONIC_V1_FLAG_COLOR); } if (is_last_active_lane(activemask)) { - wqe_flags |= swap_endian_val(IONIC_V1_FLAG_SIG); + wqe_flags |= byteswap(IONIC_V1_FLAG_SIG); } wqe->base.wqe_idx = my_sq_pos; wqe->base.op = opcode; wqe->base.num_sge_key = 1; - wqe->base.imm_data_key = swap_endian_val(0); + wqe->base.imm_data_key = byteswap(0); - wqe->atomic_v2.remote_va_high = swap_endian_val(raddr >> 32); - wqe->atomic_v2.remote_va_low = swap_endian_val(raddr); - wqe->atomic_v2.remote_rkey = swap_endian_val(rkey); - wqe->atomic_v2.swap_add_high = swap_endian_val(atomic_data >> 32); - wqe->atomic_v2.swap_add_low = swap_endian_val(atomic_data); - wqe->atomic_v2.compare_high = swap_endian_val(atomic_cmp >> 32); - wqe->atomic_v2.compare_low = swap_endian_val(atomic_cmp); + wqe->atomic_v2.remote_va_high = byteswap(raddr >> 32); + wqe->atomic_v2.remote_va_low = byteswap(raddr); + wqe->atomic_v2.remote_rkey = byteswap(rkey); + wqe->atomic_v2.swap_add_high = byteswap(atomic_data >> 32); + wqe->atomic_v2.swap_add_low = byteswap(atomic_data); + wqe->atomic_v2.compare_high = byteswap(atomic_cmp >> 32); + wqe->atomic_v2.compare_low = byteswap(atomic_cmp); if (fetching) { - wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(wave_fetch_atomic + my_logical_lane_id)); - wqe->atomic_v2.lkey = swap_endian_val(fetching_atomic_lkey); + wqe->atomic_v2.local_va = byteswap(reinterpret_cast(wave_fetch_atomic + my_logical_lane_id)); + wqe->atomic_v2.lkey = byteswap(fetching_atomic_lkey); } else { - wqe->atomic_v2.local_va = swap_endian_val(reinterpret_cast(nonfetching_atomic)); - wqe->atomic_v2.lkey = swap_endian_val(nonfetching_atomic_lkey); + wqe->atomic_v2.local_va = byteswap(reinterpret_cast(nonfetching_atomic)); + wqe->atomic_v2.lkey = byteswap(nonfetching_atomic_lkey); } __hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT); diff --git a/projects/rocshmem/src/gda/mlx5/queue_pair_mlx5.cpp b/projects/rocshmem/src/gda/mlx5/queue_pair_mlx5.cpp index 9a081f36e8..327fc3e1c7 100644 --- a/projects/rocshmem/src/gda/mlx5/queue_pair_mlx5.cpp +++ b/projects/rocshmem/src/gda/mlx5/queue_pair_mlx5.cpp @@ -31,7 +31,7 @@ namespace rocshmem { __device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) { - swap_endian_store(const_cast(dbrec), (uint32_t)my_sq_counter); + *dbrec = byteswap(my_sq_counter); __atomic_signal_fence(__ATOMIC_SEQ_CST); __hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); @@ -98,8 +98,7 @@ __device__ void QueuePair::mlx5_quiet() { } } - uint16_t wqe_counter; - swap_endian_store(const_cast(&wqe_counter), reinterpret_cast(be_wqe_counter)); + uint16_t wqe_counter = byteswap(be_wqe_counter); uint64_t wqe_id = outstanding_wqes[wqe_counter]; __hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit; @@ -112,7 +111,7 @@ __device__ void QueuePair::mlx5_quiet() { completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } while (completed != wave_cq_consumer); - swap_endian_store(const_cast(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount)); + *cq_dbrec = byteswap(wave_cq_consumer + quiet_amount); __atomic_signal_fence(__ATOMIC_SEQ_CST); uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id]; diff --git a/projects/rocshmem/src/gda/mlx5/segment_builder.cpp b/projects/rocshmem/src/gda/mlx5/segment_builder.cpp index deb0963bb4..ca95f3ae46 100644 --- a/projects/rocshmem/src/gda/mlx5/segment_builder.cpp +++ b/projects/rocshmem/src/gda/mlx5/segment_builder.cpp @@ -72,8 +72,8 @@ __device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) { */ __device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm) { segp->ctrl_seg = {0}; - swap_endian_store(&segp->ctrl_seg.opmod_idx_opcode, ((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode); - swap_endian_store(&segp->ctrl_seg.qpn_ds, qp_num << 8 | ds); + segp->ctrl_seg.opmod_idx_opcode = byteswap(((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode); + segp->ctrl_seg.qpn_ds = byteswap(qp_num << 8 | ds); segp->ctrl_seg.fm_ce_se = fm_ce_se; segp->ctrl_seg.signature = signature; segp->ctrl_seg.imm = imm; @@ -82,7 +82,7 @@ __device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uin __device__ void SegmentBuilder::update_raddr_seg(uint64_t raddr, uint32_t rkey) { segp->raddr_seg = {0}; - swap_endian_store(reinterpret_cast(&segp->raddr_seg.raddr), raddr); + segp->raddr_seg.raddr = byteswap(raddr); segp->raddr_seg.rkey = rkey; segp++; } @@ -102,15 +102,15 @@ __device__ void SegmentBuilder::update_raddr_seg(uint64_t raddr, uint32_t rkey) */ __device__ void SegmentBuilder::update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey) { segp->data_seg = {0}; - swap_endian_store(&segp->data_seg.byte_count, size); + segp->data_seg.byte_count = byteswap(size); segp->data_seg.lkey = lkey; - swap_endian_store(reinterpret_cast(&segp->data_seg.addr), laddr); + segp->data_seg.addr = byteswap(laddr); segp++; } __device__ void SegmentBuilder::update_inl_data_seg(const void* laddr, int32_t size) { // size is masked with 0x3FF because only the first 10 bits of byte_count are valid - swap_endian_store(&segp->inl_data_seg.byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG)); + segp->inl_data_seg.byte_count = byteswap((size & 0x3FF) | MLX5_INLINE_SEG); // + 1 because we start packing the segment with data after the byte_count parameter memcpy(&segp->inl_data_seg + 1, laddr, size); segp++; @@ -118,8 +118,8 @@ __device__ void SegmentBuilder::update_inl_data_seg(const void* laddr, int32_t s __device__ void SegmentBuilder::update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp) { segp->atomic_seg = {0}; - swap_endian_store(reinterpret_cast(&segp->atomic_seg.swap_add), atomic_data); - swap_endian_store(reinterpret_cast(&segp->atomic_seg.compare), atomic_cmp); + segp->atomic_seg.swap_add = byteswap(atomic_data); + segp->atomic_seg.compare = byteswap(atomic_cmp); segp++; }