Replace byteswap interface to align with C++23 std::byteswap (#368)

* byteswap<T> returns by value
* replace hand-rolled implementations with Clang __builtin_bswap<N> intrinsics
* new high-level interface endian::to_be, endian::from_be, etc. to indicate conversion direction
Este commit está contenido en:
Omri Mor
2026-01-15 14:03:01 -07:00
cometido por GitHub
padre 491739c9b4
commit cf8b72a047
Se han modificado 6 ficheros con 97 adiciones y 143 borrados
-1
Ver fichero
@@ -36,7 +36,6 @@ target_sources(
gda_team.cpp
queue_pair.cpp
numa_wrapper.cpp
endian.cpp
topology.cpp
)
-81
Ver fichero
@@ -1,81 +0,0 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "endian.hpp"
namespace rocshmem {
template <typename T>
__device__ void swap_endian_store(T *dst, const T val) {
typedef union U {
T val;
uint8_t bytes[sizeof(T)];
} union_type;
union_type src;
union_type dst_tmp;
src.val = val;
std::reverse_copy(src.bytes, src.bytes + sizeof(T), dst_tmp.bytes);
*dst = dst_tmp.val;
}
template <>
__device__ void swap_endian_store(uint64_t *dst, const uint64_t val) {
uint64_t new_val = ((val << 8) & 0xFF00FF00FF00FF00ULL) |
((val >> 8) & 0x00FF00FF00FF00FFULL);
new_val = ((new_val << 16) & 0xFFFF0000FFFF0000ULL) |
((new_val >> 16) & 0x0000FFFF0000FFFFULL);
*dst = (new_val << 32) | (new_val >> 32);
}
template <>
__device__ void swap_endian_store(int64_t *dst, const int64_t val) {
swap_endian_store(reinterpret_cast<uint64_t*>(dst), (const uint64_t)val);
}
template <>
__device__ void swap_endian_store(uint32_t *dst, const uint32_t val) {
uint32_t new_val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
*dst = (new_val << 16) | (new_val >> 16);
}
template <>
__device__ void swap_endian_store(int32_t *dst, const int32_t val) {
swap_endian_store(reinterpret_cast<uint32_t*>(dst), (const uint32_t)val);
}
template <>
__device__ void swap_endian_store(uint16_t *dst, const uint16_t val) {
*dst = ((val << 8) & 0xFF00) | ((val >> 8) & 0x00FF);
}
template <>
__device__ void swap_endian_store(int16_t *dst, const int16_t val) {
swap_endian_store(reinterpret_cast<uint16_t*>(dst), (const uint16_t)val);
}
} // namespace rocshmem
+56 -19
Ver fichero
@@ -25,36 +25,73 @@
#ifndef LIBRARY_SRC_GDA_ENDIAN_HPP_
#define LIBRARY_SRC_GDA_ENDIAN_HPP_
#include <type_traits>
#include <hip/hip_runtime.h>
namespace rocshmem {
template <typename T>
__device__ void swap_endian_store(T *dst, const T val);
// this is essentially std::byteswap from C++23
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
constexpr inline __host__ __device__ T byteswap(T val) {
if constexpr (sizeof(T) == 1) {
return val;
} else if constexpr (sizeof(T) == 2) {
return __builtin_bswap16(val);
} else if constexpr (sizeof(T) == 4) {
return __builtin_bswap32(val);
} else if constexpr (sizeof(T) == 8) {
return __builtin_bswap64(val);
} else {
// sizeof(T) to force this to be instantiation-dependent
static_assert(sizeof(T) == 0, "byteswap not implemented for this type");
}
}
template <>
__device__ void swap_endian_store(uint64_t *dst, const uint64_t val);
namespace endian {
enum class Order {
Big = __ORDER_BIG_ENDIAN__,
Little = __ORDER_LITTLE_ENDIAN__,
Native = __BYTE_ORDER__
};
template <>
__device__ void swap_endian_store(int64_t *dst, const int64_t val);
template <Order To, Order From, typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
__host__ __device__ constexpr inline T convert(T val) {
if constexpr (To == From) {
return val;
} else {
return byteswap(val);
}
}
template <>
__device__ void swap_endian_store(uint32_t *dst, const uint32_t val);
template <Order From, typename T>
__host__ __device__ constexpr inline T to_native(T val) {
return convert<Order::Native, From, T>(val);
}
template <>
__device__ void swap_endian_store(int32_t *dst, const int32_t val);
template <Order To, typename T>
__host__ __device__ constexpr inline T from_native(T val) {
return convert<To, Order::Native, T>(val);
}
template <>
__device__ void swap_endian_store(uint16_t *dst, const uint16_t val);
template <typename T>
__host__ __device__ constexpr inline T to_be(T val) {
return convert<Order::Big, Order::Native, T>(val);
}
template <>
__device__ void swap_endian_store(int16_t *dst, const int16_t val);
template <typename T>
__host__ __device__ constexpr inline T from_be(T val) {
return convert<Order::Native, Order::Big, T>(val);
}
template <typename T>
__device__ T swap_endian_val(const T val) {
T dst;
swap_endian_store(&dst, val);
return dst;
template <typename T>
__host__ __device__ constexpr inline T to_le(T val) {
return convert<Order::Little, Order::Native, T>(val);
}
template <typename T>
__host__ __device__ constexpr inline T from_le(T val) {
return convert<Order::Native, Order::Little, T>(val);
}
}
} // namespace rocshmem
+30 -30
Ver fichero
@@ -109,7 +109,7 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
struct ionic_v1_cqe *cqe = &ionic_cq_buf[my_cq_pos & cq_mask];
/* Determine expected color based on cq wrap count */
uint32_t qtf_color_bit = swap_endian_val<uint32_t>(IONIC_V1_CQE_COLOR);
uint32_t qtf_color_bit = byteswap<uint32_t>(IONIC_V1_CQE_COLOR);
uint32_t qtf_color_exp = qtf_color_bit;
if (my_cq_pos & (cq_mask + 1)) {
qtf_color_exp = 0;
@@ -121,16 +121,16 @@ __device__ void QueuePair::poll_wave_cqes(uint64_t activemask) {
return;
}
uint32_t msn = swap_endian_val<uint32_t>(cqe->send.msg_msn);
uint32_t msn = byteswap<uint32_t>(cqe->send.msg_msn);
/* Report if the completion indicates an error. */
if (!!(qtf_be & swap_endian_val<uint32_t>(IONIC_V1_CQE_ERROR))) {
if (!!(qtf_be & byteswap<uint32_t>(IONIC_V1_CQE_ERROR))) {
#if defined(DEBUG)
uint32_t qtf = swap_endian_val<uint32_t>(qtf_be);
uint32_t qtf = byteswap<uint32_t>(qtf_be);
uint32_t qid = qtf >> IONIC_V1_CQE_QID_SHIFT;
uint32_t type = (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK;
uint32_t flag = qtf & 0xf;
uint32_t status = swap_endian_val<uint32_t>(cqe->status_length);
uint32_t status = byteswap<uint32_t>(cqe->status_length);
uint64_t npg = cqe->send.npg_wqe_idx_timestamp & IONIC_V1_CQE_WQE_IDX_MASK;
printf("QUIET ERROR: %s qid %u type %u flag %#x status %u msn %u npg %lu\n",
@@ -249,11 +249,11 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t la
uint16_t wqe_flags = 0;
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
wqe_flags |= byteswap<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
wqe_flags |= byteswap<uint16_t>(IONIC_V1_FLAG_SIG);
}
// TODO why is this needed?
@@ -264,16 +264,16 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t la
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = size ? 1 : 0;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->base.imm_data_key = byteswap<uint32_t>(0);
wqe->common.rdma.remote_va_high = swap_endian_val<uint32_t>(raddr >> 32);
wqe->common.rdma.remote_va_low = swap_endian_val<uint32_t>(raddr);
wqe->common.rdma.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->common.length = swap_endian_val<uint32_t>(size);
wqe->common.rdma.remote_va_high = byteswap<uint32_t>(raddr >> 32);
wqe->common.rdma.remote_va_low = byteswap<uint32_t>(raddr);
wqe->common.rdma.remote_rkey = byteswap<uint32_t>(rkey);
wqe->common.length = byteswap<uint32_t>(size);
if (size) {
if (opcode == IONIC_V2_OP_RDMA_WRITE && size <= inline_threshold) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_INL);
wqe_flags |= byteswap<uint16_t>(IONIC_V1_FLAG_INL);
wqe->base.num_sge_key = 0;
if (!laddr) {
// TODO why is this needed?
@@ -282,9 +282,9 @@ __device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t la
memcpy(wqe->common.pld.data, reinterpret_cast<const void*>(laddr), size);
}
} else {
wqe->common.pld.sgl[0].va = swap_endian_val<uint64_t>(laddr);
wqe->common.pld.sgl[0].len = swap_endian_val<uint32_t>(size);
wqe->common.pld.sgl[0].lkey = swap_endian_val<uint32_t>(lkey);
wqe->common.pld.sgl[0].va = byteswap<uint64_t>(laddr);
wqe->common.pld.sgl[0].len = byteswap<uint32_t>(size);
wqe->common.pld.sgl[0].lkey = byteswap<uint32_t>(lkey);
}
}
@@ -319,32 +319,32 @@ __device__ uint64_t QueuePair::ionic_post_wqe_amo(int pe, int32_t size, uintptr_
}
if (!(my_sq_pos & (sq_mask + 1))) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_COLOR);
wqe_flags |= byteswap<uint16_t>(IONIC_V1_FLAG_COLOR);
}
if (is_last_active_lane(activemask)) {
wqe_flags |= swap_endian_val<uint16_t>(IONIC_V1_FLAG_SIG);
wqe_flags |= byteswap<uint16_t>(IONIC_V1_FLAG_SIG);
}
wqe->base.wqe_idx = my_sq_pos;
wqe->base.op = opcode;
wqe->base.num_sge_key = 1;
wqe->base.imm_data_key = swap_endian_val<uint32_t>(0);
wqe->base.imm_data_key = byteswap<uint32_t>(0);
wqe->atomic_v2.remote_va_high = swap_endian_val<uint32_t>(raddr >> 32);
wqe->atomic_v2.remote_va_low = swap_endian_val<uint32_t>(raddr);
wqe->atomic_v2.remote_rkey = swap_endian_val<uint32_t>(rkey);
wqe->atomic_v2.swap_add_high = swap_endian_val<uint32_t>(atomic_data >> 32);
wqe->atomic_v2.swap_add_low = swap_endian_val<uint32_t>(atomic_data);
wqe->atomic_v2.compare_high = swap_endian_val<uint32_t>(atomic_cmp >> 32);
wqe->atomic_v2.compare_low = swap_endian_val<uint32_t>(atomic_cmp);
wqe->atomic_v2.remote_va_high = byteswap<uint32_t>(raddr >> 32);
wqe->atomic_v2.remote_va_low = byteswap<uint32_t>(raddr);
wqe->atomic_v2.remote_rkey = byteswap<uint32_t>(rkey);
wqe->atomic_v2.swap_add_high = byteswap<uint32_t>(atomic_data >> 32);
wqe->atomic_v2.swap_add_low = byteswap<uint32_t>(atomic_data);
wqe->atomic_v2.compare_high = byteswap<uint32_t>(atomic_cmp >> 32);
wqe->atomic_v2.compare_low = byteswap<uint32_t>(atomic_cmp);
if (fetching) {
wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(wave_fetch_atomic + my_logical_lane_id));
wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(fetching_atomic_lkey);
wqe->atomic_v2.local_va = byteswap<uint64_t>(reinterpret_cast<uint64_t>(wave_fetch_atomic + my_logical_lane_id));
wqe->atomic_v2.lkey = byteswap<uint32_t>(fetching_atomic_lkey);
} else {
wqe->atomic_v2.local_va = swap_endian_val<uint64_t>(reinterpret_cast<uint64_t>(nonfetching_atomic));
wqe->atomic_v2.lkey = swap_endian_val<uint32_t>(nonfetching_atomic_lkey);
wqe->atomic_v2.local_va = byteswap<uint64_t>(reinterpret_cast<uint64_t>(nonfetching_atomic));
wqe->atomic_v2.lkey = byteswap<uint32_t>(nonfetching_atomic_lkey);
}
__hip_atomic_store(&wqe->base.flags, wqe_flags, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
+3 -4
Ver fichero
@@ -31,7 +31,7 @@
namespace rocshmem {
__device__ void QueuePair::mlx5_ring_doorbell(uint64_t db_val, uint64_t my_sq_counter) {
swap_endian_store(const_cast<uint32_t*>(dbrec), (uint32_t)my_sq_counter);
*dbrec = byteswap<uint32_t>(my_sq_counter);
__atomic_signal_fence(__ATOMIC_SEQ_CST);
__hip_atomic_store(db.ptr, db_val, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
@@ -98,8 +98,7 @@ __device__ void QueuePair::mlx5_quiet() {
}
}
uint16_t wqe_counter;
swap_endian_store(const_cast<uint16_t*>(&wqe_counter), reinterpret_cast<uint16_t>(be_wqe_counter));
uint16_t wqe_counter = byteswap<uint16_t>(be_wqe_counter);
uint64_t wqe_id = outstanding_wqes[wqe_counter];
__hip_atomic_fetch_max(&wqe_broadcast[wavefront_id], wqe_id, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
uint8_t mlx5_invld_bits = MLX5_CQE_INVALID << 4 | owner_bit;
@@ -112,7 +111,7 @@ __device__ void QueuePair::mlx5_quiet() {
completed = __hip_atomic_load(&quiet_completed, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
} while (completed != wave_cq_consumer);
swap_endian_store(const_cast<uint32_t*>(cq_dbrec), (uint32_t)(wave_cq_consumer + quiet_amount));
*cq_dbrec = byteswap<uint32_t>(wave_cq_consumer + quiet_amount);
__atomic_signal_fence(__ATOMIC_SEQ_CST);
uint64_t sunk_wqe_id = wqe_broadcast[wavefront_id];
+8 -8
Ver fichero
@@ -72,8 +72,8 @@ __device__ SegmentBuilder::SegmentBuilder(uint64_t wqe_idx, void *base) {
*/
__device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm) {
segp->ctrl_seg = {0};
swap_endian_store(&segp->ctrl_seg.opmod_idx_opcode, ((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode);
swap_endian_store(&segp->ctrl_seg.qpn_ds, qp_num << 8 | ds);
segp->ctrl_seg.opmod_idx_opcode = byteswap<uint32_t>(((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode);
segp->ctrl_seg.qpn_ds = byteswap<uint32_t>(qp_num << 8 | ds);
segp->ctrl_seg.fm_ce_se = fm_ce_se;
segp->ctrl_seg.signature = signature;
segp->ctrl_seg.imm = imm;
@@ -82,7 +82,7 @@ __device__ void SegmentBuilder::update_ctrl_seg(uint16_t pi, uint8_t opcode, uin
__device__ void SegmentBuilder::update_raddr_seg(uint64_t raddr, uint32_t rkey) {
segp->raddr_seg = {0};
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->raddr_seg.raddr), raddr);
segp->raddr_seg.raddr = byteswap<uint64_t>(raddr);
segp->raddr_seg.rkey = rkey;
segp++;
}
@@ -102,15 +102,15 @@ __device__ void SegmentBuilder::update_raddr_seg(uint64_t raddr, uint32_t rkey)
*/
__device__ void SegmentBuilder::update_data_seg(uint64_t laddr, uint32_t size, uint32_t lkey) {
segp->data_seg = {0};
swap_endian_store(&segp->data_seg.byte_count, size);
segp->data_seg.byte_count = byteswap<uint32_t>(size);
segp->data_seg.lkey = lkey;
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->data_seg.addr), laddr);
segp->data_seg.addr = byteswap<uint64_t>(laddr);
segp++;
}
__device__ void SegmentBuilder::update_inl_data_seg(const void* laddr, int32_t size) {
// size is masked with 0x3FF because only the first 10 bits of byte_count are valid
swap_endian_store(&segp->inl_data_seg.byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG));
segp->inl_data_seg.byte_count = byteswap<uint32_t>((size & 0x3FF) | MLX5_INLINE_SEG);
// + 1 because we start packing the segment with data after the byte_count parameter
memcpy(&segp->inl_data_seg + 1, laddr, size);
segp++;
@@ -118,8 +118,8 @@ __device__ void SegmentBuilder::update_inl_data_seg(const void* laddr, int32_t s
__device__ void SegmentBuilder::update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp) {
segp->atomic_seg = {0};
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->atomic_seg.swap_add), atomic_data);
swap_endian_store(reinterpret_cast<uint64_t*>(&segp->atomic_seg.compare), atomic_cmp);
segp->atomic_seg.swap_add = byteswap<uint64_t>(atomic_data);
segp->atomic_seg.compare = byteswap<uint64_t>(atomic_cmp);
segp++;
}