From 21171deeb8b97e6890dffa397c20187f4a861234 Mon Sep 17 00:00:00 2001 From: Yiltan Hassan Temucin Date: Mon, 3 Feb 2025 10:40:41 -0800 Subject: [PATCH 1/2] [RO] added MPI_UNSIGNED_LONG as type --- src/reverse_offload/commands_types.hpp | 1 + src/reverse_offload/context_ro_tmpl_device.hpp | 2 +- src/reverse_offload/mpi_transport.cpp | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/reverse_offload/commands_types.hpp b/src/reverse_offload/commands_types.hpp index e28ff572cd..05997e442f 100644 --- a/src/reverse_offload/commands_types.hpp +++ b/src/reverse_offload/commands_types.hpp @@ -53,6 +53,7 @@ enum ro_net_types { RO_NET_DOUBLE, RO_NET_INT, RO_NET_LONG, + RO_NET_UNSIGNED_LONG, RO_NET_LONG_LONG, RO_NET_SHORT, RO_NET_LONG_DOUBLE diff --git a/src/reverse_offload/context_ro_tmpl_device.hpp b/src/reverse_offload/context_ro_tmpl_device.hpp index d76a9eb222..b13c643244 100644 --- a/src/reverse_offload/context_ro_tmpl_device.hpp +++ b/src/reverse_offload/context_ro_tmpl_device.hpp @@ -61,7 +61,7 @@ struct GetROType { template <> struct GetROType { - static constexpr ro_net_types Type{RO_NET_LONG}; + static constexpr ro_net_types Type{RO_NET_UNSIGNED_LONG}; }; template <> diff --git a/src/reverse_offload/mpi_transport.cpp b/src/reverse_offload/mpi_transport.cpp index 9aa660d30c..3bdc40cd35 100644 --- a/src/reverse_offload/mpi_transport.cpp +++ b/src/reverse_offload/mpi_transport.cpp @@ -376,6 +376,8 @@ static MPI_Datatype convertType(ro_net_types type) { return MPI_INT; case RO_NET_LONG: return MPI_LONG; + case RO_NET_UNSIGNED_LONG: + return MPI_UNSIGNED_LONG; case RO_NET_LONG_LONG: return MPI_LONG_LONG; case RO_NET_SHORT: From f1c25f7e197eb6f0ece15eec26dcde862927c273 Mon Sep 17 00:00:00 2001 From: Yiltan Hassan Temucin Date: Mon, 3 Feb 2025 11:51:45 -0800 Subject: [PATCH 2/2] [RO] implemented signaling operations --- src/reverse_offload/context_ro_device.cpp | 106 ++++++++++++++++++ src/reverse_offload/context_ro_device.hpp | 21 ++++ .../context_ro_tmpl_device.hpp | 19 ++++ 3 files changed, 146 insertions(+) diff --git a/src/reverse_offload/context_ro_device.cpp b/src/reverse_offload/context_ro_device.cpp index 4cc5951de0..8cf2faeedb 100644 --- a/src/reverse_offload/context_ro_device.cpp +++ b/src/reverse_offload/context_ro_device.cpp @@ -21,6 +21,7 @@ *****************************************************************************/ #include "context_ro_device.hpp" +#include "context_ro_tmpl_device.hpp" #include #include @@ -348,6 +349,111 @@ __device__ void ROContext::getmem_nbi_wave(void *dest, const void *source, } } +__device__ void ROContext::putmem_signal(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem(dest, source, nelems, pe); + fence(); + + switch (sig_op) { + case ROCSHMEM_SIGNAL_SET: + amo_set(static_cast(sig_addr), signal, pe); + break; + case ROCSHMEM_SIGNAL_ADD: + amo_add(static_cast(sig_addr), signal, pe); + break; + default: + DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op); + break; + } +} + +__device__ void ROContext::putmem_signal_wg(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_wg(dest, source, nelems, pe); + fence(); + + if (is_thread_zero_in_block()) { + switch (sig_op) { + case ROCSHMEM_SIGNAL_SET: + amo_set(static_cast(sig_addr), signal, pe); + break; + case ROCSHMEM_SIGNAL_ADD: + amo_add(static_cast(sig_addr), signal, pe); + break; + default: + DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op); + break; + } + } +} + +__device__ void ROContext::putmem_signal_wave(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_wave(dest, source, nelems, pe); + fence(); + + if (is_thread_zero_in_wave()) { + switch (sig_op) { + case ROCSHMEM_SIGNAL_SET: + amo_set(static_cast(sig_addr), signal, pe); + break; + case ROCSHMEM_SIGNAL_ADD: + amo_add(static_cast(sig_addr), signal, pe); + break; + default: + DPRINTF("[%s] Invalid sig_op value (%d)\n", __func__, sig_op); + break; + } + } +} + +__device__ void ROContext::putmem_signal_nbi(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_signal(dest, source, nelems, sig_addr, signal, sig_op, pe); +} + +__device__ void ROContext::putmem_signal_nbi_wg(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_signal_wg(dest, source, nelems, sig_addr, signal, sig_op, pe); +} + +__device__ void ROContext::putmem_signal_nbi_wave(void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, + int pe) { + putmem_signal_wave(dest, source, nelems, sig_addr, signal, sig_op, pe); +} + +__device__ uint64_t ROContext::signal_fetch(const uint64_t *sig_addr) { + uint64_t *dst = const_cast(sig_addr); + return amo_fetch_add(static_cast(dst), 0, my_pe); +} + +__device__ uint64_t ROContext::signal_fetch_wg(const uint64_t *sig_addr) { + __shared__ uint64_t value; + if (is_thread_zero_in_block()) { + uint64_t *dst = const_cast(sig_addr); + value = amo_fetch_add(static_cast(dst), 0, my_pe); + } + __threadfence_block(); + return value; +} + +__device__ uint64_t ROContext::signal_fetch_wave(const uint64_t *sig_addr) { + uint64_t value; + if (is_thread_zero_in_wave()) { + uint64_t *dst = const_cast(sig_addr); + value = amo_fetch_add(static_cast(dst), 0, my_pe); + } + __threadfence_block(); + value = __shfl(value, 0); + return value; +} + __device__ uint64_t number_active_lanes() { return __popcll(__ballot(1)); } diff --git a/src/reverse_offload/context_ro_device.hpp b/src/reverse_offload/context_ro_device.hpp index ceccf5e776..84be8b158f 100644 --- a/src/reverse_offload/context_ro_device.hpp +++ b/src/reverse_offload/context_ro_device.hpp @@ -229,6 +229,27 @@ class ROContext : public Context { template __device__ void get_nbi_wave(T *dest, const T *source, size_t nelems, int pe); +#define RO_CONTEXT_PUT_SIGNAL_DEC(SUFFIX) \ + template \ + __device__ void put_signal##SUFFIX(T *dest, const T *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe); \ + \ + __device__ void putmem_signal##SUFFIX(void *dest, const void *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe); + + RO_CONTEXT_PUT_SIGNAL_DEC() + RO_CONTEXT_PUT_SIGNAL_DEC(_wg) + RO_CONTEXT_PUT_SIGNAL_DEC(_wave) + RO_CONTEXT_PUT_SIGNAL_DEC(_nbi) + RO_CONTEXT_PUT_SIGNAL_DEC(_nbi_wg) + RO_CONTEXT_PUT_SIGNAL_DEC(_nbi_wave) + + __device__ uint64_t signal_fetch(const uint64_t *sig_addr); + __device__ uint64_t signal_fetch_wg(const uint64_t *sig_addr); + __device__ uint64_t signal_fetch_wave(const uint64_t *sig_addr); + private: __device__ uint64_t *get_unused_atomic(); diff --git a/src/reverse_offload/context_ro_tmpl_device.hpp b/src/reverse_offload/context_ro_tmpl_device.hpp index b13c643244..5c05d9e902 100644 --- a/src/reverse_offload/context_ro_tmpl_device.hpp +++ b/src/reverse_offload/context_ro_tmpl_device.hpp @@ -432,6 +432,25 @@ __device__ void ROContext::get_nbi_wave(T *dest, const T *source, size_t nelems, getmem_nbi_wave(dest, source, size, pe); } +#define RO_CONTEXT_PUT_SIGNAL_DEF(SUFFIX) \ + template \ + __device__ void ROContext::put_signal##SUFFIX(T *dest, const T *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe) { \ + putmem_signal##SUFFIX(dest, source, nelems * sizeof(T), sig_addr, signal, sig_op, pe); \ + } \ + \ + template \ + __device__ void ROContext::put_signal_nbi##SUFFIX(T *dest, const T *source, size_t nelems, \ + uint64_t *sig_addr, uint64_t signal, int sig_op, \ + int pe) { \ + putmem_signal##SUFFIX(dest, source, nelems * sizeof(T), sig_addr, signal, sig_op, pe); \ + } + +RO_CONTEXT_PUT_SIGNAL_DEF() +RO_CONTEXT_PUT_SIGNAL_DEF(_wg) +RO_CONTEXT_PUT_SIGNAL_DEF(_wave) + } // namespace rocshmem #endif // LIBRARY_SRC_REVERSE_OFFLOAD_RO_NET_GPU_TEMPLATES_HPP_