Files
rocm-systems/projects/rocshmem/src/context_tmpl_device.hpp
T
Avinash Kethineedi c4de6833f6 Add SPDX license identifiers and update copyright headers (#85)
* Update copyright information and add SPDX license identifier

* Update AUTHORS

* Remove `sos_tests`

[ROCm/rocshmem commit: f6ef19f5a9]
2025-04-15 15:37:53 -05:00

583 baris
15 KiB
C++

/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_
#define LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "backend_type.hpp"
#ifdef USE_RO
#include "reverse_offload/context_ro_device.hpp"
#else
#include "ipc/context_ipc_device.hpp"
#endif
namespace rocshmem {
/*
* Context dispatch implementations for the template functions. Needs to
* be in a header and not cpp because it is a template.
*/
template <typename T>
__device__ void Context::p(T *dest, T value, int pe) {
ctxStats.incStat(NUM_P);
/*
* TODO: Need to handle _p a bit differently for coalescing, since the
* owner of a coalesced message needs val from all absorbed messages.
*/
DISPATCH(p(dest, value, pe));
}
template <typename T>
__device__ T Context::g(T *source, int pe) {
ctxStats.incStat(NUM_G);
/*
* TODO: Need to handle _g a bit differently for coalescing, since the
* owner of a coalesced message needs val from all absorbed messages.
*/
DISPATCH_RET(g(source, pe));
}
// The only way to get multi-arg templates to feed into a macro
template <typename T, ROCSHMEM_OP Op>
__device__ void Context::to_all(T *dest, const T *source, int nreduce,
int PE_start, int logPE_stride, int PE_size,
T *pWrk,
long *pSync) { // NOLINT(runtime/int)
if (nreduce == 0) {
return;
}
if (is_thread_zero_in_block()) {
ctxStats.incStat(NUM_TO_ALL);
}
DISPATCH(to_all<PAIR(T, Op)>(dest, source, nreduce, PE_start, logPE_stride,
PE_size, pWrk, pSync));
}
template <typename T, ROCSHMEM_OP Op>
__device__ int Context::reduce(rocshmem_team_t team, T *dest, const T *source,
int nreduce) {
if (nreduce == 0) {
return ROCSHMEM_SUCCESS;
}
if (is_thread_zero_in_block()) {
ctxStats.incStat(NUM_TO_ALL);
}
DISPATCH_RET(reduce<PAIR(T, Op)>(team, dest, source, nreduce));
}
template <typename T>
__device__ void Context::put(T *dest, const T *source, size_t nelems, int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_PUT);
DISPATCH(put(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::put_nbi(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_PUT_NBI);
DISPATCH(put_nbi(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::get(T *dest, const T *source, size_t nelems, int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_GET);
DISPATCH(get(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::get_nbi(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_GET_NBI);
DISPATCH(get_nbi(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::alltoall(rocshmem_team_t team, T *dest,
const T *source, int nelems) {
if (nelems == 0) {
return;
}
if (is_thread_zero_in_block()) {
ctxStats.incStat(NUM_ALLTOALL);
}
DISPATCH(alltoall<T>(team, dest, source, nelems));
}
template <typename T>
__device__ void Context::fcollect(rocshmem_team_t team, T *dest,
const T *source, int nelems) {
if (nelems == 0) {
return;
}
if (is_thread_zero_in_block()) {
ctxStats.incStat(NUM_FCOLLECT);
}
DISPATCH(fcollect<T>(team, dest, source, nelems));
}
template <typename T>
__device__ void Context::broadcast(rocshmem_team_t team, T *dest,
const T *source, int nelems, int pe_root) {
if (nelems == 0) {
return;
}
if (is_thread_zero_in_block()) {
ctxStats.incStat(NUM_BROADCAST);
}
DISPATCH(broadcast<T>(team, dest, source, nelems, pe_root));
}
template <typename T>
__device__ void Context::broadcast(T *dest, const T *source, int nelems,
int pe_root, int pe_start, int log_pe_stride,
int pe_size,
long *p_sync) { // NOLINT(runtime/int)
if (nelems == 0) {
return;
}
if (is_thread_zero_in_block()) {
ctxStats.incStat(NUM_BROADCAST);
}
DISPATCH(broadcast<T>(dest, source, nelems, pe_root, pe_start, log_pe_stride,
pe_size, p_sync));
}
template <typename T>
__device__ __forceinline__ void Context::wait_until(T *ivars, int cmp,
T val) {
while (!test(ivars, cmp, val)) {
}
}
__device__ __forceinline__ size_t status_entry(size_t nelems,
const int *status) {
size_t i{0};
while (i < nelems) {
if (status[i] == 0) {
return i;
}
i++;
}
return i;
}
template <typename T>
__device__ __forceinline__
size_t Context::wait_until_any(T *ivars, size_t nelems,
const int *status,
int cmp, T val) {
// zero nelems error condition
if (!nelems) {
return SIZE_MAX;
}
size_t pos{status_entry(nelems, status)};
// invalid (empty) status array error condition
if (pos == nelems) {
return SIZE_MAX;
}
while (true) {
for (size_t i{pos}; i < nelems; i++) {
// skip entries marked with non-zero status
if (status[i]) {
continue;
}
if (test(ivars + i, cmp, val)) {
return i;
}
}
}
}
template <typename T>
__device__ __forceinline__
void Context::wait_until_all(T *ivars, size_t nelems,
const int *status,
int cmp, T val) {
// zero nelems error condition
if (!nelems) {
return;
}
size_t pos{status_entry(nelems, status)};
// invalid (empty) status array error condition
if (pos == nelems) {
return;
}
for (size_t i{pos}; i < nelems; i++) {
if (status[i]) {
continue;
}
while (!test(ivars + i, cmp, val)) {
}
}
}
template <typename T>
__device__ __forceinline__
size_t Context::wait_until_some(T *ivars, size_t nelems,
size_t* indices,
const int *status,
int cmp, T val) {
// zero nelems error condition
if (!nelems) {
return 0;
}
size_t pos{status_entry(nelems, status)};
// invalid (empty) status array error condition
if (pos == nelems) {
return 0;
}
bool done {false};
size_t ncompleted {0};
while (!done) {
for (size_t i{pos}; i < nelems; i++) {
// skip entries marked with non-zero status
if (status[i]) {
continue;
}
if (test(ivars + i, cmp, val)) {
done = true;
indices[ncompleted] = i;
ncompleted++;
}
}
}
return ncompleted;
}
template <typename T>
__device__ __forceinline__
void Context::wait_until_all_vector(T *ivars, size_t nelems,
const int *status,
int cmp, T* vals) {
;
}
template <typename T>
__device__ __forceinline__
size_t Context::wait_until_any_vector(T *ivars, size_t nelems,
const int *status,
int cmp, T* vals) {
return 0;
}
template <typename T>
__device__ __forceinline__
size_t Context::wait_until_some_vector(T *ivars, size_t nelems,
size_t* indices,
const int *status,
int cmp, T* vals) {
return 0;
}
template <typename T>
__device__ __forceinline__ int Context::test(T *ivars, int cmp,
T val) {
int ret = 0;
volatile T *vol_ivars = reinterpret_cast<T *>(ivars);
switch (cmp) {
case ROCSHMEM_CMP_EQ:
if (uncached_load(vol_ivars) == val) {
ret = 1;
}
break;
case ROCSHMEM_CMP_NE:
if (uncached_load(vol_ivars) != val) {
ret = 1;
}
break;
case ROCSHMEM_CMP_GT:
if (uncached_load(vol_ivars) > val) {
ret = 1;
}
break;
case ROCSHMEM_CMP_GE:
if (uncached_load(vol_ivars) >= val) {
ret = 1;
}
break;
case ROCSHMEM_CMP_LT:
if (uncached_load(vol_ivars) < val) {
ret = 1;
}
break;
case ROCSHMEM_CMP_LE:
if (uncached_load(vol_ivars) <= val) {
ret = 1;
}
break;
default:
break;
}
return ret;
}
template <typename T>
__device__ void Context::put_wg(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_PUT_WG);
DISPATCH(put_wg(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::put_nbi_wg(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_PUT_NBI_WG);
DISPATCH(put_nbi_wg(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::get_wg(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_GET_WG);
DISPATCH(get_wg(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::get_nbi_wg(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_GET_NBI_WG);
DISPATCH(get_nbi_wg(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::put_wave(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_PUT_WAVE);
DISPATCH(put_wave(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::put_nbi_wave(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_PUT_NBI_WAVE);
DISPATCH(put_nbi_wave(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::get_wave(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_GET_WAVE);
DISPATCH(get_wave(dest, source, nelems, pe));
}
template <typename T>
__device__ void Context::get_nbi_wave(T *dest, const T *source, size_t nelems,
int pe) {
if (nelems == 0) {
return;
}
ctxStats.incStat(NUM_GET_NBI_WAVE);
DISPATCH(get_nbi_wave(dest, source, nelems, pe));
}
template <typename T>
__device__ T Context::amo_fetch_add(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_FADD);
DISPATCH_RET(amo_fetch_add(dst, value, pe));
}
template <typename T>
__device__ void Context::amo_add(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_ADD);
DISPATCH(amo_add(dst, value, pe));
}
template <typename T>
__device__ void Context::amo_set(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_SET);
DISPATCH(amo_set(dst, value, pe));
}
template <typename T>
__device__ T Context::amo_swap(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_SWAP);
DISPATCH_RET(amo_swap(dst, value, pe));
}
template <typename T>
__device__ T Context::amo_fetch_and(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_FETCH_AND);
DISPATCH_RET(amo_fetch_and(dst, value, pe));
}
template <typename T>
__device__ void Context::amo_and(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_AND);
DISPATCH(amo_and(dst, value, pe));
}
template <typename T>
__device__ T Context::amo_fetch_or(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_FETCH_OR);
DISPATCH_RET(amo_fetch_or(dst, value, pe));
}
template <typename T>
__device__ void Context::amo_or(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_OR);
DISPATCH(amo_or(dst, value, pe));
}
template <typename T>
__device__ T Context::amo_fetch_xor(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_FETCH_XOR);
DISPATCH_RET(amo_fetch_xor(dst, value, pe));
}
template <typename T>
__device__ void Context::amo_xor(void *dst, T value, int pe) {
ctxStats.incStat(NUM_ATOMIC_XOR);
DISPATCH(amo_xor(dst, value, pe));
}
template <typename T>
__device__ T Context::amo_fetch_cas(void *dst, T value, T cond, int pe) {
ctxStats.incStat(NUM_ATOMIC_FCSWAP);
DISPATCH_RET(amo_fetch_cas(dst, value, cond, pe));
}
template <typename T>
__device__ void Context::amo_cas(void *dst, T value, T cond, int pe) {
ctxStats.incStat(NUM_ATOMIC_CSWAP);
DISPATCH(amo_cas(dst, value, cond, pe));
}
#define CONTEXT_PUT_SIGNAL_DEF(SUFFIX, STATS_SUFFIX) \
template <typename T> \
__device__ void Context::put_signal##SUFFIX(T *dest, const T *source, size_t nelems, \
uint64_t *sig_addr, uint64_t signal, int sig_op, \
int pe) { \
if (nelems == 0) { \
return; \
} \
\
ctxStats.incStat(NUM_PUT_SIGNAL##STATS_SUFFIX); \
\
DISPATCH(put_signal##SUFFIX(dest, source, nelems, sig_addr, signal, sig_op, pe)); \
}
CONTEXT_PUT_SIGNAL_DEF(,)
CONTEXT_PUT_SIGNAL_DEF(_wg, _WG)
CONTEXT_PUT_SIGNAL_DEF(_wave, _WAVE)
CONTEXT_PUT_SIGNAL_DEF(_nbi, _NBI)
CONTEXT_PUT_SIGNAL_DEF(_nbi_wg, _NBI_WG)
CONTEXT_PUT_SIGNAL_DEF(_nbi_wave, _NBI_WAVE)
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTEXT_TMPL_DEVICE_HPP_