* Use a single printf per line (reduce chances of lines being cut in logs)

* team_comm can be an int or a pointer depending on MPI impl.
Received is confusing (since we are on the origin), use submitted
instead

* Print arguments to calls when using DEBUG

---------

Signed-off-by: Aurelien Bouteiller <abouteil@amd.com>

[ROCm/rocshmem commit: 3600291558]
This commit is contained in:
Aurelien Bouteiller
2025-05-15 10:37:12 -04:00
committed by GitHub
szülő 17cde51fb7
commit 03a9fac960
3 fájl változott, egészen pontosan 194 új sor hozzáadva és 127 régi sor törölve
@@ -97,7 +97,7 @@ void MPITransport::submitRequestsToMPI() {
putMem(next_element.dst, next_element.src, next_element.ol1.size,
next_element.PE, next_element.ro_net_win_id, queue_idx,
next_element.status, true);
DPRINTF("Received PUT dst %p src %p size %lu pe %d win_id %d\n",
DPRINTF("Submitted PUT dst %p src %p size %lu pe %d win_id %d\n",
next_element.dst, next_element.src, next_element.ol1.size,
next_element.PE, next_element.ro_net_win_id);
break;
@@ -112,7 +112,7 @@ void MPITransport::submitRequestsToMPI() {
putMem(next_element.dst, source_buffer, next_element.ol1.size,
next_element.PE, next_element.ro_net_win_id, queue_idx,
next_element.status, true, true);
DPRINTF("Received P dst %p value %p pe %d\n", next_element.dst,
DPRINTF("Submitted P dst %p value %p pe %d\n", next_element.dst,
next_element.src, next_element.PE);
break;
}
@@ -120,14 +120,14 @@ void MPITransport::submitRequestsToMPI() {
getMem(next_element.dst, next_element.src, next_element.ol1.size,
next_element.PE, next_element.ro_net_win_id, queue_idx,
next_element.status, true);
DPRINTF("Received GET dst %p src %p size %lu pe %d\n", next_element.dst,
DPRINTF("Submitted GET dst %p src %p size %lu pe %d\n", next_element.dst,
next_element.src, next_element.ol1.size, next_element.PE);
break;
case RO_NET_PUT_NBI:
putMem(next_element.dst, next_element.src, next_element.ol1.size,
next_element.PE, next_element.ro_net_win_id, queue_idx,
next_element.status, false);
DPRINTF("Received PUT NBI dst %p src %p size %lu pe %d\n",
DPRINTF("Submitted PUT NBI dst %p src %p size %lu pe %d\n",
next_element.dst, next_element.src, next_element.ol1.size,
next_element.PE);
break;
@@ -135,7 +135,7 @@ void MPITransport::submitRequestsToMPI() {
getMem(next_element.dst, next_element.src, next_element.ol1.size,
next_element.PE, next_element.ro_net_win_id, queue_idx,
next_element.status, false);
DPRINTF("Received GET NBI dst %p src %p size %lu pe %d\n",
DPRINTF("Submitted GET NBI dst %p src %p size %lu pe %d\n",
next_element.dst, next_element.src, next_element.ol1.size,
next_element.PE);
break;
@@ -146,7 +146,7 @@ void MPITransport::submitRequestsToMPI() {
next_element.status, true,
static_cast<ROCSHMEM_OP>(next_element.op),
static_cast<ro_net_types>(next_element.datatype));
DPRINTF("Received AMO dst %p src %p Val %llu pe %d\n", next_element.dst,
DPRINTF("Submitted AMO dst %p src %p Val %llu pe %d\n", next_element.dst,
next_element.src, next_element.ol1.atomic_value, next_element.PE);
break;
case RO_NET_AMO_FCAS:
@@ -156,7 +156,7 @@ void MPITransport::submitRequestsToMPI() {
next_element.status, true,
const_cast<void **>(&next_element.ol2.pWrk),
static_cast<ro_net_types>(next_element.datatype));
DPRINTF("Received F_CSWAP dst %p src %p Val %llu pe %d cond %ld\n",
DPRINTF("Submitted F_CSWAP dst %p src %p Val %llu pe %d cond %ld\n",
next_element.dst, next_element.src, next_element.ol1.atomic_value,
next_element.PE,
reinterpret_cast<int64_t>(next_element.ol2.pWrk));
@@ -168,9 +168,9 @@ void MPITransport::submitRequestsToMPI() {
static_cast<ROCSHMEM_OP>(next_element.op),
static_cast<ro_net_types>(next_element.datatype),
next_element.status, true);
DPRINTF("Received FLOAT_SUM_TEAM_REDUCE dst %p src %p size %lu team %d\n",
DPRINTF("Submitted FLOAT_SUM_TEAM_REDUCE dst %p src %p size %lu team %zd\n",
next_element.dst, next_element.src, next_element.ol1.size,
next_element.team_comm);
(intptr_t)next_element.team_comm);
break;
case RO_NET_TEAM_BROADCAST:
team_broadcast(next_element.dst, next_element.src, next_element.ol1.size,
@@ -179,10 +179,10 @@ void MPITransport::submitRequestsToMPI() {
static_cast<ro_net_types>(next_element.datatype),
next_element.status, true);
DPRINTF(
"Received TEAM_BROADCAST dst %p src %p size %lu "
"team %d, PE_root %d \n",
"Submitted TEAM_BROADCAST dst %p src %p size %lu "
"team %zd, PE_root %d \n",
next_element.dst, next_element.src, next_element.ol1.size,
next_element.team_comm, next_element.PE_root);
(intptr_t)next_element.team_comm, next_element.PE_root);
break;
case RO_NET_ALLTOALL:
alltoall(next_element.dst, next_element.src, next_element.ol1.size,
@@ -190,9 +190,9 @@ void MPITransport::submitRequestsToMPI() {
next_element.ol2.pWrk,
static_cast<ro_net_types>(next_element.datatype),
next_element.status, true);
DPRINTF("Received ALLTOALL dst %p src %p size %lu team %d\n",
DPRINTF("Submitted ALLTOALL dst %p src %p size %lu team %zd\n",
next_element.dst, next_element.src, next_element.ol1.size,
next_element.team_comm);
(intptr_t)next_element.team_comm);
break;
case RO_NET_FCOLLECT:
fcollect(next_element.dst, next_element.src, next_element.ol1.size,
@@ -200,30 +200,30 @@ void MPITransport::submitRequestsToMPI() {
next_element.ol2.pWrk,
static_cast<ro_net_types>(next_element.datatype),
next_element.status, true);
DPRINTF("Received FCOLLECT dst %p src %p size %lu team %d\n",
DPRINTF("Submitted FCOLLECT dst %p src %p size %lu team %zd\n",
next_element.dst, next_element.src, next_element.ol1.size,
next_element.team_comm);
(intptr_t)next_element.team_comm);
break;
case RO_NET_BARRIER:
barrier(queue_idx, next_element.status, true,
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
true);
DPRINTF("Received Barrier_all\n");
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
true);
DPRINTF("Submitted Barrier_all\n");
break;
case RO_NET_SYNC:
barrier(queue_idx, next_element.status, true,
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
false);
DPRINTF("Received Sync\n");
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
false);
DPRINTF("Submitted Sync\n");
break;
case RO_NET_FENCE:
case RO_NET_QUIET:
quiet(queue_idx, next_element.status);
DPRINTF("Received FENCE/QUIET\n");
DPRINTF("Submitted FENCE/QUIET\n");
break;
case RO_NET_FINALIZE:
quiet(queue_idx, next_element.status);
DPRINTF("Received Finalize\n");
DPRINTF("Submitted Finalize\n");
break;
default:
fprintf(stderr, "Invalid GPU Packet received, exiting....\n");
+164 -96
Fájl megtekintése
@@ -283,14 +283,14 @@ __device__ Context *get_internal_ctx(rocshmem_ctx_t ctx) {
return reinterpret_cast<Context *>(ctx.ctx_opaque);
}
__device__ int rocshmem_wg_ctx_create(long option, rocshmem_ctx_t *ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_create\n");
__device__ int rocshmem_wg_ctx_create(long options, rocshmem_ctx_t *ctx) {
GPU_DPRINTF("Function: rocshmem_wg_ctx_create (options=%ld)\n", options);
bool result{true};
if (get_flat_block_id() == 0) {
ctx->team_opaque = reinterpret_cast<TeamInfo *>(ROCSHMEM_CTX_DEFAULT.team_opaque);
result = device_backend_proxy->create_ctx(option, ctx);
result = device_backend_proxy->create_ctx(options, ctx);
if(result) {
reinterpret_cast<Context *>(ctx->ctx_opaque)->setFence(option);
reinterpret_cast<Context *>(ctx->ctx_opaque)->setFence(options);
}
}
__syncthreads();
@@ -298,8 +298,9 @@ __device__ int rocshmem_wg_ctx_create(long option, rocshmem_ctx_t *ctx) {
}
__device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options,
rocshmem_ctx_t *ctx) {
GPU_DPRINTF("Function: rocshmem_team_create_ctx\n");
rocshmem_ctx_t *ctx) {
GPU_DPRINTF("Function: rocshmem_wg_team_create_ctx (team=%zd, options=%ld)\n",
(intptr_t)team, options);
if (team == ROCSHMEM_TEAM_INVALID) {
return -1;
}
@@ -321,7 +322,8 @@ __device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options,
__device__ void rocshmem_wg_ctx_destroy(
[[maybe_unused]] rocshmem_ctx_t *ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_destroy\n");
GPU_DPRINTF("Function: rocshmem_wg_ctx_destroy (ctx=%zd)\n",
ctx->ctx_opaque);
if (get_flat_block_id() == 0) {
device_backend_proxy->destroy_ctx(ctx);
@@ -329,7 +331,8 @@ __device__ void rocshmem_wg_ctx_destroy(
}
__device__ void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_threadfence_system\n");
GPU_DPRINTF("Function: rocshmem_ctx_threadfence_system (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->threadfence_system();
}
@@ -337,9 +340,9 @@ __device__ void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx) {
__device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_putmem\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_ctx_putmem (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->putmem(dest, source, nelems, pe_in_world);
}
@@ -347,27 +350,27 @@ __device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest,
template <typename T>
__device__ void rocshmem_put(rocshmem_ctx_t ctx, T *dest, const T *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_put\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_put (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->put(dest, source, nelems, pe_in_world);
}
template <typename T>
__device__ void rocshmem_p(rocshmem_ctx_t ctx, T *dest, T value, int pe) {
GPU_DPRINTF("Function: rocshmem_p\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_p (ctx=%zd, dest=%p, value=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, value, pe, pe_in_world);
get_internal_ctx(ctx)->p(dest, value, pe_in_world);
}
template <typename T>
__device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) {
GPU_DPRINTF("Function: rocshmem_g\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_g (ctx=%zd, source=%p, pe=%d w%d)\n",
ctx.ctx_opaque, source, pe, pe_in_world);
return get_internal_ctx(ctx)->g(source, pe_in_world);
}
@@ -375,9 +378,9 @@ __device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) {
__device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_getmem\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_ctx_getmem (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->getmem(dest, source, nelems, pe_in_world);
}
@@ -385,9 +388,9 @@ __device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest,
template <typename T>
__device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_get\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_get (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->get(dest, source, nelems, pe_in_world);
}
@@ -395,9 +398,9 @@ __device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source,
__device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->putmem_nbi(dest, source, nelems, pe_in_world);
}
@@ -405,9 +408,9 @@ __device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest,
template <typename T>
__device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_put_nbi\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_put_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->put_nbi(dest, source, nelems, pe_in_world);
}
@@ -415,9 +418,9 @@ __device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source,
__device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->getmem_nbi(dest, source, nelems, pe_in_world);
}
@@ -425,35 +428,38 @@ __device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest,
template <typename T>
__device__ void rocshmem_get_nbi(rocshmem_ctx_t ctx, T *dest, const T *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_get_nbi\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_get_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
get_internal_ctx(ctx)->get_nbi(dest, source, nelems, pe_in_world);
}
__device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_fence\n");
GPU_DPRINTF("Function: rocshmem_ctx_fence (ctx=%zd)\n", ctx.ctx_opaque);
get_internal_ctx(ctx)->fence();
}
__device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_fence\n");
int pe_in_world = translate_pe(ctx, pe);
GPU_DPRINTF("Function: rocshmem_ctx_fence (ctx=%zd, pe=%d w%d))\n",
ctx.ctx_opaque, pe, pe_in_world);
get_internal_ctx(ctx)->fence(pe_in_world);
}
__device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_quiet\n");
GPU_DPRINTF("Function: rocshmem_ctx_quiet (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->quiet();
}
__device__ void *rocshmem_ptr(const void *dest, int pe) {
GPU_DPRINTF("Function: rocshmem_ptr\n");
GPU_DPRINTF("Function: rocshmem_ptr (dest=%p, pe=%d w%d\n",
dest, pe, pe);
return get_internal_ctx(ROCSHMEM_CTX_DEFAULT)->shmem_ptr(dest, pe);
}
@@ -461,7 +467,8 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) {
template <typename T, ROCSHMEM_OP Op>
__device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team,
T *dest, const T *source, int nreduce) {
GPU_DPRINTF("Function: rocshmem_reduce\n");
GPU_DPRINTF("Function: rocshmem_reduce_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nreduce=%d\n",
ctx.ctx_opaque, team, dest, source, nreduce);
return get_internal_ctx(ctx)->reduce<T, Op>(team, dest, source, nreduce);
}
@@ -471,7 +478,8 @@ __device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team, T *dest,
const T *source, int nelem,
int pe_root) {
GPU_DPRINTF("Function: Team-based rocshmem_broadcast\n");
GPU_DPRINTF("Function: Team-based rocshmem_broadcast_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d, root=%d)\n",
ctx.ctx_opaque, team, dest, source, nelem, pe_root);
get_internal_ctx(ctx)->broadcast<T>(team, dest, source, nelem, pe_root);
}
@@ -480,7 +488,8 @@ template <typename T>
__device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team, T *dest,
const T *source, int nelem) {
GPU_DPRINTF("Function: rocshmem_alltoall\n");
GPU_DPRINTF("Function: rocshmem_alltoall_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d\n",
ctx.ctx_opaque, team, dest, source, nelem);
get_internal_ctx(ctx)->alltoall<T>(team, dest, source, nelem);
}
@@ -489,14 +498,16 @@ template <typename T>
__device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team, T *dest,
const T *source, int nelem) {
GPU_DPRINTF("Function: rocshmem_fcollect\n");
GPU_DPRINTF("Function: rocshmem_fcollect_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d\n",
ctx.ctx_opaque, team, dest, source, nelem);
get_internal_ctx(ctx)->fcollect<T>(team, dest, source, nelem);
}
template <typename T>
__device__ void rocshmem_wait_until(T *ivars, int cmp, T val) {
GPU_DPRINTF("Function: rocshmem_wait_until\n");
GPU_DPRINTF("Function: rocshmem_wait_until (ivars=%p, cmp=%d, val=%g)\n",
ivars, cmp, (double)val);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL);
@@ -506,7 +517,8 @@ __device__ void rocshmem_wait_until(T *ivars, int cmp, T val) {
template <typename T>
__device__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* status,
int cmp, T val) {
GPU_DPRINTF("Function: rocshmem_wait_until_all\n");
GPU_DPRINTF("Function: rocshmem_wait_until_all (ivars=%p, nelems=%zd cmp=%d, val=%g)\n",
ivars, nelems, cmp, (double)val);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL);
@@ -516,7 +528,8 @@ __device__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* stat
template <typename T>
__device__ size_t rocshmem_wait_until_any(T *ivars, size_t nelems, const int* status,
int cmp, T val) {
GPU_DPRINTF("Function: rocshmem_wait_until_any\n");
GPU_DPRINTF("Function: rocshmem_wait_until_any (ivars=%p, nelems=%zd cmp=%d, val=%g)\n",
ivars, nelems, cmp, (double)val);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY);
@@ -527,7 +540,8 @@ template <typename T>
__device__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indices,
const int* status, int cmp,
T val) {
DPRINTF("Function: rocshmem_wait_until_some\n");
DPRINTF("Function: rocshmem_wait_until_some (ivars=%p, nelems=%zd cmp=%d, val=%g)\n",
ivars, nelems, cmp, (double)val);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME);
@@ -537,7 +551,8 @@ __device__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indi
template <typename T>
__device__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const int* status,
int cmp, T* vals) {
DPRINTF("Function: rocshmem_wait_until_any_vector\n");
DPRINTF("Function: rocshmem_wait_until_any_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n",
ivars, nelems, cmp, vals);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY_VECTOR);
@@ -547,7 +562,8 @@ __device__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const
template <typename T>
__device__ void rocshmem_wait_until_all_vector(T *ivars, size_t nelems, const int* status,
int cmp, T* vals) {
DPRINTF("Function: rocshmem_wait_until_all_vector\n");
DPRINTF("Function: rocshmem_wait_until_all_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n",
ivars, nelems, cmp, vals);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL_VECTOR);
@@ -559,7 +575,8 @@ __device__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems,
size_t* indices,
const int* status,
int cmp, T* vals) {
DPRINTF("Function: rocshmem_wait_until_some_vector\n");
DPRINTF("Function: rocshmem_wait_until_some_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n",
ivars, nelems, cmp, vals);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME_VECTOR);
@@ -568,7 +585,8 @@ __device__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems,
template <typename T>
__device__ int rocshmem_test(T *ivars, int cmp, T val) {
GPU_DPRINTF("Function: rocshmem_testl\n");
GPU_DPRINTF("Function: rocshmem_test (ivars=%p, cmp=%d, val=%g)\n",
ivars, cmp, (double)val);
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
ctx_internal->ctxStats.incStat(NUM_TEST);
@@ -577,82 +595,95 @@ __device__ int rocshmem_test(T *ivars, int cmp, T val) {
}
__device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all\n");
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->barrier_all();
}
__device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n");
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->barrier_all_wave();
}
__device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n");
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->barrier_all_wg();
}
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_barrier\n");
GPU_DPRINTF("Function: rocshmem_ctx_barrier (ctx=%zd, team=%zd)\n",
ctx.ctx_opaque, team);
get_internal_ctx(ctx)->barrier(team);
}
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_barrier_wave\n");
GPU_DPRINTF("Function: rocshmem_ctx_barrier_wave (ctx=%zd, team=%zd)\n",
ctx.ctx_opaque, team);
get_internal_ctx(ctx)->barrier_wave(team);
}
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_barrier_wg\n");
GPU_DPRINTF("Function: rocshmem_ctx_barrier_wg (ctx=%zd, team=%zd)\n",
ctx.ctx_opaque, team);
get_internal_ctx(ctx)->barrier_wg(team);
}
__device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n");
GPU_DPRINTF("Function: rocshmem_ctx_sync_all (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->sync_all();
}
__device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n");
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->sync_all_wave();
}
__device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n");
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg (ctx=%zd)\n",
ctx.ctx_opaque);
get_internal_ctx(ctx)->sync_all_wg();
}
__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx,
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_sync\n");
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_sync (ctx=%zd, team=%zd)\n",
ctx.ctx_opaque, team);
get_internal_ctx(ctx)->sync_wg(team);
}
__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx,
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n");
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_wave (ctx=%zd, team=%zd)\n",
ctx.ctx_opaque, team);
get_internal_ctx(ctx)->sync_wg(team);
}
__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n");
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_wg (ctx=%zd, team=%zd)\n",
ctx.ctx_opaque, team);
get_internal_ctx(ctx)->sync_wg(team);
}
__device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_n_pes\n");
GPU_DPRINTF("Function: rocshmem_ctx_n_pes (ctx=%zd)\n",
ctx.ctx_opaque);
TeamInfo *tinfo = reinterpret_cast<TeamInfo *>(ctx.team_opaque);
return tinfo->size;
@@ -663,7 +694,8 @@ __device__ int rocshmem_n_pes() {
}
__device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_my_pe\n");
GPU_DPRINTF("Function: rocshmem_ctx_my_pe (ctx=%zd)\n",
ctx.ctx_opaque);
TeamInfo *tinfo = reinterpret_cast<TeamInfo *>(ctx.team_opaque);
int my_pe{get_internal_ctx(ctx)->my_pe};
@@ -689,7 +721,8 @@ __device__ int rocshmem_my_pe() {
template <typename T>
__device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_fetch_add\n");
GPU_DPRINTF("Function: rocshmem_atomic_fetch_add (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_fetch_add<T>(dest, val, pe);
}
@@ -697,21 +730,24 @@ __device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ T rocshmem_atomic_compare_swap(rocshmem_ctx_t ctx, T *dest, T cond,
T val, int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_compare_swap\n");
GPU_DPRINTF("Function: rocshmem_atomic_compare_swap (ctx=%zd, dest=%p, cond=%g, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, cond, (double)val, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_fetch_cas(dest, val, cond, pe);
}
template <typename T>
__device__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, T *dest, int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_fetch_inc\n");
GPU_DPRINTF("Function: rocshmem_atomic_fetch_inc (ctx=%zd, dest=%p, pe=%d w%d)\n",
ctx.ctx_opaque, dest, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_fetch_add<T>(dest, 1, pe);
}
template <typename T>
__device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_fetch\n");
GPU_DPRINTF("Function: rocshmem_atomic_fetch (ctx=%zd, source=%p, pe=%d w%d)\n",
ctx.ctx_opaque, source, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_fetch_add<T>(source, 0, pe);
}
@@ -719,14 +755,16 @@ __device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) {
template <typename T>
__device__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_add\n");
GPU_DPRINTF("Function: rocshmem_atomic_add (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->amo_add<T>(dest, val, pe);
}
template <typename T>
__device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_inc\n");
GPU_DPRINTF("Function: rocshmem_atomic_inc (ctx=%zd, dest=%p, pe=%d w%d)\n",
ctx.ctx_opaque, dest, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->amo_add<T>(dest, 1, pe);
}
@@ -734,7 +772,8 @@ __device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) {
template <typename T>
__device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_set\n");
GPU_DPRINTF("Function: rocshmem_atomic_set (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->amo_set(dest, val, pe);
}
@@ -742,7 +781,8 @@ __device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_swap\n");
GPU_DPRINTF("Function: rocshmem_atomic_swap (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_swap(dest, val, pe);
}
@@ -750,7 +790,8 @@ __device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_fetch_and\n");
GPU_DPRINTF("Function: rocshmem_atomic_fetch_and (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_fetch_and(dest, val, pe);
}
@@ -758,7 +799,8 @@ __device__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_and\n");
GPU_DPRINTF("Function: rocshmem_atomic_and (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->amo_and(dest, val, pe);
}
@@ -766,7 +808,8 @@ __device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_fetch_or\n");
GPU_DPRINTF("Function: rocshmem_atomic_fetch_or (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_fetch_or(dest, val, pe);
}
@@ -774,7 +817,8 @@ __device__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_or\n");
GPU_DPRINTF("Function: rocshmem_atomic_or (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->amo_or(dest, val, pe);
}
@@ -782,7 +826,8 @@ __device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ T rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_fetch_xor\n");
GPU_DPRINTF("Function: rocshmem_atomic_fetch_xor (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
return get_internal_ctx(ctx)->amo_fetch_xor(dest, val, pe);
}
@@ -790,7 +835,8 @@ __device__ T rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val,
template <typename T>
__device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val,
int pe) {
GPU_DPRINTF("Function: rocshmem_atomic_xor\n");
GPU_DPRINTF("Function: rocshmem_atomic_xor (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->amo_xor(dest, val, pe);
}
@@ -801,7 +847,8 @@ __device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val,
__device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wave\n");
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->putmem_wave(dest, source, nelems, pe);
}
@@ -809,7 +856,8 @@ __device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest,
__device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wg\n");
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->putmem_wg(dest, source, nelems, pe);
}
@@ -817,7 +865,8 @@ __device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest,
__device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest,
const void *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wave\n");
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->putmem_nbi_wave(dest, source, nelems, pe);
}
@@ -825,7 +874,8 @@ __device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest,
__device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wg\n");
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->putmem_nbi_wg(dest, source, nelems, pe);
}
@@ -833,7 +883,8 @@ __device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
template <typename T>
__device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest,
const T *source, size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_put_wave\n");
GPU_DPRINTF("Function: rocshmem_put_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->put_wave(dest, source, nelems, pe);
}
@@ -841,7 +892,8 @@ __device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest,
template <typename T>
__device__ void rocshmem_put_wg(rocshmem_ctx_t ctx, T *dest, const T *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_put_wg\n");
GPU_DPRINTF("Function: rocshmem_put_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->put_wg(dest, source, nelems, pe);
}
@@ -850,7 +902,8 @@ template <typename T>
__device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest,
const T *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_put_nbi_wave\n");
GPU_DPRINTF("Function: rocshmem_put_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->put_nbi_wave(dest, source, nelems, pe);
}
@@ -858,7 +911,8 @@ __device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest,
template <typename T>
__device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest,
const T *source, size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_put_nbi_wg\n");
GPU_DPRINTF("Function: rocshmem_put_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->put_nbi_wg(dest, source, nelems, pe);
}
@@ -866,7 +920,8 @@ __device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest,
__device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wg\n");
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->getmem_wg(dest, source, nelems, pe);
}
@@ -874,7 +929,8 @@ __device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest,
__device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wave\n");
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->getmem_wave(dest, source, nelems, pe);
}
@@ -882,7 +938,8 @@ __device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest,
template <typename T>
__device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_get_wg\n");
GPU_DPRINTF("Function: rocshmem_get_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->get_wg(dest, source, nelems, pe);
}
@@ -890,7 +947,8 @@ __device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source,
template <typename T>
__device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest,
const T *source, size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_get_wave\n");
GPU_DPRINTF("Function: rocshmem_get_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->get_wave(dest, source, nelems, pe);
}
@@ -898,7 +956,8 @@ __device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest,
__device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wg\n");
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->getmem_nbi_wg(dest, source, nelems, pe);
}
@@ -906,7 +965,8 @@ __device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
template <typename T>
__device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest,
const T *source, size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_get_nbi_wg\n");
GPU_DPRINTF("Function: rocshmem_get_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->get_nbi_wg(dest, source, nelems, pe);
}
@@ -914,7 +974,8 @@ __device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest,
__device__ void rocshmem_ctx_getmem_nbi_wave(rocshmem_ctx_t ctx, void *dest,
const void *source,
size_t nelems, int pe) {
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wave\n");
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->getmem_nbi_wave(dest, source, nelems, pe);
}
@@ -923,31 +984,38 @@ template <typename T>
__device__ void rocshmem_get_nbi_wave(rocshmem_ctx_t ctx, T *dest,
const T *source, size_t nelems,
int pe) {
GPU_DPRINTF("Function: rocshmem_get_nbi_wave\n");
GPU_DPRINTF("Function: rocshmem_get_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
get_internal_ctx(ctx)->get_nbi_wave(dest, source, nelems, pe);
}
#define ROCSHMEM_CTX_PUTMEM_SIGNAL_DEF(SUFFIX) \
__device__ void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \
#define ROCSHMEM_CTX_PUTMEM_SIGNAL_DEF(SUFFIX) \
__device__ void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \
void *dest, const void *source, \
size_t nelems, \
uint64_t *sig_addr, uint64_t signal, \
int sig_op, \
int pe) { \
GPU_DPRINTF("Function: rocshmem_ctx_putmem_signal##SUFFIX\n"); \
GPU_DPRINTF("Function: rocshmem_ctx_putmem_signal##SUFFIX (ctx=%zd, dest=%p, " \
"source=%p, nelems=%d, sig_addr=%p, signal=%ld, sig_op=%d, pe=%d w%d)\n", \
ctx.ctx_opaque, dest, source, nelems, \
sig_addr, signal, sig_op, pe, translate_pe(ctx, pe)); \
\
get_internal_ctx(ctx)->putmem_signal##SUFFIX(dest, source, nelems, \
sig_addr, signal, sig_op, pe); \
} \
\
template <typename T> \
__device__ void rocshmem_ctx_put_signal##SUFFIX(rocshmem_ctx_t ctx, \
__device__ void rocshmem_ctx_put_signal##SUFFIX(rocshmem_ctx_t ctx, \
T *dest, const T *source, \
size_t nelems, \
uint64_t *sig_addr, uint64_t signal, \
int sig_op, int pe) { \
GPU_DPRINTF("Function: rocshmem_ctx_put_signal##SUFFIX\n"); \
GPU_DPRINTF("Function: rocshmem_ctx_put_signal##SUFFIX (ctx=%zd, dest=%p, " \
"source=%p, nelems=%d, sig_addr=%p, signal=%ld, sig_op=%d, pe=%d w%d)\n", \
ctx.ctx_opaque, dest, source, nelems, \
sig_addr, signal, sig_op, pe, translate_pe(ctx, pe)); \
\
get_internal_ctx(ctx)->put_signal##SUFFIX(dest, source, nelems, \
sig_addr, signal, sig_op, pe); \
+6 -7
Fájl megtekintése
@@ -62,9 +62,9 @@ namespace rocshmem {
#endif
#ifdef DEBUG
#define GPU_DPRINTF(...) \
do { \
gpu_dprintf(__VA_ARGS__); \
#define GPU_DPRINTF(...) \
do { \
gpu_dprintf("WG (%u, %u, %u) TH (%u, %u, %u) " __VA_ARGS__); \
} while (0);
#else
#define GPU_DPRINTF(...) \
@@ -159,10 +159,9 @@ __device__ void gpu_dprintf(const char* fmt, const Args&... args) {
while (atomicCAS(print_lock, 0, 1) == 1) {
}
printf("WG (%u, %u, %u) TH (%u, %u, %u) ", hipBlockIdx_x,
hipBlockIdx_y, hipBlockIdx_z, hipThreadIdx_x, hipThreadIdx_y,
hipThreadIdx_z);
printf(fmt, args...);
printf(fmt, hipBlockIdx_x, hipBlockIdx_y, hipBlockIdx_z,
hipThreadIdx_x, hipThreadIdx_y, hipThreadIdx_z,
args...);
*print_lock = 0;
}