Detailed logs (#124)
* Use a single printf per line (reduce chances of lines being cut in logs)
* team_comm can be an int or a pointer depending on MPI impl.
Received is confusing (since we are on the origin), use submitted
instead
* Print arguments to calls when using DEBUG
---------
Signed-off-by: Aurelien Bouteiller <abouteil@amd.com>
[ROCm/rocshmem commit: 3600291558]
This commit is contained in:
committed by
GitHub
szülő
17cde51fb7
commit
03a9fac960
@@ -97,7 +97,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
putMem(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.PE, next_element.ro_net_win_id, queue_idx,
|
||||
next_element.status, true);
|
||||
DPRINTF("Received PUT dst %p src %p size %lu pe %d win_id %d\n",
|
||||
DPRINTF("Submitted PUT dst %p src %p size %lu pe %d win_id %d\n",
|
||||
next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.PE, next_element.ro_net_win_id);
|
||||
break;
|
||||
@@ -112,7 +112,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
putMem(next_element.dst, source_buffer, next_element.ol1.size,
|
||||
next_element.PE, next_element.ro_net_win_id, queue_idx,
|
||||
next_element.status, true, true);
|
||||
DPRINTF("Received P dst %p value %p pe %d\n", next_element.dst,
|
||||
DPRINTF("Submitted P dst %p value %p pe %d\n", next_element.dst,
|
||||
next_element.src, next_element.PE);
|
||||
break;
|
||||
}
|
||||
@@ -120,14 +120,14 @@ void MPITransport::submitRequestsToMPI() {
|
||||
getMem(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.PE, next_element.ro_net_win_id, queue_idx,
|
||||
next_element.status, true);
|
||||
DPRINTF("Received GET dst %p src %p size %lu pe %d\n", next_element.dst,
|
||||
DPRINTF("Submitted GET dst %p src %p size %lu pe %d\n", next_element.dst,
|
||||
next_element.src, next_element.ol1.size, next_element.PE);
|
||||
break;
|
||||
case RO_NET_PUT_NBI:
|
||||
putMem(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.PE, next_element.ro_net_win_id, queue_idx,
|
||||
next_element.status, false);
|
||||
DPRINTF("Received PUT NBI dst %p src %p size %lu pe %d\n",
|
||||
DPRINTF("Submitted PUT NBI dst %p src %p size %lu pe %d\n",
|
||||
next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.PE);
|
||||
break;
|
||||
@@ -135,7 +135,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
getMem(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.PE, next_element.ro_net_win_id, queue_idx,
|
||||
next_element.status, false);
|
||||
DPRINTF("Received GET NBI dst %p src %p size %lu pe %d\n",
|
||||
DPRINTF("Submitted GET NBI dst %p src %p size %lu pe %d\n",
|
||||
next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.PE);
|
||||
break;
|
||||
@@ -146,7 +146,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
next_element.status, true,
|
||||
static_cast<ROCSHMEM_OP>(next_element.op),
|
||||
static_cast<ro_net_types>(next_element.datatype));
|
||||
DPRINTF("Received AMO dst %p src %p Val %llu pe %d\n", next_element.dst,
|
||||
DPRINTF("Submitted AMO dst %p src %p Val %llu pe %d\n", next_element.dst,
|
||||
next_element.src, next_element.ol1.atomic_value, next_element.PE);
|
||||
break;
|
||||
case RO_NET_AMO_FCAS:
|
||||
@@ -156,7 +156,7 @@ void MPITransport::submitRequestsToMPI() {
|
||||
next_element.status, true,
|
||||
const_cast<void **>(&next_element.ol2.pWrk),
|
||||
static_cast<ro_net_types>(next_element.datatype));
|
||||
DPRINTF("Received F_CSWAP dst %p src %p Val %llu pe %d cond %ld\n",
|
||||
DPRINTF("Submitted F_CSWAP dst %p src %p Val %llu pe %d cond %ld\n",
|
||||
next_element.dst, next_element.src, next_element.ol1.atomic_value,
|
||||
next_element.PE,
|
||||
reinterpret_cast<int64_t>(next_element.ol2.pWrk));
|
||||
@@ -168,9 +168,9 @@ void MPITransport::submitRequestsToMPI() {
|
||||
static_cast<ROCSHMEM_OP>(next_element.op),
|
||||
static_cast<ro_net_types>(next_element.datatype),
|
||||
next_element.status, true);
|
||||
DPRINTF("Received FLOAT_SUM_TEAM_REDUCE dst %p src %p size %lu team %d\n",
|
||||
DPRINTF("Submitted FLOAT_SUM_TEAM_REDUCE dst %p src %p size %lu team %zd\n",
|
||||
next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.team_comm);
|
||||
(intptr_t)next_element.team_comm);
|
||||
break;
|
||||
case RO_NET_TEAM_BROADCAST:
|
||||
team_broadcast(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
@@ -179,10 +179,10 @@ void MPITransport::submitRequestsToMPI() {
|
||||
static_cast<ro_net_types>(next_element.datatype),
|
||||
next_element.status, true);
|
||||
DPRINTF(
|
||||
"Received TEAM_BROADCAST dst %p src %p size %lu "
|
||||
"team %d, PE_root %d \n",
|
||||
"Submitted TEAM_BROADCAST dst %p src %p size %lu "
|
||||
"team %zd, PE_root %d \n",
|
||||
next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.team_comm, next_element.PE_root);
|
||||
(intptr_t)next_element.team_comm, next_element.PE_root);
|
||||
break;
|
||||
case RO_NET_ALLTOALL:
|
||||
alltoall(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
@@ -190,9 +190,9 @@ void MPITransport::submitRequestsToMPI() {
|
||||
next_element.ol2.pWrk,
|
||||
static_cast<ro_net_types>(next_element.datatype),
|
||||
next_element.status, true);
|
||||
DPRINTF("Received ALLTOALL dst %p src %p size %lu team %d\n",
|
||||
DPRINTF("Submitted ALLTOALL dst %p src %p size %lu team %zd\n",
|
||||
next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.team_comm);
|
||||
(intptr_t)next_element.team_comm);
|
||||
break;
|
||||
case RO_NET_FCOLLECT:
|
||||
fcollect(next_element.dst, next_element.src, next_element.ol1.size,
|
||||
@@ -200,30 +200,30 @@ void MPITransport::submitRequestsToMPI() {
|
||||
next_element.ol2.pWrk,
|
||||
static_cast<ro_net_types>(next_element.datatype),
|
||||
next_element.status, true);
|
||||
DPRINTF("Received FCOLLECT dst %p src %p size %lu team %d\n",
|
||||
DPRINTF("Submitted FCOLLECT dst %p src %p size %lu team %zd\n",
|
||||
next_element.dst, next_element.src, next_element.ol1.size,
|
||||
next_element.team_comm);
|
||||
(intptr_t)next_element.team_comm);
|
||||
break;
|
||||
case RO_NET_BARRIER:
|
||||
barrier(queue_idx, next_element.status, true,
|
||||
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
|
||||
true);
|
||||
DPRINTF("Received Barrier_all\n");
|
||||
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
|
||||
true);
|
||||
DPRINTF("Submitted Barrier_all\n");
|
||||
break;
|
||||
case RO_NET_SYNC:
|
||||
barrier(queue_idx, next_element.status, true,
|
||||
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
|
||||
false);
|
||||
DPRINTF("Received Sync\n");
|
||||
next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm,
|
||||
false);
|
||||
DPRINTF("Submitted Sync\n");
|
||||
break;
|
||||
case RO_NET_FENCE:
|
||||
case RO_NET_QUIET:
|
||||
quiet(queue_idx, next_element.status);
|
||||
DPRINTF("Received FENCE/QUIET\n");
|
||||
DPRINTF("Submitted FENCE/QUIET\n");
|
||||
break;
|
||||
case RO_NET_FINALIZE:
|
||||
quiet(queue_idx, next_element.status);
|
||||
DPRINTF("Received Finalize\n");
|
||||
DPRINTF("Submitted Finalize\n");
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Invalid GPU Packet received, exiting....\n");
|
||||
|
||||
@@ -283,14 +283,14 @@ __device__ Context *get_internal_ctx(rocshmem_ctx_t ctx) {
|
||||
return reinterpret_cast<Context *>(ctx.ctx_opaque);
|
||||
}
|
||||
|
||||
__device__ int rocshmem_wg_ctx_create(long option, rocshmem_ctx_t *ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_create\n");
|
||||
__device__ int rocshmem_wg_ctx_create(long options, rocshmem_ctx_t *ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_wg_ctx_create (options=%ld)\n", options);
|
||||
bool result{true};
|
||||
if (get_flat_block_id() == 0) {
|
||||
ctx->team_opaque = reinterpret_cast<TeamInfo *>(ROCSHMEM_CTX_DEFAULT.team_opaque);
|
||||
result = device_backend_proxy->create_ctx(option, ctx);
|
||||
result = device_backend_proxy->create_ctx(options, ctx);
|
||||
if(result) {
|
||||
reinterpret_cast<Context *>(ctx->ctx_opaque)->setFence(option);
|
||||
reinterpret_cast<Context *>(ctx->ctx_opaque)->setFence(options);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -298,8 +298,9 @@ __device__ int rocshmem_wg_ctx_create(long option, rocshmem_ctx_t *ctx) {
|
||||
}
|
||||
|
||||
__device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options,
|
||||
rocshmem_ctx_t *ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_team_create_ctx\n");
|
||||
rocshmem_ctx_t *ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_wg_team_create_ctx (team=%zd, options=%ld)\n",
|
||||
(intptr_t)team, options);
|
||||
if (team == ROCSHMEM_TEAM_INVALID) {
|
||||
return -1;
|
||||
}
|
||||
@@ -321,7 +322,8 @@ __device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options,
|
||||
|
||||
__device__ void rocshmem_wg_ctx_destroy(
|
||||
[[maybe_unused]] rocshmem_ctx_t *ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_destroy\n");
|
||||
GPU_DPRINTF("Function: rocshmem_wg_ctx_destroy (ctx=%zd)\n",
|
||||
ctx->ctx_opaque);
|
||||
|
||||
if (get_flat_block_id() == 0) {
|
||||
device_backend_proxy->destroy_ctx(ctx);
|
||||
@@ -329,7 +331,8 @@ __device__ void rocshmem_wg_ctx_destroy(
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_threadfence_system\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_threadfence_system (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->threadfence_system();
|
||||
}
|
||||
@@ -337,9 +340,9 @@ __device__ void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx) {
|
||||
__device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->putmem(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
@@ -347,27 +350,27 @@ __device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_put(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_put\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_put (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->put(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void rocshmem_p(rocshmem_ctx_t ctx, T *dest, T value, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_p\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_p (ctx=%zd, dest=%p, value=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, value, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->p(dest, value, pe_in_world);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_g\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_g (ctx=%zd, source=%p, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, source, pe, pe_in_world);
|
||||
|
||||
return get_internal_ctx(ctx)->g(source, pe_in_world);
|
||||
}
|
||||
@@ -375,9 +378,9 @@ __device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) {
|
||||
__device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->getmem(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
@@ -385,9 +388,9 @@ __device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_get\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_get (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->get(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
@@ -395,9 +398,9 @@ __device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
__device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->putmem_nbi(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
@@ -405,9 +408,9 @@ __device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_put_nbi\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_put_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->put_nbi(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
@@ -415,9 +418,9 @@ __device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
__device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->getmem_nbi(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
@@ -425,35 +428,38 @@ __device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_get_nbi(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_get_nbi\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_get_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->get_nbi(dest, source, nelems, pe_in_world);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_fence\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_fence (ctx=%zd)\n", ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->fence();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_fence\n");
|
||||
|
||||
int pe_in_world = translate_pe(ctx, pe);
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_fence (ctx=%zd, pe=%d w%d))\n",
|
||||
ctx.ctx_opaque, pe, pe_in_world);
|
||||
|
||||
get_internal_ctx(ctx)->fence(pe_in_world);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_quiet\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_quiet (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->quiet();
|
||||
}
|
||||
|
||||
__device__ void *rocshmem_ptr(const void *dest, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ptr\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ptr (dest=%p, pe=%d w%d\n",
|
||||
dest, pe, pe);
|
||||
|
||||
return get_internal_ctx(ROCSHMEM_CTX_DEFAULT)->shmem_ptr(dest, pe);
|
||||
}
|
||||
@@ -461,7 +467,8 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) {
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team,
|
||||
T *dest, const T *source, int nreduce) {
|
||||
GPU_DPRINTF("Function: rocshmem_reduce\n");
|
||||
GPU_DPRINTF("Function: rocshmem_reduce_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nreduce=%d\n",
|
||||
ctx.ctx_opaque, team, dest, source, nreduce);
|
||||
|
||||
return get_internal_ctx(ctx)->reduce<T, Op>(team, dest, source, nreduce);
|
||||
}
|
||||
@@ -471,7 +478,8 @@ __device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelem,
|
||||
int pe_root) {
|
||||
GPU_DPRINTF("Function: Team-based rocshmem_broadcast\n");
|
||||
GPU_DPRINTF("Function: Team-based rocshmem_broadcast_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d, root=%d)\n",
|
||||
ctx.ctx_opaque, team, dest, source, nelem, pe_root);
|
||||
|
||||
get_internal_ctx(ctx)->broadcast<T>(team, dest, source, nelem, pe_root);
|
||||
}
|
||||
@@ -480,7 +488,8 @@ template <typename T>
|
||||
__device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelem) {
|
||||
GPU_DPRINTF("Function: rocshmem_alltoall\n");
|
||||
GPU_DPRINTF("Function: rocshmem_alltoall_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d\n",
|
||||
ctx.ctx_opaque, team, dest, source, nelem);
|
||||
|
||||
get_internal_ctx(ctx)->alltoall<T>(team, dest, source, nelem);
|
||||
}
|
||||
@@ -489,14 +498,16 @@ template <typename T>
|
||||
__device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelem) {
|
||||
GPU_DPRINTF("Function: rocshmem_fcollect\n");
|
||||
GPU_DPRINTF("Function: rocshmem_fcollect_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d\n",
|
||||
ctx.ctx_opaque, team, dest, source, nelem);
|
||||
|
||||
get_internal_ctx(ctx)->fcollect<T>(team, dest, source, nelem);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void rocshmem_wait_until(T *ivars, int cmp, T val) {
|
||||
GPU_DPRINTF("Function: rocshmem_wait_until\n");
|
||||
GPU_DPRINTF("Function: rocshmem_wait_until (ivars=%p, cmp=%d, val=%g)\n",
|
||||
ivars, cmp, (double)val);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL);
|
||||
@@ -506,7 +517,8 @@ __device__ void rocshmem_wait_until(T *ivars, int cmp, T val) {
|
||||
template <typename T>
|
||||
__device__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* status,
|
||||
int cmp, T val) {
|
||||
GPU_DPRINTF("Function: rocshmem_wait_until_all\n");
|
||||
GPU_DPRINTF("Function: rocshmem_wait_until_all (ivars=%p, nelems=%zd cmp=%d, val=%g)\n",
|
||||
ivars, nelems, cmp, (double)val);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL);
|
||||
@@ -516,7 +528,8 @@ __device__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* stat
|
||||
template <typename T>
|
||||
__device__ size_t rocshmem_wait_until_any(T *ivars, size_t nelems, const int* status,
|
||||
int cmp, T val) {
|
||||
GPU_DPRINTF("Function: rocshmem_wait_until_any\n");
|
||||
GPU_DPRINTF("Function: rocshmem_wait_until_any (ivars=%p, nelems=%zd cmp=%d, val=%g)\n",
|
||||
ivars, nelems, cmp, (double)val);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY);
|
||||
@@ -527,7 +540,8 @@ template <typename T>
|
||||
__device__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indices,
|
||||
const int* status, int cmp,
|
||||
T val) {
|
||||
DPRINTF("Function: rocshmem_wait_until_some\n");
|
||||
DPRINTF("Function: rocshmem_wait_until_some (ivars=%p, nelems=%zd cmp=%d, val=%g)\n",
|
||||
ivars, nelems, cmp, (double)val);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME);
|
||||
@@ -537,7 +551,8 @@ __device__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indi
|
||||
template <typename T>
|
||||
__device__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const int* status,
|
||||
int cmp, T* vals) {
|
||||
DPRINTF("Function: rocshmem_wait_until_any_vector\n");
|
||||
DPRINTF("Function: rocshmem_wait_until_any_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n",
|
||||
ivars, nelems, cmp, vals);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY_VECTOR);
|
||||
@@ -547,7 +562,8 @@ __device__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const
|
||||
template <typename T>
|
||||
__device__ void rocshmem_wait_until_all_vector(T *ivars, size_t nelems, const int* status,
|
||||
int cmp, T* vals) {
|
||||
DPRINTF("Function: rocshmem_wait_until_all_vector\n");
|
||||
DPRINTF("Function: rocshmem_wait_until_all_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n",
|
||||
ivars, nelems, cmp, vals);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL_VECTOR);
|
||||
@@ -559,7 +575,8 @@ __device__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems,
|
||||
size_t* indices,
|
||||
const int* status,
|
||||
int cmp, T* vals) {
|
||||
DPRINTF("Function: rocshmem_wait_until_some_vector\n");
|
||||
DPRINTF("Function: rocshmem_wait_until_some_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n",
|
||||
ivars, nelems, cmp, vals);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME_VECTOR);
|
||||
@@ -568,7 +585,8 @@ __device__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems,
|
||||
|
||||
template <typename T>
|
||||
__device__ int rocshmem_test(T *ivars, int cmp, T val) {
|
||||
GPU_DPRINTF("Function: rocshmem_testl\n");
|
||||
GPU_DPRINTF("Function: rocshmem_test (ivars=%p, cmp=%d, val=%g)\n",
|
||||
ivars, cmp, (double)val);
|
||||
|
||||
Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT);
|
||||
ctx_internal->ctxStats.incStat(NUM_TEST);
|
||||
@@ -577,82 +595,95 @@ __device__ int rocshmem_test(T *ivars, int cmp, T val) {
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->barrier_all();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->barrier_all_wave();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->barrier_all_wg();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_barrier\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier (ctx=%zd, team=%zd)\n",
|
||||
ctx.ctx_opaque, team);
|
||||
|
||||
get_internal_ctx(ctx)->barrier(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_barrier_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_wave (ctx=%zd, team=%zd)\n",
|
||||
ctx.ctx_opaque, team);
|
||||
|
||||
get_internal_ctx(ctx)->barrier_wave(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_barrier_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_wg (ctx=%zd, team=%zd)\n",
|
||||
ctx.ctx_opaque, team);
|
||||
|
||||
get_internal_ctx(ctx)->barrier_wg(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->sync_all();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->sync_all_wave();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
get_internal_ctx(ctx)->sync_all_wg();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync\n");
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync (ctx=%zd, team=%zd)\n",
|
||||
ctx.ctx_opaque, team);
|
||||
|
||||
get_internal_ctx(ctx)->sync_wg(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n");
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_wave (ctx=%zd, team=%zd)\n",
|
||||
ctx.ctx_opaque, team);
|
||||
|
||||
get_internal_ctx(ctx)->sync_wg(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n");
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_wg (ctx=%zd, team=%zd)\n",
|
||||
ctx.ctx_opaque, team);
|
||||
|
||||
get_internal_ctx(ctx)->sync_wg(team);
|
||||
}
|
||||
|
||||
__device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_n_pes\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_n_pes (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
TeamInfo *tinfo = reinterpret_cast<TeamInfo *>(ctx.team_opaque);
|
||||
return tinfo->size;
|
||||
@@ -663,7 +694,8 @@ __device__ int rocshmem_n_pes() {
|
||||
}
|
||||
|
||||
__device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_my_pe\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_my_pe (ctx=%zd)\n",
|
||||
ctx.ctx_opaque);
|
||||
|
||||
TeamInfo *tinfo = reinterpret_cast<TeamInfo *>(ctx.team_opaque);
|
||||
int my_pe{get_internal_ctx(ctx)->my_pe};
|
||||
@@ -689,7 +721,8 @@ __device__ int rocshmem_my_pe() {
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_add\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_add (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_fetch_add<T>(dest, val, pe);
|
||||
}
|
||||
@@ -697,21 +730,24 @@ __device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_compare_swap(rocshmem_ctx_t ctx, T *dest, T cond,
|
||||
T val, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_compare_swap\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_compare_swap (ctx=%zd, dest=%p, cond=%g, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, cond, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_fetch_cas(dest, val, cond, pe);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, T *dest, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_inc\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_inc (ctx=%zd, dest=%p, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_fetch_add<T>(dest, 1, pe);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch (ctx=%zd, source=%p, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, source, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_fetch_add<T>(source, 0, pe);
|
||||
}
|
||||
@@ -719,14 +755,16 @@ __device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) {
|
||||
template <typename T>
|
||||
__device__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_add\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_add (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->amo_add<T>(dest, val, pe);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_inc\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_inc (ctx=%zd, dest=%p, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->amo_add<T>(dest, 1, pe);
|
||||
}
|
||||
@@ -734,7 +772,8 @@ __device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) {
|
||||
template <typename T>
|
||||
__device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_set\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_set (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->amo_set(dest, val, pe);
|
||||
}
|
||||
@@ -742,7 +781,8 @@ __device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_swap\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_swap (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_swap(dest, val, pe);
|
||||
}
|
||||
@@ -750,7 +790,8 @@ __device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_and\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_and (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_fetch_and(dest, val, pe);
|
||||
}
|
||||
@@ -758,7 +799,8 @@ __device__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_and\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_and (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->amo_and(dest, val, pe);
|
||||
}
|
||||
@@ -766,7 +808,8 @@ __device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_or\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_or (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_fetch_or(dest, val, pe);
|
||||
}
|
||||
@@ -774,7 +817,8 @@ __device__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_or\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_or (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->amo_or(dest, val, pe);
|
||||
}
|
||||
@@ -782,7 +826,8 @@ __device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ T rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_xor\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_fetch_xor (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
return get_internal_ctx(ctx)->amo_fetch_xor(dest, val, pe);
|
||||
}
|
||||
@@ -790,7 +835,8 @@ __device__ T rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_xor\n");
|
||||
GPU_DPRINTF("Function: rocshmem_atomic_xor (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->amo_xor(dest, val, pe);
|
||||
}
|
||||
@@ -801,7 +847,8 @@ __device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val,
|
||||
__device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->putmem_wave(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -809,7 +856,8 @@ __device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest,
|
||||
__device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->putmem_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -817,7 +865,8 @@ __device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
__device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->putmem_nbi_wave(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -825,7 +874,8 @@ __device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest,
|
||||
__device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->putmem_nbi_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -833,7 +883,8 @@ __device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest,
|
||||
const T *source, size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_put_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_put_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->put_wave(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -841,7 +892,8 @@ __device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_put_wg(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_put_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_put_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->put_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -850,7 +902,8 @@ template <typename T>
|
||||
__device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest,
|
||||
const T *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_put_nbi_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_put_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->put_nbi_wave(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -858,7 +911,8 @@ __device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest,
|
||||
const T *source, size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_put_nbi_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_put_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->put_nbi_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -866,7 +920,8 @@ __device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest,
|
||||
__device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->getmem_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -874,7 +929,8 @@ __device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
__device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->getmem_wave(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -882,7 +938,8 @@ __device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_get_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_get_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->get_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -890,7 +947,8 @@ __device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest,
|
||||
const T *source, size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_get_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_get_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->get_wave(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -898,7 +956,8 @@ __device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest,
|
||||
__device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->getmem_nbi_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -906,7 +965,8 @@ __device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest,
|
||||
template <typename T>
|
||||
__device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest,
|
||||
const T *source, size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_get_nbi_wg\n");
|
||||
GPU_DPRINTF("Function: rocshmem_get_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->get_nbi_wg(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -914,7 +974,8 @@ __device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest,
|
||||
__device__ void rocshmem_ctx_getmem_nbi_wave(rocshmem_ctx_t ctx, void *dest,
|
||||
const void *source,
|
||||
size_t nelems, int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->getmem_nbi_wave(dest, source, nelems, pe);
|
||||
}
|
||||
@@ -923,31 +984,38 @@ template <typename T>
|
||||
__device__ void rocshmem_get_nbi_wave(rocshmem_ctx_t ctx, T *dest,
|
||||
const T *source, size_t nelems,
|
||||
int pe) {
|
||||
GPU_DPRINTF("Function: rocshmem_get_nbi_wave\n");
|
||||
GPU_DPRINTF("Function: rocshmem_get_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n",
|
||||
ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe));
|
||||
|
||||
get_internal_ctx(ctx)->get_nbi_wave(dest, source, nelems, pe);
|
||||
}
|
||||
|
||||
#define ROCSHMEM_CTX_PUTMEM_SIGNAL_DEF(SUFFIX) \
|
||||
__device__ void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \
|
||||
#define ROCSHMEM_CTX_PUTMEM_SIGNAL_DEF(SUFFIX) \
|
||||
__device__ void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \
|
||||
void *dest, const void *source, \
|
||||
size_t nelems, \
|
||||
uint64_t *sig_addr, uint64_t signal, \
|
||||
int sig_op, \
|
||||
int pe) { \
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_signal##SUFFIX\n"); \
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_putmem_signal##SUFFIX (ctx=%zd, dest=%p, " \
|
||||
"source=%p, nelems=%d, sig_addr=%p, signal=%ld, sig_op=%d, pe=%d w%d)\n", \
|
||||
ctx.ctx_opaque, dest, source, nelems, \
|
||||
sig_addr, signal, sig_op, pe, translate_pe(ctx, pe)); \
|
||||
\
|
||||
get_internal_ctx(ctx)->putmem_signal##SUFFIX(dest, source, nelems, \
|
||||
sig_addr, signal, sig_op, pe); \
|
||||
} \
|
||||
\
|
||||
template <typename T> \
|
||||
__device__ void rocshmem_ctx_put_signal##SUFFIX(rocshmem_ctx_t ctx, \
|
||||
__device__ void rocshmem_ctx_put_signal##SUFFIX(rocshmem_ctx_t ctx, \
|
||||
T *dest, const T *source, \
|
||||
size_t nelems, \
|
||||
uint64_t *sig_addr, uint64_t signal, \
|
||||
int sig_op, int pe) { \
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_put_signal##SUFFIX\n"); \
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_put_signal##SUFFIX (ctx=%zd, dest=%p, " \
|
||||
"source=%p, nelems=%d, sig_addr=%p, signal=%ld, sig_op=%d, pe=%d w%d)\n", \
|
||||
ctx.ctx_opaque, dest, source, nelems, \
|
||||
sig_addr, signal, sig_op, pe, translate_pe(ctx, pe)); \
|
||||
\
|
||||
get_internal_ctx(ctx)->put_signal##SUFFIX(dest, source, nelems, \
|
||||
sig_addr, signal, sig_op, pe); \
|
||||
|
||||
@@ -62,9 +62,9 @@ namespace rocshmem {
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
#define GPU_DPRINTF(...) \
|
||||
do { \
|
||||
gpu_dprintf(__VA_ARGS__); \
|
||||
#define GPU_DPRINTF(...) \
|
||||
do { \
|
||||
gpu_dprintf("WG (%u, %u, %u) TH (%u, %u, %u) " __VA_ARGS__); \
|
||||
} while (0);
|
||||
#else
|
||||
#define GPU_DPRINTF(...) \
|
||||
@@ -159,10 +159,9 @@ __device__ void gpu_dprintf(const char* fmt, const Args&... args) {
|
||||
while (atomicCAS(print_lock, 0, 1) == 1) {
|
||||
}
|
||||
|
||||
printf("WG (%u, %u, %u) TH (%u, %u, %u) ", hipBlockIdx_x,
|
||||
hipBlockIdx_y, hipBlockIdx_z, hipThreadIdx_x, hipThreadIdx_y,
|
||||
hipThreadIdx_z);
|
||||
printf(fmt, args...);
|
||||
printf(fmt, hipBlockIdx_x, hipBlockIdx_y, hipBlockIdx_z,
|
||||
hipThreadIdx_x, hipThreadIdx_y, hipThreadIdx_z,
|
||||
args...);
|
||||
|
||||
*print_lock = 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user