From 03a9fac960d76658096a5911f09a0ba02dc92e9c Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 15 May 2025 10:37:12 -0400 Subject: [PATCH] Detailed logs (#124) * Use a single printf per line (reduce chances of lines being cut in logs) * team_comm can be an int or a pointer depending on MPI impl. Received is confusing (since we are on the origin), use submitted instead * Print arguments to calls when using DEBUG --------- Signed-off-by: Aurelien Bouteiller [ROCm/rocshmem commit: 3600291558d8be239926f0400d052c9d88d65de5] --- .../src/reverse_offload/mpi_transport.cpp | 48 ++-- projects/rocshmem/src/rocshmem_gpu.cpp | 260 +++++++++++------- projects/rocshmem/src/util.hpp | 13 +- 3 files changed, 194 insertions(+), 127 deletions(-) diff --git a/projects/rocshmem/src/reverse_offload/mpi_transport.cpp b/projects/rocshmem/src/reverse_offload/mpi_transport.cpp index c1ff1aea18..87a9b58b1d 100644 --- a/projects/rocshmem/src/reverse_offload/mpi_transport.cpp +++ b/projects/rocshmem/src/reverse_offload/mpi_transport.cpp @@ -97,7 +97,7 @@ void MPITransport::submitRequestsToMPI() { putMem(next_element.dst, next_element.src, next_element.ol1.size, next_element.PE, next_element.ro_net_win_id, queue_idx, next_element.status, true); - DPRINTF("Received PUT dst %p src %p size %lu pe %d win_id %d\n", + DPRINTF("Submitted PUT dst %p src %p size %lu pe %d win_id %d\n", next_element.dst, next_element.src, next_element.ol1.size, next_element.PE, next_element.ro_net_win_id); break; @@ -112,7 +112,7 @@ void MPITransport::submitRequestsToMPI() { putMem(next_element.dst, source_buffer, next_element.ol1.size, next_element.PE, next_element.ro_net_win_id, queue_idx, next_element.status, true, true); - DPRINTF("Received P dst %p value %p pe %d\n", next_element.dst, + DPRINTF("Submitted P dst %p value %p pe %d\n", next_element.dst, next_element.src, next_element.PE); break; } @@ -120,14 +120,14 @@ void MPITransport::submitRequestsToMPI() { getMem(next_element.dst, next_element.src, next_element.ol1.size, next_element.PE, next_element.ro_net_win_id, queue_idx, next_element.status, true); - DPRINTF("Received GET dst %p src %p size %lu pe %d\n", next_element.dst, + DPRINTF("Submitted GET dst %p src %p size %lu pe %d\n", next_element.dst, next_element.src, next_element.ol1.size, next_element.PE); break; case RO_NET_PUT_NBI: putMem(next_element.dst, next_element.src, next_element.ol1.size, next_element.PE, next_element.ro_net_win_id, queue_idx, next_element.status, false); - DPRINTF("Received PUT NBI dst %p src %p size %lu pe %d\n", + DPRINTF("Submitted PUT NBI dst %p src %p size %lu pe %d\n", next_element.dst, next_element.src, next_element.ol1.size, next_element.PE); break; @@ -135,7 +135,7 @@ void MPITransport::submitRequestsToMPI() { getMem(next_element.dst, next_element.src, next_element.ol1.size, next_element.PE, next_element.ro_net_win_id, queue_idx, next_element.status, false); - DPRINTF("Received GET NBI dst %p src %p size %lu pe %d\n", + DPRINTF("Submitted GET NBI dst %p src %p size %lu pe %d\n", next_element.dst, next_element.src, next_element.ol1.size, next_element.PE); break; @@ -146,7 +146,7 @@ void MPITransport::submitRequestsToMPI() { next_element.status, true, static_cast(next_element.op), static_cast(next_element.datatype)); - DPRINTF("Received AMO dst %p src %p Val %llu pe %d\n", next_element.dst, + DPRINTF("Submitted AMO dst %p src %p Val %llu pe %d\n", next_element.dst, next_element.src, next_element.ol1.atomic_value, next_element.PE); break; case RO_NET_AMO_FCAS: @@ -156,7 +156,7 @@ void MPITransport::submitRequestsToMPI() { next_element.status, true, const_cast(&next_element.ol2.pWrk), static_cast(next_element.datatype)); - DPRINTF("Received F_CSWAP dst %p src %p Val %llu pe %d cond %ld\n", + DPRINTF("Submitted F_CSWAP dst %p src %p Val %llu pe %d cond %ld\n", next_element.dst, next_element.src, next_element.ol1.atomic_value, next_element.PE, reinterpret_cast(next_element.ol2.pWrk)); @@ -168,9 +168,9 @@ void MPITransport::submitRequestsToMPI() { static_cast(next_element.op), static_cast(next_element.datatype), next_element.status, true); - DPRINTF("Received FLOAT_SUM_TEAM_REDUCE dst %p src %p size %lu team %d\n", + DPRINTF("Submitted FLOAT_SUM_TEAM_REDUCE dst %p src %p size %lu team %zd\n", next_element.dst, next_element.src, next_element.ol1.size, - next_element.team_comm); + (intptr_t)next_element.team_comm); break; case RO_NET_TEAM_BROADCAST: team_broadcast(next_element.dst, next_element.src, next_element.ol1.size, @@ -179,10 +179,10 @@ void MPITransport::submitRequestsToMPI() { static_cast(next_element.datatype), next_element.status, true); DPRINTF( - "Received TEAM_BROADCAST dst %p src %p size %lu " - "team %d, PE_root %d \n", + "Submitted TEAM_BROADCAST dst %p src %p size %lu " + "team %zd, PE_root %d \n", next_element.dst, next_element.src, next_element.ol1.size, - next_element.team_comm, next_element.PE_root); + (intptr_t)next_element.team_comm, next_element.PE_root); break; case RO_NET_ALLTOALL: alltoall(next_element.dst, next_element.src, next_element.ol1.size, @@ -190,9 +190,9 @@ void MPITransport::submitRequestsToMPI() { next_element.ol2.pWrk, static_cast(next_element.datatype), next_element.status, true); - DPRINTF("Received ALLTOALL dst %p src %p size %lu team %d\n", + DPRINTF("Submitted ALLTOALL dst %p src %p size %lu team %zd\n", next_element.dst, next_element.src, next_element.ol1.size, - next_element.team_comm); + (intptr_t)next_element.team_comm); break; case RO_NET_FCOLLECT: fcollect(next_element.dst, next_element.src, next_element.ol1.size, @@ -200,30 +200,30 @@ void MPITransport::submitRequestsToMPI() { next_element.ol2.pWrk, static_cast(next_element.datatype), next_element.status, true); - DPRINTF("Received FCOLLECT dst %p src %p size %lu team %d\n", + DPRINTF("Submitted FCOLLECT dst %p src %p size %lu team %zd\n", next_element.dst, next_element.src, next_element.ol1.size, - next_element.team_comm); + (intptr_t)next_element.team_comm); break; case RO_NET_BARRIER: barrier(queue_idx, next_element.status, true, - next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm, - true); - DPRINTF("Received Barrier_all\n"); + next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm, + true); + DPRINTF("Submitted Barrier_all\n"); break; case RO_NET_SYNC: barrier(queue_idx, next_element.status, true, - next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm, - false); - DPRINTF("Received Sync\n"); + next_element.team_comm == NULL ? ro_net_comm_world : next_element.team_comm, + false); + DPRINTF("Submitted Sync\n"); break; case RO_NET_FENCE: case RO_NET_QUIET: quiet(queue_idx, next_element.status); - DPRINTF("Received FENCE/QUIET\n"); + DPRINTF("Submitted FENCE/QUIET\n"); break; case RO_NET_FINALIZE: quiet(queue_idx, next_element.status); - DPRINTF("Received Finalize\n"); + DPRINTF("Submitted Finalize\n"); break; default: fprintf(stderr, "Invalid GPU Packet received, exiting....\n"); diff --git a/projects/rocshmem/src/rocshmem_gpu.cpp b/projects/rocshmem/src/rocshmem_gpu.cpp index 0809c42096..787de724c3 100644 --- a/projects/rocshmem/src/rocshmem_gpu.cpp +++ b/projects/rocshmem/src/rocshmem_gpu.cpp @@ -283,14 +283,14 @@ __device__ Context *get_internal_ctx(rocshmem_ctx_t ctx) { return reinterpret_cast(ctx.ctx_opaque); } -__device__ int rocshmem_wg_ctx_create(long option, rocshmem_ctx_t *ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_create\n"); +__device__ int rocshmem_wg_ctx_create(long options, rocshmem_ctx_t *ctx) { + GPU_DPRINTF("Function: rocshmem_wg_ctx_create (options=%ld)\n", options); bool result{true}; if (get_flat_block_id() == 0) { ctx->team_opaque = reinterpret_cast(ROCSHMEM_CTX_DEFAULT.team_opaque); - result = device_backend_proxy->create_ctx(option, ctx); + result = device_backend_proxy->create_ctx(options, ctx); if(result) { - reinterpret_cast(ctx->ctx_opaque)->setFence(option); + reinterpret_cast(ctx->ctx_opaque)->setFence(options); } } __syncthreads(); @@ -298,8 +298,9 @@ __device__ int rocshmem_wg_ctx_create(long option, rocshmem_ctx_t *ctx) { } __device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options, - rocshmem_ctx_t *ctx) { - GPU_DPRINTF("Function: rocshmem_team_create_ctx\n"); + rocshmem_ctx_t *ctx) { + GPU_DPRINTF("Function: rocshmem_wg_team_create_ctx (team=%zd, options=%ld)\n", + (intptr_t)team, options); if (team == ROCSHMEM_TEAM_INVALID) { return -1; } @@ -321,7 +322,8 @@ __device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options, __device__ void rocshmem_wg_ctx_destroy( [[maybe_unused]] rocshmem_ctx_t *ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_destroy\n"); + GPU_DPRINTF("Function: rocshmem_wg_ctx_destroy (ctx=%zd)\n", + ctx->ctx_opaque); if (get_flat_block_id() == 0) { device_backend_proxy->destroy_ctx(ctx); @@ -329,7 +331,8 @@ __device__ void rocshmem_wg_ctx_destroy( } __device__ void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_threadfence_system\n"); + GPU_DPRINTF("Function: rocshmem_ctx_threadfence_system (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->threadfence_system(); } @@ -337,9 +340,9 @@ __device__ void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx) { __device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_putmem\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_ctx_putmem (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->putmem(dest, source, nelems, pe_in_world); } @@ -347,27 +350,27 @@ __device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, template __device__ void rocshmem_put(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_put\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_put (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->put(dest, source, nelems, pe_in_world); } template __device__ void rocshmem_p(rocshmem_ctx_t ctx, T *dest, T value, int pe) { - GPU_DPRINTF("Function: rocshmem_p\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_p (ctx=%zd, dest=%p, value=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, value, pe, pe_in_world); get_internal_ctx(ctx)->p(dest, value, pe_in_world); } template __device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) { - GPU_DPRINTF("Function: rocshmem_g\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_g (ctx=%zd, source=%p, pe=%d w%d)\n", + ctx.ctx_opaque, source, pe, pe_in_world); return get_internal_ctx(ctx)->g(source, pe_in_world); } @@ -375,9 +378,9 @@ __device__ T rocshmem_g(rocshmem_ctx_t ctx, const T *source, int pe) { __device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_getmem\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_ctx_getmem (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->getmem(dest, source, nelems, pe_in_world); } @@ -385,9 +388,9 @@ __device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, template __device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_get\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_get (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->get(dest, source, nelems, pe_in_world); } @@ -395,9 +398,9 @@ __device__ void rocshmem_get(rocshmem_ctx_t ctx, T *dest, const T *source, __device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->putmem_nbi(dest, source, nelems, pe_in_world); } @@ -405,9 +408,9 @@ __device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, template __device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_put_nbi\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_put_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->put_nbi(dest, source, nelems, pe_in_world); } @@ -415,9 +418,9 @@ __device__ void rocshmem_put_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, __device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->getmem_nbi(dest, source, nelems, pe_in_world); } @@ -425,35 +428,38 @@ __device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, template __device__ void rocshmem_get_nbi(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_get_nbi\n"); - int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_get_nbi (ctx=%zd, dest=%p, source=%p, nelems=%zd, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, pe_in_world); get_internal_ctx(ctx)->get_nbi(dest, source, nelems, pe_in_world); } __device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_fence\n"); + GPU_DPRINTF("Function: rocshmem_ctx_fence (ctx=%zd)\n", ctx.ctx_opaque); get_internal_ctx(ctx)->fence(); } __device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_fence\n"); int pe_in_world = translate_pe(ctx, pe); + GPU_DPRINTF("Function: rocshmem_ctx_fence (ctx=%zd, pe=%d w%d))\n", + ctx.ctx_opaque, pe, pe_in_world); get_internal_ctx(ctx)->fence(pe_in_world); } __device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_quiet\n"); + GPU_DPRINTF("Function: rocshmem_ctx_quiet (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->quiet(); } __device__ void *rocshmem_ptr(const void *dest, int pe) { - GPU_DPRINTF("Function: rocshmem_ptr\n"); + GPU_DPRINTF("Function: rocshmem_ptr (dest=%p, pe=%d w%d\n", + dest, pe, pe); return get_internal_ctx(ROCSHMEM_CTX_DEFAULT)->shmem_ptr(dest, pe); } @@ -461,7 +467,8 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) { template __device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nreduce) { - GPU_DPRINTF("Function: rocshmem_reduce\n"); + GPU_DPRINTF("Function: rocshmem_reduce_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nreduce=%d\n", + ctx.ctx_opaque, team, dest, source, nreduce); return get_internal_ctx(ctx)->reduce(team, dest, source, nreduce); } @@ -471,7 +478,8 @@ __device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem, int pe_root) { - GPU_DPRINTF("Function: Team-based rocshmem_broadcast\n"); + GPU_DPRINTF("Function: Team-based rocshmem_broadcast_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d, root=%d)\n", + ctx.ctx_opaque, team, dest, source, nelem, pe_root); get_internal_ctx(ctx)->broadcast(team, dest, source, nelem, pe_root); } @@ -480,7 +488,8 @@ template __device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem) { - GPU_DPRINTF("Function: rocshmem_alltoall\n"); + GPU_DPRINTF("Function: rocshmem_alltoall_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d\n", + ctx.ctx_opaque, team, dest, source, nelem); get_internal_ctx(ctx)->alltoall(team, dest, source, nelem); } @@ -489,14 +498,16 @@ template __device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem) { - GPU_DPRINTF("Function: rocshmem_fcollect\n"); + GPU_DPRINTF("Function: rocshmem_fcollect_wg (ctx=%zd, team=%zd, dest=%p, source=%p, nelem=%d\n", + ctx.ctx_opaque, team, dest, source, nelem); get_internal_ctx(ctx)->fcollect(team, dest, source, nelem); } template __device__ void rocshmem_wait_until(T *ivars, int cmp, T val) { - GPU_DPRINTF("Function: rocshmem_wait_until\n"); + GPU_DPRINTF("Function: rocshmem_wait_until (ivars=%p, cmp=%d, val=%g)\n", + ivars, cmp, (double)val); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL); @@ -506,7 +517,8 @@ __device__ void rocshmem_wait_until(T *ivars, int cmp, T val) { template __device__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* status, int cmp, T val) { - GPU_DPRINTF("Function: rocshmem_wait_until_all\n"); + GPU_DPRINTF("Function: rocshmem_wait_until_all (ivars=%p, nelems=%zd cmp=%d, val=%g)\n", + ivars, nelems, cmp, (double)val); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL); @@ -516,7 +528,8 @@ __device__ void rocshmem_wait_until_all(T *ivars, size_t nelems, const int* stat template __device__ size_t rocshmem_wait_until_any(T *ivars, size_t nelems, const int* status, int cmp, T val) { - GPU_DPRINTF("Function: rocshmem_wait_until_any\n"); + GPU_DPRINTF("Function: rocshmem_wait_until_any (ivars=%p, nelems=%zd cmp=%d, val=%g)\n", + ivars, nelems, cmp, (double)val); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY); @@ -527,7 +540,8 @@ template __device__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indices, const int* status, int cmp, T val) { - DPRINTF("Function: rocshmem_wait_until_some\n"); + DPRINTF("Function: rocshmem_wait_until_some (ivars=%p, nelems=%zd cmp=%d, val=%g)\n", + ivars, nelems, cmp, (double)val); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME); @@ -537,7 +551,8 @@ __device__ size_t rocshmem_wait_until_some(T *ivars, size_t nelems, size_t* indi template __device__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const int* status, int cmp, T* vals) { - DPRINTF("Function: rocshmem_wait_until_any_vector\n"); + DPRINTF("Function: rocshmem_wait_until_any_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n", + ivars, nelems, cmp, vals); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ANY_VECTOR); @@ -547,7 +562,8 @@ __device__ size_t rocshmem_wait_until_any_vector(T *ivars, size_t nelems, const template __device__ void rocshmem_wait_until_all_vector(T *ivars, size_t nelems, const int* status, int cmp, T* vals) { - DPRINTF("Function: rocshmem_wait_until_all_vector\n"); + DPRINTF("Function: rocshmem_wait_until_all_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n", + ivars, nelems, cmp, vals); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_ALL_VECTOR); @@ -559,7 +575,8 @@ __device__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems, size_t* indices, const int* status, int cmp, T* vals) { - DPRINTF("Function: rocshmem_wait_until_some_vector\n"); + DPRINTF("Function: rocshmem_wait_until_some_vector (ivars=%p, nelems=%zd cmp=%d, vals=%p)\n", + ivars, nelems, cmp, vals); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_WAIT_UNTIL_SOME_VECTOR); @@ -568,7 +585,8 @@ __device__ size_t rocshmem_wait_until_some_vector(T *ivars, size_t nelems, template __device__ int rocshmem_test(T *ivars, int cmp, T val) { - GPU_DPRINTF("Function: rocshmem_testl\n"); + GPU_DPRINTF("Function: rocshmem_test (ivars=%p, cmp=%d, val=%g)\n", + ivars, cmp, (double)val); Context *ctx_internal = get_internal_ctx(ROCSHMEM_CTX_DEFAULT); ctx_internal->ctxStats.incStat(NUM_TEST); @@ -577,82 +595,95 @@ __device__ int rocshmem_test(T *ivars, int cmp, T val) { } __device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_barrier_all\n"); + GPU_DPRINTF("Function: rocshmem_ctx_barrier_all (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->barrier_all(); } __device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n"); + GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->barrier_all_wave(); } __device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n"); + GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->barrier_all_wg(); } __device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) { - GPU_DPRINTF("Function: rocshmem_barrier\n"); + GPU_DPRINTF("Function: rocshmem_ctx_barrier (ctx=%zd, team=%zd)\n", + ctx.ctx_opaque, team); get_internal_ctx(ctx)->barrier(team); } __device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) { - GPU_DPRINTF("Function: rocshmem_barrier_wave\n"); + GPU_DPRINTF("Function: rocshmem_ctx_barrier_wave (ctx=%zd, team=%zd)\n", + ctx.ctx_opaque, team); get_internal_ctx(ctx)->barrier_wave(team); } __device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) { - GPU_DPRINTF("Function: rocshmem_barrier_wg\n"); + GPU_DPRINTF("Function: rocshmem_ctx_barrier_wg (ctx=%zd, team=%zd)\n", + ctx.ctx_opaque, team); get_internal_ctx(ctx)->barrier_wg(team); } __device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n"); + GPU_DPRINTF("Function: rocshmem_ctx_sync_all (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->sync_all(); } __device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n"); + GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->sync_all_wave(); } __device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n"); + GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg (ctx=%zd)\n", + ctx.ctx_opaque); get_internal_ctx(ctx)->sync_all_wg(); } __device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx, - rocshmem_team_t team) { - GPU_DPRINTF("Function: rocshmem_ctx_sync\n"); + rocshmem_team_t team) { + GPU_DPRINTF("Function: rocshmem_ctx_sync (ctx=%zd, team=%zd)\n", + ctx.ctx_opaque, team); get_internal_ctx(ctx)->sync_wg(team); } __device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx, - rocshmem_team_t team) { -GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n"); + rocshmem_team_t team) { +GPU_DPRINTF("Function: rocshmem_ctx_sync_wave (ctx=%zd, team=%zd)\n", + ctx.ctx_opaque, team); get_internal_ctx(ctx)->sync_wg(team); } __device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx, - rocshmem_team_t team) { -GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n"); + rocshmem_team_t team) { +GPU_DPRINTF("Function: rocshmem_ctx_sync_wg (ctx=%zd, team=%zd)\n", + ctx.ctx_opaque, team); get_internal_ctx(ctx)->sync_wg(team); } __device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_n_pes\n"); + GPU_DPRINTF("Function: rocshmem_ctx_n_pes (ctx=%zd)\n", + ctx.ctx_opaque); TeamInfo *tinfo = reinterpret_cast(ctx.team_opaque); return tinfo->size; @@ -663,7 +694,8 @@ __device__ int rocshmem_n_pes() { } __device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_my_pe\n"); + GPU_DPRINTF("Function: rocshmem_ctx_my_pe (ctx=%zd)\n", + ctx.ctx_opaque); TeamInfo *tinfo = reinterpret_cast(ctx.team_opaque); int my_pe{get_internal_ctx(ctx)->my_pe}; @@ -689,7 +721,8 @@ __device__ int rocshmem_my_pe() { template __device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_fetch_add\n"); + GPU_DPRINTF("Function: rocshmem_atomic_fetch_add (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_fetch_add(dest, val, pe); } @@ -697,21 +730,24 @@ __device__ T rocshmem_atomic_fetch_add(rocshmem_ctx_t ctx, T *dest, T val, template __device__ T rocshmem_atomic_compare_swap(rocshmem_ctx_t ctx, T *dest, T cond, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_compare_swap\n"); + GPU_DPRINTF("Function: rocshmem_atomic_compare_swap (ctx=%zd, dest=%p, cond=%g, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, cond, (double)val, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_fetch_cas(dest, val, cond, pe); } template __device__ T rocshmem_atomic_fetch_inc(rocshmem_ctx_t ctx, T *dest, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_fetch_inc\n"); + GPU_DPRINTF("Function: rocshmem_atomic_fetch_inc (ctx=%zd, dest=%p, pe=%d w%d)\n", + ctx.ctx_opaque, dest, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_fetch_add(dest, 1, pe); } template __device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_fetch\n"); + GPU_DPRINTF("Function: rocshmem_atomic_fetch (ctx=%zd, source=%p, pe=%d w%d)\n", + ctx.ctx_opaque, source, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_fetch_add(source, 0, pe); } @@ -719,14 +755,16 @@ __device__ T rocshmem_atomic_fetch(rocshmem_ctx_t ctx, T *source, int pe) { template __device__ void rocshmem_atomic_add(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_add\n"); + GPU_DPRINTF("Function: rocshmem_atomic_add (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->amo_add(dest, val, pe); } template __device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_inc\n"); + GPU_DPRINTF("Function: rocshmem_atomic_inc (ctx=%zd, dest=%p, pe=%d w%d)\n", + ctx.ctx_opaque, dest, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->amo_add(dest, 1, pe); } @@ -734,7 +772,8 @@ __device__ void rocshmem_atomic_inc(rocshmem_ctx_t ctx, T *dest, int pe) { template __device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_set\n"); + GPU_DPRINTF("Function: rocshmem_atomic_set (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->amo_set(dest, val, pe); } @@ -742,7 +781,8 @@ __device__ void rocshmem_atomic_set(rocshmem_ctx_t ctx, T *dest, T val, template __device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_swap\n"); + GPU_DPRINTF("Function: rocshmem_atomic_swap (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_swap(dest, val, pe); } @@ -750,7 +790,8 @@ __device__ T rocshmem_atomic_swap(rocshmem_ctx_t ctx, T *dest, T val, template __device__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_fetch_and\n"); + GPU_DPRINTF("Function: rocshmem_atomic_fetch_and (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_fetch_and(dest, val, pe); } @@ -758,7 +799,8 @@ __device__ T rocshmem_atomic_fetch_and(rocshmem_ctx_t ctx, T *dest, T val, template __device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_and\n"); + GPU_DPRINTF("Function: rocshmem_atomic_and (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->amo_and(dest, val, pe); } @@ -766,7 +808,8 @@ __device__ void rocshmem_atomic_and(rocshmem_ctx_t ctx, T *dest, T val, template __device__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_fetch_or\n"); + GPU_DPRINTF("Function: rocshmem_atomic_fetch_or (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_fetch_or(dest, val, pe); } @@ -774,7 +817,8 @@ __device__ T rocshmem_atomic_fetch_or(rocshmem_ctx_t ctx, T *dest, T val, template __device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_or\n"); + GPU_DPRINTF("Function: rocshmem_atomic_or (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->amo_or(dest, val, pe); } @@ -782,7 +826,8 @@ __device__ void rocshmem_atomic_or(rocshmem_ctx_t ctx, T *dest, T val, template __device__ T rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_fetch_xor\n"); + GPU_DPRINTF("Function: rocshmem_atomic_fetch_xor (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); return get_internal_ctx(ctx)->amo_fetch_xor(dest, val, pe); } @@ -790,7 +835,8 @@ __device__ T rocshmem_atomic_fetch_xor(rocshmem_ctx_t ctx, T *dest, T val, template __device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val, int pe) { - GPU_DPRINTF("Function: rocshmem_atomic_xor\n"); + GPU_DPRINTF("Function: rocshmem_atomic_xor (ctx=%zd, dest=%p, val=%g, pe=%d w%d)\n", + ctx.ctx_opaque, dest, (double)val, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->amo_xor(dest, val, pe); } @@ -801,7 +847,8 @@ __device__ void rocshmem_atomic_xor(rocshmem_ctx_t ctx, T *dest, T val, __device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_putmem_wave\n"); + GPU_DPRINTF("Function: rocshmem_ctx_putmem_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->putmem_wave(dest, source, nelems, pe); } @@ -809,7 +856,8 @@ __device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest, __device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_putmem_wg\n"); + GPU_DPRINTF("Function: rocshmem_ctx_putmem_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->putmem_wg(dest, source, nelems, pe); } @@ -817,7 +865,8 @@ __device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest, __device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wave\n"); + GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->putmem_nbi_wave(dest, source, nelems, pe); } @@ -825,7 +874,8 @@ __device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest, __device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wg\n"); + GPU_DPRINTF("Function: rocshmem_ctx_putmem_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->putmem_nbi_wg(dest, source, nelems, pe); } @@ -833,7 +883,8 @@ __device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, template __device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_put_wave\n"); + GPU_DPRINTF("Function: rocshmem_put_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->put_wave(dest, source, nelems, pe); } @@ -841,7 +892,8 @@ __device__ void rocshmem_put_wave(rocshmem_ctx_t ctx, T *dest, template __device__ void rocshmem_put_wg(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_put_wg\n"); + GPU_DPRINTF("Function: rocshmem_put_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->put_wg(dest, source, nelems, pe); } @@ -850,7 +902,8 @@ template __device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_put_nbi_wave\n"); + GPU_DPRINTF("Function: rocshmem_put_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->put_nbi_wave(dest, source, nelems, pe); } @@ -858,7 +911,8 @@ __device__ void rocshmem_put_nbi_wave(rocshmem_ctx_t ctx, T *dest, template __device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_put_nbi_wg\n"); + GPU_DPRINTF("Function: rocshmem_put_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->put_nbi_wg(dest, source, nelems, pe); } @@ -866,7 +920,8 @@ __device__ void rocshmem_put_nbi_wg(rocshmem_ctx_t ctx, T *dest, __device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_getmem_wg\n"); + GPU_DPRINTF("Function: rocshmem_ctx_getmem_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->getmem_wg(dest, source, nelems, pe); } @@ -874,7 +929,8 @@ __device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest, __device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_getmem_wave\n"); + GPU_DPRINTF("Function: rocshmem_ctx_getmem_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->getmem_wave(dest, source, nelems, pe); } @@ -882,7 +938,8 @@ __device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest, template __device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_get_wg\n"); + GPU_DPRINTF("Function: rocshmem_get_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->get_wg(dest, source, nelems, pe); } @@ -890,7 +947,8 @@ __device__ void rocshmem_get_wg(rocshmem_ctx_t ctx, T *dest, const T *source, template __device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_get_wave\n"); + GPU_DPRINTF("Function: rocshmem_get_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->get_wave(dest, source, nelems, pe); } @@ -898,7 +956,8 @@ __device__ void rocshmem_get_wave(rocshmem_ctx_t ctx, T *dest, __device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wg\n"); + GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->getmem_nbi_wg(dest, source, nelems, pe); } @@ -906,7 +965,8 @@ __device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, template __device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_get_nbi_wg\n"); + GPU_DPRINTF("Function: rocshmem_get_nbi_wg (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->get_nbi_wg(dest, source, nelems, pe); } @@ -914,7 +974,8 @@ __device__ void rocshmem_get_nbi_wg(rocshmem_ctx_t ctx, T *dest, __device__ void rocshmem_ctx_getmem_nbi_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wave\n"); + GPU_DPRINTF("Function: rocshmem_ctx_getmem_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->getmem_nbi_wave(dest, source, nelems, pe); } @@ -923,31 +984,38 @@ template __device__ void rocshmem_get_nbi_wave(rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe) { - GPU_DPRINTF("Function: rocshmem_get_nbi_wave\n"); + GPU_DPRINTF("Function: rocshmem_get_nbi_wave (ctx=%zd, dest=%p, source=%p, nelems=%d, pe=%d w%d)\n", + ctx.ctx_opaque, dest, source, nelems, pe, translate_pe(ctx, pe)); get_internal_ctx(ctx)->get_nbi_wave(dest, source, nelems, pe); } -#define ROCSHMEM_CTX_PUTMEM_SIGNAL_DEF(SUFFIX) \ - __device__ void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \ +#define ROCSHMEM_CTX_PUTMEM_SIGNAL_DEF(SUFFIX) \ + __device__ void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \ void *dest, const void *source, \ size_t nelems, \ uint64_t *sig_addr, uint64_t signal, \ int sig_op, \ int pe) { \ - GPU_DPRINTF("Function: rocshmem_ctx_putmem_signal##SUFFIX\n"); \ + GPU_DPRINTF("Function: rocshmem_ctx_putmem_signal##SUFFIX (ctx=%zd, dest=%p, " \ + "source=%p, nelems=%d, sig_addr=%p, signal=%ld, sig_op=%d, pe=%d w%d)\n", \ + ctx.ctx_opaque, dest, source, nelems, \ + sig_addr, signal, sig_op, pe, translate_pe(ctx, pe)); \ \ get_internal_ctx(ctx)->putmem_signal##SUFFIX(dest, source, nelems, \ sig_addr, signal, sig_op, pe); \ } \ \ template \ - __device__ void rocshmem_ctx_put_signal##SUFFIX(rocshmem_ctx_t ctx, \ + __device__ void rocshmem_ctx_put_signal##SUFFIX(rocshmem_ctx_t ctx, \ T *dest, const T *source, \ size_t nelems, \ uint64_t *sig_addr, uint64_t signal, \ int sig_op, int pe) { \ - GPU_DPRINTF("Function: rocshmem_ctx_put_signal##SUFFIX\n"); \ + GPU_DPRINTF("Function: rocshmem_ctx_put_signal##SUFFIX (ctx=%zd, dest=%p, " \ + "source=%p, nelems=%d, sig_addr=%p, signal=%ld, sig_op=%d, pe=%d w%d)\n", \ + ctx.ctx_opaque, dest, source, nelems, \ + sig_addr, signal, sig_op, pe, translate_pe(ctx, pe)); \ \ get_internal_ctx(ctx)->put_signal##SUFFIX(dest, source, nelems, \ sig_addr, signal, sig_op, pe); \ diff --git a/projects/rocshmem/src/util.hpp b/projects/rocshmem/src/util.hpp index 3f9a6115a9..f4793563c2 100644 --- a/projects/rocshmem/src/util.hpp +++ b/projects/rocshmem/src/util.hpp @@ -62,9 +62,9 @@ namespace rocshmem { #endif #ifdef DEBUG -#define GPU_DPRINTF(...) \ - do { \ - gpu_dprintf(__VA_ARGS__); \ +#define GPU_DPRINTF(...) \ + do { \ + gpu_dprintf("WG (%u, %u, %u) TH (%u, %u, %u) " __VA_ARGS__); \ } while (0); #else #define GPU_DPRINTF(...) \ @@ -159,10 +159,9 @@ __device__ void gpu_dprintf(const char* fmt, const Args&... args) { while (atomicCAS(print_lock, 0, 1) == 1) { } - printf("WG (%u, %u, %u) TH (%u, %u, %u) ", hipBlockIdx_x, - hipBlockIdx_y, hipBlockIdx_z, hipThreadIdx_x, hipThreadIdx_y, - hipThreadIdx_z); - printf(fmt, args...); + printf(fmt, hipBlockIdx_x, hipBlockIdx_y, hipBlockIdx_z, + hipThreadIdx_x, hipThreadIdx_y, hipThreadIdx_z, + args...); *print_lock = 0; }