diff --git a/src/clique/AllReduceCliqueKernel.h b/src/clique/AllReduceCliqueKernel.h index a46ed6a219..67a306d3c7 100644 --- a/src/clique/AllReduceCliqueKernel.h +++ b/src/clique/AllReduceCliqueKernel.h @@ -36,6 +36,7 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args) size_t const N = args->clique.count; // Total number of elements to reduce int const nBlocks = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x) int const blockId = args->clique.bid; // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x) + int const verbose = args->clique.verbose; // For debug purposes int const rank = args->comm->rank; // Current rank // Each threadblock works independently of others on a subsection of the input @@ -46,6 +47,10 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args) size_t const currBlockStop = min(currBlockStart + perBlockN, N); size_t const blockN = currBlockStop - currBlockStart; + if (verbose && threadIdx.x == 0) + { + printf("Rank %d block %d of %d %lu -> %lu [%lu]\n", rank, blockId, nBlocks, currBlockStart, currBlockStop, blockN); + } if (blockN > 0) { // Prepare input / output subarrays @@ -69,7 +74,12 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args) // Even if there was nothing for this GPU to do, it must participate in a barrier // because other GPUs may be modifying this GPUs output buffer still - if (blockId == 0) WaitForBarrier(cliquePtrs->barrier); + if (blockId == 0) + { + if (verbose && threadIdx.x == 0) printf("Rank %d enters GPU barrier\n", rank); + WaitForBarrier(cliquePtrs->barrier, rank, verbose); + if (verbose && threadIdx.x == 0) printf("Rank %d exits GPU barrier\n", rank); + } } #endif diff --git a/src/clique/CliqueCommon.h b/src/clique/CliqueCommon.h index b7bdee19ae..0fab8cc8ad 100644 --- a/src/clique/CliqueCommon.h +++ b/src/clique/CliqueCommon.h @@ -62,7 +62,7 @@ typedef struct // Multi-GPU (on same node) barrier. One thread per grid per GPU updates barrier / waits template -__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier) +__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int const rank, int const verbose) { if (threadIdx.x == 0) { @@ -71,6 +71,7 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier) int localSense = *barrier.localSense; int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST); + if (verbose) printf("Rank %d arrived at GPU barrier %d\n", rank, val); if (val == NUM_RANKS) { // Last arrival resets barrier @@ -80,7 +81,18 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier) else { // Wait for all ranks to reach barrier - while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense); + int counter = 0; + while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense) + { + if (verbose) + { + counter++; + if (counter == 100000000) + { + printf("Rank %d waiting on GPU barrier: (%d != %d)", rank, *barrier.globalSense, localSense); + } + } + } } } } diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc index 3afd3cfa31..fef2a91599 100644 --- a/src/clique/CliqueManager.cc +++ b/src/clique/CliqueManager.cc @@ -48,6 +48,7 @@ int* CliqueManager::m_staticGpuBarrierMem = NULL; RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0); // Opt-in environment variable for clique-based kernels RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 2097152); // Max number of bytes to use clique-based kernels for all reduce RCCL_PARAM(AllReduceNumChannels, "CLIQUE_ALLREDUCE_NCHANNELS", 4); // Number of channels to use for all-reduce +RCCL_PARAM(CliqueDebug, "CLIQUE_DEBUG", 0); // Emit debug messages CliqueManager::CliqueManager(int const rank, int const numRanks, @@ -337,7 +338,7 @@ ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args) // Prepare clique argments (NOTE: clique pointers are not ready yet) int opIndex = args->opCount % NCCL_MAX_OPS; args->clique.ptrs = &m_pinnedCliquePtrs[opIndex]; - + args->clique.verbose = rcclParamCliqueDebug(); // Determine number of channels to use for this collective args->clique.nChannels = rcclParamAllReduceNumChannels(); @@ -477,16 +478,33 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair