Adding RCCL_CLIQUE_DEBUG to help debug experimental clique feature (#300)

2020-11-13 09:07:11 -07:00
@@ -36,6 +36,7 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
  size_t const N                 = args->clique.count;     // Total number of elements to reduce
  int    const nBlocks           = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
  int    const blockId           = args->clique.bid;       // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
+  int    const verbose           = args->clique.verbose;   // For debug purposes
  int    const rank              = args->comm->rank;       // Current rank

  // Each threadblock works independently of others on a subsection of the input
@@ -46,6 +47,10 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
  size_t const currBlockStop  = min(currBlockStart + perBlockN, N);
  size_t const blockN         = currBlockStop - currBlockStart;

+  if (verbose && threadIdx.x == 0)
+  {
+    printf("Rank %d block %d of %d %lu -> %lu [%lu]\n", rank, blockId, nBlocks, currBlockStart, currBlockStop, blockN);
+  }
  if (blockN > 0)
  {
    // Prepare input / output subarrays
@@ -69,7 +74,12 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)

  // Even if there was nothing for this GPU to do, it must participate in a barrier
  // because other GPUs may be modifying this GPUs output buffer still
-  if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
+  if (blockId == 0)
+  {
+    if (verbose && threadIdx.x == 0)  printf("Rank %d enters GPU barrier\n", rank);
+    WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier, rank, verbose);
+    if (verbose && threadIdx.x == 0)  printf("Rank %d exits GPU barrier\n", rank);
+  }
 }

 #endif
@@ -62,7 +62,7 @@ typedef struct

 // Multi-GPU (on same node) barrier.  One thread per grid per GPU updates barrier / waits
 template <int NUM_RANKS>
-__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
+__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int const rank, int const verbose)
 {
  if (threadIdx.x == 0)
  {
@@ -71,6 +71,7 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
    int localSense = *barrier.localSense;

    int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
+    if (verbose) printf("Rank %d arrived at GPU barrier %d\n", rank, val);
    if (val == NUM_RANKS)
    {
      // Last arrival resets barrier
@@ -80,7 +81,18 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
    else
    {
      // Wait for all ranks to reach barrier
-      while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
+      int counter = 0;
+      while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense)
+      {
+        if (verbose)
+        {
+          counter++;
+          if (counter == 100000000)
+          {
+            printf("Rank %d waiting on GPU barrier: (%d != %d)", rank, *barrier.globalSense, localSense);
+          }
+        }
+      }
    }
  }
 }
@@ -48,6 +48,7 @@ int*               CliqueManager::m_staticGpuBarrierMem             = NULL;
 RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0);                                  // Opt-in environment variable for clique-based kernels
 RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 2097152);  // Max number of bytes to use clique-based kernels for all reduce
 RCCL_PARAM(AllReduceNumChannels,     "CLIQUE_ALLREDUCE_NCHANNELS", 4);         // Number of channels to use for all-reduce
+RCCL_PARAM(CliqueDebug, "CLIQUE_DEBUG", 0);                                    // Emit debug messages

 CliqueManager::CliqueManager(int          const  rank,
                             int          const  numRanks,
@@ -337,7 +338,7 @@ ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
  // Prepare clique argments (NOTE: clique pointers are not ready yet)
  int opIndex = args->opCount % NCCL_MAX_OPS;
  args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
-
+  args->clique.verbose = rcclParamCliqueDebug();

  // Determine number of channels to use for this collective
  args->clique.nChannels = rcclParamAllReduceNumChannels();
@@ -477,16 +478,33 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz

 void CliqueManager::WaitForBarrier()
 {
+  int const verbose = rcclParamCliqueDebug();
+
  // Sense inversion barrier
  m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;

-  if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
+  int val = __sync_add_and_fetch(m_cpuBarrierGlobalCount, 1);
+  if (verbose) INFO(NCCL_INIT, "Rank %d reaches barrier at %d", m_rank, val);
+
+  if (val == m_numRanks)
  {
    // Reset the barrier
    STORE(m_cpuBarrierGlobalCount, 0);
    STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
  } else {
-    while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
+    size_t counter = 0;
+    while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense)
+    {
+      if (verbose)
+      {
+        counter++;
+        if (counter == 100000000)
+        {
+          WARN("Rank %d waiting in CPU barrier: (%d != %d)", m_rank, *m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
+        }
+      }
+    }
+    if (verbose) INFO(NCCL_INIT, "Rank %d leaves CPU barrier", m_rank);
  }
 }

@@ -201,6 +201,7 @@ struct CollectiveArgs {
      uint8_t bid;
      uint8_t nChannels;
      size_t count;
+      int verbose;
      cliqueDevicePtrs_t* ptrs;
    } clique;
    // [/RCCL]