Clique tuning upgrade (#352)

* Enabling clique for any XGMI-connected topology, adding tuning * Updating CHANGELOG for clique tuning * Re-working clique barrier system to work on multi-process / multi-gpu [ROCm/rccl commit: 9d7232c091]
2021-05-06 09:50:07 -06:00
@@ -2,6 +2,15 @@

 Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)

+## [Unreleased]
+### Optimizations
+- Additional tuning for clique-based kernel AllReduce performance (still requires opt in with RCCL_ENABLE_CLIQUE=1)
+
+### Changed
+- Replaced RCCL_FORCE_ENABLE_CLIQUE with RCCL_CLIQUE_IGNORE_TOPO
+- Clique-based kernels can now be enabled on topologies where all active GPUs are XGMI-connected
+- Topologies not normally supported by clique-based kernels require RCCL_CLIQUE_IGNORE_TOPO=1
+
 ## [RCCL-2.7.8 for ROCm 4.1.0]
 ### Added
 - Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
@@ -42,13 +42,14 @@ THE SOFTWARE.
 #include <thread>
 #include <unistd.h>

-cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS]  = {};
-int*               CliqueManager::m_staticGpuBarrierMem             = NULL;
+cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS]     = {};
+int                CliqueManager::m_staticBarrierCount[NCCL_MAX_OPS*2] = {};
+int*               CliqueManager::m_staticGpuBarrierMem                = NULL;

 // Define some environment variables that affect clique-based kernels
-RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0);                                  // Opt-in environment variable for clique-based kernels
-RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 16777216); // Max number of bytes to use clique-based kernels for all reduce
-RCCL_PARAM(AllReduceNumChannels,     "CLIQUE_ALLREDUCE_NCHANNELS", 0);         // Number of channels to use for all-reduce. (0 for auto-select)
+RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0);                           // Opt-in environment variable for clique-based kernels
+RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 0); // Max number of bytes to use clique-based kernels for all reduce (0 for auto-select)
+RCCL_PARAM(AllReduceNumChannels,     "CLIQUE_ALLREDUCE_NCHANNELS", 0);  // Number of channels to use for all-reduce. (0 for auto-select)

 CliqueManager::CliqueManager(int          const  rank,
                             int          const  numRanks,
@@ -56,6 +57,8 @@ CliqueManager::CliqueManager(int          const  rank,
  m_rank(rank),
  m_numRanks(numRanks),
  m_cliqueMode(cliqueMode),
+  m_opIndexHead(0),
+  m_opIndexTail(0),
  m_init(false),
  m_pinnedCliquePtrs(NULL),
  m_fineGrainBarrierMem(NULL)
@@ -113,7 +116,11 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
  m_init = true;

  m_hash = djb2Hash(commId->internal);
-  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+  if (m_cliqueMode == CLIQUE_DISABLED)
+  {
+    INFO(NCCL_INIT, "Clique kernels disabled");
+    return ncclSuccess;
+  }

  // Check parameters
  if (m_rank < 0 || m_rank >= m_numRanks)
@@ -190,7 +197,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
    }

    // Initialize shared CPU memory to be used for barrier variables
-    m_sharedCpuMemory = ShmObject<int32_t>(2 * sizeof(int32_t),
+    m_sharedCpuMemory = ShmObject<int32_t>(NCCL_MAX_OPS * 2 * sizeof(int32_t),
                                           CliqueShmNames["SharedCounters"] + shmSuffix,
                                           m_rank,
                                           m_numRanks,
@@ -198,19 +205,18 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
    NCCLCHECKGOTO(m_sharedCpuMemory.Open(), res, dropback);

    // Split up the shared CPU memory for barrier counters / global sense
-    m_cpuBarrierGlobalCount = &m_sharedCpuMemory.Get()[0];
-    m_cpuBarrierGlobalSense = &m_sharedCpuMemory.Get()[1];
+    m_cpuBarrierCount = m_sharedCpuMemory.Get();

    // Initialize CPU barriers
    if (m_rank == 0)
    {
-      *m_cpuBarrierGlobalCount = 0;
-      *m_cpuBarrierGlobalSense = 0;
+      memset(m_cpuBarrierCount, 0, NCCL_MAX_OPS * 2 * sizeof(int32_t));
    }
-    m_cpuBarrierLocalSense = 0;
  }
  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
  {
+    m_cpuBarrierCount = &m_staticBarrierCount[0];
+
    // First rank prepares fine-grained memory shared across ranks used for the two barrier variables
    if (m_rank == 0)
    {
@@ -224,9 +230,18 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
    }
  }

+  // Figure out device arch for tuning
+  int deviceId;
+  CUDACHECK(hipGetDevice(&deviceId));
+  hipDeviceProp_t devProp;
+  CUDACHECK(hipGetDeviceProperties(&devProp, deviceId));
+  m_gcnArch = devProp.gcnArch;
+
+  // Establish when to use clique-based kernels based on input size
+  SetByteLimits();

  m_init = true;
-  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d)", m_cliqueMode);
+  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArch);
  return ncclSuccess;

 dropback:
@@ -238,6 +253,20 @@ dropback:
  return ncclSuccess;
 }

+void CliqueManager::SetByteLimits()
+{
+  m_allReduceByteLimit = rcclParamAllReduceCliqueByteLimit();
+  if (m_allReduceByteLimit == 0)
+  {
+    switch (m_gcnArch)
+    {
+    case 906: m_allReduceByteLimit =  16777216; break;
+    case 908: m_allReduceByteLimit = 536870912; break;
+    default:  m_allReduceByteLimit =  16777216; break;
+    }
+  }
+}
+
 bool CliqueManager::IsSupported(ncclFunc_t const coll,
                                size_t const count,
                                ncclDataType_t const datatype,
@@ -247,12 +276,11 @@ bool CliqueManager::IsSupported(ncclFunc_t const coll,

  // Filter based on total input size for each collective type
  size_t totalBytes = count * ncclTypeSize(datatype);
-  if (coll == ncclFuncAllReduce && (totalBytes <= rcclParamAllReduceCliqueByteLimit())) return true;
-
+  if (coll == ncclFuncAllReduce && (totalBytes <= m_allReduceByteLimit)) return true;
  return false;
 }

-ncclResult_t CliqueManager::DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr)
+ncclResult_t CliqueManager::DeclarePointers(void const* inputPtr, void* outputPtr)
 {
  // Do nothing if disabled
  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
@@ -263,11 +291,11 @@ ncclResult_t CliqueManager::DeclarePointers(uint64_t opCount, void const* inputP
    return ncclInvalidUsage;
  }

-  int const opIndex = opCount % NCCL_MAX_OPS;
-
-  // Add opIndex to queue of in-progress collectives
-  m_inProgress.push(opIndex);
+  // Add to queue of in-progress collectives
+  int32_t const opIndex = m_opIndexTail;
+  m_opIndexTail = (m_opIndexTail + 1) % NCCL_MAX_OPS;

+  INFO(NCCL_COLL, "Rank %d declaring pointers for opIndex %d", m_rank, opIndex);
  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
  {
    // Get fine-grained device memory if not already done
@@ -302,6 +330,18 @@ ncclResult_t CliqueManager::DeclarePointers(uint64_t opCount, void const* inputP
    m_staticCliquePtrs[opIndex].outputs[m_rank] = outputPtr;
  }

+  // Increment entry barrier counter - must not block
+  volatile int* entryCounter = &m_cpuBarrierCount[2 * opIndex];
+  int entryVal = LOAD(entryCounter);
+  // Loop until successful atomic update to counter
+  bool done = false;
+  while (done == false) {
+    // Last rank resets exit barrier counter prior to incrementing entry count to numRanks
+    if (entryVal+1 == m_numRanks)
+      m_cpuBarrierCount[2 * opIndex + 1] = 0;
+    done = __sync_bool_compare_and_swap(entryCounter, entryVal, entryVal+1);
+    entryVal++;
+  }
  return ncclSuccess;
 }

@@ -320,12 +360,27 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
    {
      // NOTE: These are currently based on collected data and not necessarily ideal for all hardware
      int numChannels;
-      if (totalBytes <= 65536) numChannels = 1;
-      else if (totalBytes <= 262144) numChannels = 2;
-      else if (totalBytes <= 524288) numChannels = 4;
-      else if (totalBytes <= 2097152) numChannels = 8;
-      else numChannels = 11;
-
+      switch (m_gcnArch)
+      {
+      case 906:
+        if      (totalBytes <=   16384) numChannels =  1;
+        else                            numChannels =  2;
+        break;
+      case 908:
+        if      (totalBytes <=  262144) numChannels =  4;
+        else                            numChannels = 14;
+        break;
+      case 910:
+        if      (totalBytes <=  262144) numChannels =  4;
+        else                            numChannels =  8;
+        break;
+      default:
+        if      (totalBytes <=   65536) numChannels =  1;
+        else if (totalBytes <=  262144) numChannels =  2;
+        else if (totalBytes <=  524288) numChannels =  4;
+        else if (totalBytes <= 2097152) numChannels =  8;
+        else                            numChannels = 11;
+      }
      *numChannelstoUse = std::min(numChannels, totalNumChannels);
    }
    else
@@ -333,82 +388,79 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
      *numChannelstoUse = std::min((int)rcclParamAllReduceNumChannels(), totalNumChannels);
    }
  }
-
  return ncclSuccess;
 }

-
-
-ncclResult_t CliqueManager::SetCliqueArgs(ncclWorkElem* args)
+ncclResult_t CliqueManager::WaitForPointers(ncclWorkElem* args)
 {
  // Do nothing if disabled
  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+
  if (!m_init)
  {
    WARN("CliqueManager must be initialized before use");
    return ncclInvalidUsage;
  }

-  // Prepare clique argments (NOTE: clique pointers are not ready yet)
-  int opIndex = args->opCount % NCCL_MAX_OPS;
+  // Check that collective queue is not empty
+  if (m_opIndexHead == m_opIndexTail)
+  {
+    WARN("WaitForPointers must be called after DeclarePointers");
+    return ncclInvalidUsage;
+  }
+
+  // Pop first collective off queue
+  int32_t const opIndex = m_opIndexHead;
+  INFO(NCCL_COLL, "Rank %d waiting for pointers for opIndex %d", m_rank, opIndex);
+
+  m_opIndexHead = (m_opIndexHead + 1) % NCCL_MAX_OPS;
  args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];

-  return ncclSuccess;
-}
+  // Wait for all ranks to declare pointers for this opIndex
+  volatile int* entryCounter = (volatile int*)(&m_cpuBarrierCount[2 * opIndex]);
+  int entryVal = LOAD(entryCounter);
+  while (entryVal != m_numRanks) entryVal = LOAD(entryCounter);

-ncclResult_t CliqueManager::WaitForPointers()
-{
-  // Do nothing if disabled
-  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
-
-  if (!m_init)
-  {
-    WARN("CliqueManager must be initialized before use");
-    return ncclInvalidUsage;
+  // Last rank to pass the barrier resets the entry barrier
+  // NOTE: There is another GPU-barrier performed during the kernels therefore it should
+  //       not be possible for any rank to modify entry count prior to being reset
+  volatile int* exitCounter = &m_cpuBarrierCount[2 * opIndex + 1];
+  int exitVal = LOAD(exitCounter);
+  // Loop until successful atomic update to counter
+  bool done = false;
+  while (done == false) {
+    // Last rank resets entry counter
+    if (exitVal+1 == m_numRanks)
+      m_cpuBarrierCount[2 * opIndex] = 0;
+    done = __sync_bool_compare_and_swap(exitCounter, exitVal, exitVal+1);
+    exitVal++;
  }
+  INFO(NCCL_COLL, "Rank %d past opIndex barrier %d", m_rank, opIndex);

-  // Do nothing if there are no outstanding clique-kernels
-  if (m_inProgress.empty()) return ncclSuccess;
-
-  // Copy clique device pointers to pinned device memory
+  // Collect pointers
  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
  {
-    // Wait for all ranks to arrive
-    WaitForBarrier();
-
    int numHandles = m_numRanks * NUM_HANDLES_PER_RANK;
    std::vector<std::pair<hipIpcMemHandle_t,size_t>> handles(numHandles);

-    while (!m_inProgress.empty())
+    // Collect the ready handles from shared memory and convert them to device pointers
+    NCCLCHECK(m_shmHandles.ReadHandles(opIndex, handles));
+    for (int i = 0; i < m_numRanks; i++)
    {
-      int const opIndex = m_inProgress.front();
-      m_inProgress.pop();
+      void *input;
+      NCCLCHECK(CheckCacheForHandle(handles[i * NUM_HANDLES_PER_RANK],
+                                    m_ipcHandleRecvCache, &input));
+      m_pinnedCliquePtrs[opIndex].inputs[i] = const_cast<const void *>(input);

-      // Collect the ready handles from shared memory and convert them to device pointers
-      NCCLCHECK(m_shmHandles.ReadHandles(opIndex, handles));
-      for (int i = 0; i < m_numRanks; i++)
-      {
-        void *input;
-        NCCLCHECK(CheckCacheForHandle(handles[i * NUM_HANDLES_PER_RANK],
-                                      m_ipcHandleRecvCache, &input));
-        m_pinnedCliquePtrs[opIndex].inputs[i] = const_cast<const void *>(input);
-
-        NCCLCHECK(CheckCacheForHandle(handles[(i * NUM_HANDLES_PER_RANK) + 1],
-                                      m_ipcHandleRecvCache, &m_pinnedCliquePtrs[opIndex].outputs[i]));
-      }
+      NCCLCHECK(CheckCacheForHandle(handles[(i * NUM_HANDLES_PER_RANK) + 1],
+                                    m_ipcHandleRecvCache, &m_pinnedCliquePtrs[opIndex].outputs[i]));
    }
  }
  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
  {
-    while (!m_inProgress.empty())
-    {
-      int const opIndex = m_inProgress.front();
-      m_inProgress.pop();
-
-      // Copy from static memory to pinned host memory and set local sense
-      memcpy(&m_pinnedCliquePtrs[opIndex], &m_staticCliquePtrs[opIndex], sizeof(cliqueDevicePtrs_t));
-      m_pinnedCliquePtrs[opIndex].barrier.localSense = &m_gpuBarrierLocalSense[opIndex];
-    }
+    // Copy from static memory to pinned host memory and set local sense
+    memcpy(&m_pinnedCliquePtrs[opIndex], &m_staticCliquePtrs[opIndex], sizeof(cliqueDevicePtrs_t));
+    m_pinnedCliquePtrs[opIndex].barrier.localSense = &m_gpuBarrierLocalSense[opIndex];
  }
  return ncclSuccess;
 }
@@ -486,21 +538,6 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
  return ncclSuccess;
 }

-void CliqueManager::WaitForBarrier()
-{
-  // Sense inversion barrier
-  m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
-
-  if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
-  {
-    // Reset the barrier
-    STORE(m_cpuBarrierGlobalCount, 0);
-    STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
-  } else {
-    while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
-  }
-}
-
 ncclResult_t CliqueManager::BootstrapRootInit(int pid, unsigned long hash)
 {
  if (rcclParamEnableClique())
@@ -25,7 +25,6 @@ THE SOFTWARE.

 #include <semaphore.h>
 #include <mutex>
-#include <queue>

 #include "nccl.h"
 #include "devcomm.h"
@@ -53,6 +52,8 @@ public:

  ncclResult_t Init(ncclUniqueId const* commId, int suffix);

+  void SetByteLimits();
+
  // Returns true if the collective is supported via a clique-based kernel
  bool IsSupported(ncclFunc_t const coll,
                   size_t const count,
@@ -60,7 +61,7 @@ public:
                   ncclRedOp_t const op) const;

  // Provide the pointers to be exchanged across the clique for the given rank / opCount
-  ncclResult_t DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr);
+  ncclResult_t DeclarePointers(void const* inputPtr, void* outputPtr);

  // Determine the number of channels / CUs to use for this call
  ncclResult_t GetNumChannelsToUse(ncclFunc_t const coll,
@@ -70,12 +71,9 @@ public:
                                   int const totalNumChannels,
                                   uint8_t* numChannelstoUse);

-  // Set pointers for where clique-related arguments will be found
-  // This sets pointers to device-accessible memory where the arguments will eventually reside
-  ncclResult_t SetCliqueArgs(ncclWorkElem* args);
-
-  // Blocking call that only returns after all out-standing clique pointers are ready
-  ncclResult_t WaitForPointers();
+  // Blocking call that only returns once the in-progress clique pointers are ready
+  // This needs to be called in same order as DeclarePointers
+  ncclResult_t WaitForPointers(ncclWorkElem* args);

  // Prepares shared memory files upon initialization
  static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
@@ -90,19 +88,20 @@ protected:
                                   NcclIpcHandleRecvCache* cache,
                                   void** ptr);

-  // Race-condition helper functions
-  void WaitForBarrier();
-
  int                          m_rank;                               // Associated rank
  int                          m_numRanks;                           // Total number of ranks
  unsigned long                m_hash;                               // Hash used for identifying message queues & shared memory
  cliqueMode_t                 m_cliqueMode;                         // Clique mode (off/single process/single node)
+  int32_t                      m_opIndexHead;                        // Track start of outstanding requests
+  int32_t                      m_opIndexTail;                        // Track end of outstanding requests
  bool                         m_init;                               // Whether CliqueManager has been initialized
+  int                          m_gcnArch;                            // Device GCN arch value
+  size_t                       m_allReduceByteLimit;                 // Byte limit for AllReduce
  cliqueDevicePtrs_t*          m_pinnedCliquePtrs;                   // Pinned-host-memory (device accessible) containing device pointers
  int*                         m_gpuBarrierGlobalCount;              // Part of GPU barrier (count variable shared across ranks)
  int*                         m_gpuBarrierGlobalSense;              // Part of GPU barrier (reset variable shared across ranks)
  int*                         m_gpuBarrierLocalSense;               // Part of GPU barrier (reset variable local to this rank)
-  std::queue<int>              m_inProgress;                         // Queue of clique-based collectives waiting for pointers
+  int*                         m_cpuBarrierCount;                    // Points to either the shared CPU memory (m_sharedCpuMemory) or m_staticBarrierCount

  // IPC-related (CLIQUE_SINGLE_NODE)
  NcclIpcHandleShm             m_shmHandles;                         // Used to exchange IPC handles between ranks
@@ -111,12 +110,11 @@ protected:
  ShmObject<int32_t>           m_sharedCpuMemory;                    // Used to pass shared memory used for CPU barrier
  ShmObject<hipIpcMemHandle_t> m_sharedIpcHandle;                    // Used to pass fine-grained device memory buffer IPC handle
  int*                         m_fineGrainBarrierMem;                // Fine-grained GPU memory barrier (allocated only on 1st rank, shared on others)
-  int*                         m_cpuBarrierGlobalCount;              // Part of CPU barrier (count variable shared across ranks)
-  int*                         m_cpuBarrierGlobalSense;              // Part of CPU barrier (reset variable shared across ranks)
-  int                          m_cpuBarrierLocalSense;               // Part of CPU barrier (reset variable local to this rank)
+  int*                         m_sharedBarrierCount;                 // Part of CPU barrier (count variable shared across ranks)

  // Single-process (CLIQUE_SINGLE_PROCESS)
  static cliqueDevicePtrs_t    m_staticCliquePtrs[NCCL_MAX_OPS];     // Use shared static memory to exchange pointer info
+  static int                   m_staticBarrierCount[2*NCCL_MAX_OPS]; // Part of CPU barrier (count variable shared across ranks)
  static int*                  m_staticGpuBarrierMem;                // Static storage backing for fine-grained gpu barrier
 };

@@ -128,10 +128,6 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params)
    STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
  }

-  { // [RCCL] Wait for any clique-based collectives
-    NCCLCHECK(comm->cliqueManager->WaitForPointers());
-  } // [/RCCL]
-
  // Find the first operation, choose the kernel accordingly and pass it
  // as the first argument.
  struct ncclChannel* c0 = comm->channels;
@@ -394,12 +390,6 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
                                               info->datatype,
                                               info->op))
    {
-      // Declare the input / output pointers being used (to exchange via IPC with other ranks)
-      NCCLCHECK(info->comm->cliqueManager->DeclarePointers(info->comm->opCount,
-                                                           info->sendbuff,
-                                                           info->recvbuff));
-
-
      info->algorithm = NCCL_ALGO_RING;
      info->protocol = NCCL_PROTO_CLIQUE;
      // Determine the number of channels to use for clique-kernel
@@ -411,6 +401,9 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
                                                               &work->clique.nChannels));
      work->clique.count = info->count;
      work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
+
+      // Setup pointers to where all the input/output pointers will be
+      NCCLCHECK(info->comm->cliqueManager->WaitForPointers(work));
      return ncclSuccess;
    }
  } // [RCCL]
@@ -519,12 +512,6 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {

    info->comm->myParams->gridDim.x++;

-    // [RCCL] Setup pointers to where all the input/output pointers will be
-    if (info->protocol == NCCL_PROTO_CLIQUE) {
-      NCCLCHECK(info->comm->cliqueManager->SetCliqueArgs(&work));
-    }
-    // [/RCCL]
-
    work.coll.bid = bid % nChannels;
    NCCLCHECK(getNextOp(channel, NULL, &work));
  }
@@ -664,6 +651,20 @@ ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
 }

 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
+  // [RCCL] Check for clique-based kernel support
+  {
+    if (info->comm->cliqueManager->IsSupported(info->coll,
+                                               info->count,
+                                               info->datatype,
+                                               info->op))
+    {
+      // Declare the input / output pointers being used (to exchange via IPC with other ranks)
+      // This is done immediately, and does not block
+      NCCLCHECK(info->comm->cliqueManager->DeclarePointers(info->sendbuff, info->recvbuff));
+    }
+  }
+  // [/RCCL]
+
  // Launch asynchronously if needed
  if (ncclAsyncMode()) {
    ncclResult_t ret = ncclSuccess;
@@ -30,6 +30,7 @@

 // [RCCL]
 #include "clique/CliqueManager.h"
+#include <hsa/hsa_ext_amd.h>
 // [/RCCL]

 #define STR2(v) #v
@@ -363,7 +364,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
  return ncclSuccess;
 }

-RCCL_PARAM(ForceEnableClique, "FORCE_ENABLE_CLIQUE", 0);
+RCCL_PARAM(CliqueIgnoreTopo, "CLIQUE_IGNORE_TOPO", 0);
 RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);

 static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
@@ -865,7 +866,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
    if (comm->localRanks == comm->nRanks)
    {
-      // Check that all the GPUs have peer access to one another
+      // Check that all the GPUs have peer access to one another and are XGMI connected
+      bool allXgmi = true;
      bool hasPeerAccess = true;
      for (int i = 0; i < nranks && hasPeerAccess; i++)
      {
@@ -880,6 +882,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
            hasPeerAccess = false;
            break;
          }
+
+          uint32_t linkType, hopCount;
+          CUDACHECK(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
+          allXgmi &= (linkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
        }
      }
      if (hasPeerAccess)
@@ -890,15 +896,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
          cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
      }

-      // For now, only enable clique-based kernels on CR8_G topologies, unless explicitly asked
-      if (!rcclParamForceEnableClique())
+      // For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
+      if (!allXgmi && !rcclParamCliqueIgnoreTopo())
      {
-        // Disable clique-kernel support if not on CR8 topology
-        if (!(comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)))
-        {
-          INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (force enable with RCCL_FORCE_ENABLE_CLIQUE)");
-          cliqueMode = CliqueManager::CLIQUE_DISABLED;
-        }
+        INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
+        cliqueMode = CliqueManager::CLIQUE_DISABLED;
      }
    }
    comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
@@ -11,6 +11,7 @@ if(BUILD_TESTS)
  set(TEST_SOURCES_SINGLE_PROCESS
    test_AllGather.cpp
    test_AllReduce.cpp
+    test_AllReduceGroup.cpp
    test_Broadcast.cpp
    test_Reduce.cpp
    test_ReduceScatter.cpp
@@ -27,6 +28,7 @@ if(BUILD_TESTS)
  set(TEST_SOURCES_MULTI_PROCESS
    test_AllGatherMultiProcess.cpp
    test_AllReduceMultiProcess.cpp
+    test_AllReduceGroupMultiProcess.cpp
    test_AllToAllMultiProcess.cpp
    test_BroadcastMultiProcess.cpp
    test_CombinedCallsMultiProcess.cpp
@@ -8,56 +8,56 @@

 namespace CorrectnessTests
 {
-    TEST_P(AllReduceCorrectnessTest, Correctness)
+  TEST_P(AllReduceCorrectnessTest, Correctness)
+  {
+    if (numDevices > numDevicesAvailable) return;
+
+    // Prepare input / output / expected results
+    Dataset dataset;
+    dataset.Initialize(numDevices, numElements, dataType, inPlace);
+    FillDatasetWithPattern(dataset);
+    ComputeExpectedResults(dataset, op);
+
+    // Launch the reduction (1 thread per GPU)
+    ncclGroupStart();
+    for (int i = 0; i < numDevices; i++)
    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Prepare input / output / expected results
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace);
-        FillDatasetWithPattern(dataset);
-        ComputeExpectedResults(dataset, op);
-
-        // Launch the reduction (1 thread per GPU)
-        ncclGroupStart();
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
-                          numElements, dataType, op, comms[i], streams[i]);
-        }
-        ncclGroupEnd();
-
-        // Wait for reduction to complete
-        Synchronize();
-
-        // Check results
-        ValidateResults(dataset);
-
-        dataset.Release();
+      ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
+                    numElements, dataType, op, comms[i], streams[i]);
    }
+    ncclGroupEnd();

-    INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
-                            AllReduceCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2,3,4,5,6,7,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                            CorrectnessTest::PrintToStringParamName());
+    // Wait for reduction to complete
+    Synchronize();
+
+    // Check results
+    ValidateResults(dataset);
+
+    dataset.Release();
+  }
+
+  INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
+                           AllReduceCorrectnessTest,
+                           testing::Combine(
+                             // Reduction operator
+                             testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+                             // Data types
+                             testing::Values(ncclInt8,
+                                             ncclUint8,
+                                             ncclInt32,
+                                             ncclUint32,
+                                             ncclInt64,
+                                             ncclUint64,
+                                             //ncclFloat16,
+                                             ncclFloat32,
+                                             ncclFloat64,
+                                             ncclBfloat16),
+                             // Number of elements
+                             testing::Values(1024, 1048576),
+                             // Number of devices
+                             testing::Values(2,3,4,5,6,7,8),
+                             // In-place or not
+                             testing::Values(false, true),
+                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+                           CorrectnessTest::PrintToStringParamName());
 } // namespace
@@ -0,0 +1,66 @@
+/*************************************************************************
+ * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_AllReduceGroup.hpp"
+
+namespace CorrectnessTests
+{
+  // This tests aggregated AllReduce calls within a group
+  TEST_P(AllReduceGroupCorrectnessTest, Correctness)
+  {
+    if (numDevices > numDevicesAvailable) return;
+
+    // Prepare input / output / expected results
+    Dataset dataset1, dataset2, dataset3;
+    dataset1.Initialize(numDevices, numElements, dataType, inPlace);
+    dataset2.Initialize(numDevices, numElements, dataType, inPlace);
+    dataset3.Initialize(numDevices, numElements, dataType, inPlace);
+    FillDatasetWithPattern(dataset1);
+    FillDatasetWithPattern(dataset2);
+    FillDatasetWithPattern(dataset3);
+    ComputeExpectedResults(dataset1, op);
+    ComputeExpectedResults(dataset2, op);
+    ComputeExpectedResults(dataset3, op);
+
+    // Launch the reduction (1 thread per GPU)
+    ncclGroupStart();
+    for (int i = 0; i < numDevices; i++)
+    {
+      ncclAllReduce(dataset1.inputs[i], dataset1.outputs[i], numElements, dataType, op, comms[i], streams[i]);
+      ncclAllReduce(dataset2.inputs[i], dataset2.outputs[i], numElements, dataType, op, comms[i], streams[i]);
+      ncclAllReduce(dataset3.inputs[i], dataset3.outputs[i], numElements, dataType, op, comms[i], streams[i]);
+    }
+    ncclGroupEnd();
+
+    // Wait for reduction to complete
+    Synchronize();
+
+    // Check results
+    ValidateResults(dataset1);
+    ValidateResults(dataset2);
+    ValidateResults(dataset3);
+
+        dataset1.Release();
+        dataset2.Release();
+        dataset3.Release();
+  }
+
+  INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep,
+                           AllReduceGroupCorrectnessTest,
+                           testing::Combine(
+                             // Reduction operator
+                             testing::Values(ncclSum),
+                             // Data types
+                             testing::Values(ncclFloat32, ncclFloat64),
+                             // Number of elements
+                             testing::Values(1024, 1048576),
+                             // Number of devices
+                             testing::Values(2,3,4,5,6,7,8),
+                             // In-place or not
+                             testing::Values(false, true),
+                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+                           CorrectnessTest::PrintToStringParamName());
+} // namespace
@@ -0,0 +1,79 @@
+/*************************************************************************
+ * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_ALLREDUCEGROUP_HPP
+#define TEST_ALLREDUCEGROUP_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture for the grouped AllReduce correctness test. It provides the
+    // host-side reference reduction that fills dataset.expected.
+    class AllReduceGroupCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        // Computes the expected AllReduce output on the host and stores a copy of
+        // it in dataset.expected[i] for every device i.
+        //
+        // dataset - supplies inputs[], expected[], numDevices, numElements,
+        //           dataType and NumBytes()
+        // op      - reduction operator forwarded to ReduceOp() for each element pair
+        //
+        // Terminates the process with a failure status if the scratch buffer
+        // cannot be allocated or if dataset.dataType is not one of the handled types.
+        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
+        {
+            // Copy all inputs to expected arrays temporarily to perform reduction on host
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Allocate temporary host array to accumulate results
+            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
+            if (resultI1 == NULL && dataset.NumBytes() > 0)
+            {
+                fprintf(stderr, "[ERROR] Unable to allocate host buffer for expected results\n");
+                exit(EXIT_FAILURE);
+            }
+
+            // Typed views of the same scratch buffer; only the one matching
+            // dataset.dataType is used below.
+            uint8_t*  resultU1 = (uint8_t  *)resultI1;
+            int32_t*  resultI4 = (int32_t  *)resultI1;
+            uint32_t* resultU4 = (uint32_t *)resultI1;
+            int64_t*  resultI8 = (int64_t  *)resultI1;
+            uint64_t* resultU8 = (uint64_t *)resultI1;
+            float*    resultF4 = (float    *)resultI1;
+            double*   resultF8 = (double   *)resultI1;
+            rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
+
+            // Initialize the result with the first device's array
+            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+            // Perform reduction on the other device arrays
+            for (int i = 1; i < dataset.numDevices; i++)
+            {
+                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                float*    arrayF4 = (float    *)arrayI1;
+                double*   arrayF8 = (double   *)arrayI1;
+                rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
+
+                for (int j = 0; j < dataset.numElements; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                    case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
+                    default:
+                        // An unsupported type is a fatal test error: exit with a
+                        // non-zero status so the run is not reported as successful.
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(EXIT_FAILURE);
+                    }
+                }
+            }
+
+            // Copy results into expected arrays
+            for (int i = 0; i < dataset.numDevices; i++)
+                memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
+
+            free(resultI1);
+        }
+    };
+}
+
+#endif
@@ -0,0 +1,81 @@
+/*************************************************************************
+ * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "test_AllReduceGroupMultiProcess.hpp"
+
+namespace CorrectnessTests
+{
+    // Forks one child process per pair of GPUs; each child issues three grouped
+    // AllReduce calls through TestGroupCalls() and the parent then checks the
+    // children via ValidateProcesses().
+    TEST_P(AllReduceGroupMultiProcessCorrectnessTest, Correctness)
+    {
+        // Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
+        // as they appear in TestGroupCalls()
+        std::vector<ncclFunc_t> ncclFuncs;
+        ncclFuncs.push_back(ncclCollAllReduce);
+        ncclFuncs.push_back(ncclCollAllReduce);
+        ncclFuncs.push_back(ncclCollAllReduce);
+
+        // Create one dataset per operation. The Dataset objects live in anonymous
+        // shared mappings so that they stay shared with the children after fork().
+        std::vector<Dataset*> datasets(ncclFuncs.size());
+        for (size_t i = 0; i < datasets.size(); i++)
+        {
+            void* shared = mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
+            // mmap reports failure with MAP_FAILED (not NULL); stop before dereferencing it.
+            ASSERT_NE(shared, MAP_FAILED) << "mmap failed for dataset " << i;
+            datasets[i] = (Dataset*)shared;
+            datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
+        }
+
+        int const numGpusPerProcess = 2;
+        // Round up so that pids has a slot for every iteration of the fork loop
+        // below, even when numDevices is not a multiple of numGpusPerProcess.
+        int const numProcesses = (numDevices + numGpusPerProcess - 1) / numGpusPerProcess;
+        std::vector<int> pids(numProcesses);
+        int process = -1;
+
+        for (int i = 0; i < numDevices; i+= numGpusPerProcess)
+        {
+            process++;
+            // NOTE(review): a failed fork() (pid == -1) is stored in pids like a
+            // real child PID; confirm how ValidateProcesses() treats that value.
+            int pid = fork();
+            if (pid == 0)
+            {
+                // Child: own the ranks [i, lastRank), clamped to numDevices.
+                int const lastRank = (i + numGpusPerProcess > numDevices) ? numDevices : i + numGpusPerProcess;
+
+                std::vector<int> ranks;
+                for (int gpuIdx = i; gpuIdx < lastRank; gpuIdx++)
+                {
+                    ranks.push_back(gpuIdx);
+                }
+
+                bool pass = false;
+                TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
+                TerminateChildProcess(pass);
+            }
+            else
+            {
+                pids[process] = pid;
+            }
+        }
+
+        ValidateProcesses(pids);
+
+        for (size_t i = 0; i < datasets.size(); i++)
+        {
+            munmap(datasets[i], sizeof(Dataset));
+        }
+    }
+
+    INSTANTIATE_TEST_SUITE_P(AllReduceGroupMultiProcessCorrectnessSweep, // Parameter sweep for the multi-process grouped AllReduce test
+                            AllReduceGroupMultiProcessCorrectnessTest,
+                            testing::Combine(
+                                // Reduction operator (NOTE(review): TestGroupCalls forwards `op` to ncclAllReduce, so this is presumably used - confirm)
+                                testing::Values(ncclSum),
+                                // Data types (32-bit and 64-bit floating point only)
+                                testing::Values(ncclFloat32,
+                                                ncclFloat64),
+                                // Number of elements (one small and one large size)
+                                testing::Values(3072, 3145728),
+                                // Number of devices (the test forks one process per 2 GPUs)
+                                testing::Values(4,8),
+                                // In-place or not
+                                testing::Values(false, true),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), // Environment setting: clique-based kernels off and opted in (presumably applied by the fixture - confirm)
+                            CorrectnessTest::PrintToStringParamName());
+} // namespace
@@ -0,0 +1,105 @@
+/*************************************************************************
+ * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
+#define TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
+
+#include "CorrectnessTest.hpp"
+#include "test_AllReduceMultiProcess.hpp"
+#include <string>
+
+namespace CorrectnessTests
+{
+    // Fixture for the multi-process grouped AllReduce test: each child process
+    // drives a subset of ranks and issues one AllReduce per dataset inside a
+    // single ncclGroupStart()/ncclGroupEnd() pair.
+    class AllReduceGroupMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
+    {
+    public:
+        // Runs the grouped AllReduce test for the ranks owned by this process.
+        //
+        // process  - index of this process, passed to the inter-process Barrier
+        // ranks    - GPU ranks driven by this process
+        // datasets - one dataset per grouped operation
+        // funcs    - collective types, forwarded to SetUpPerProcess()
+        // pass     - output: true only if every dataset validated on every rank
+        //            (also true when the test is skipped because more devices
+        //            were requested than are available)
+        void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
+        {
+            ncclGroupStart();
+            for (size_t i = 0; i < ranks.size(); i++)
+            {
+                SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
+                if (numDevices > numDevicesAvailable)
+                {
+                    break;
+                }
+            }
+            ncclGroupEnd();
+
+            if (numDevices > numDevicesAvailable)
+            {
+                pass = true;
+                return;
+            }
+
+            int numProcesses = numDevices / ranks.size();
+            Barrier barrier(process, numProcesses, std::atoi(getenv("NCCL_COMM_ID")));
+
+            for (size_t i = 0; i < ranks.size(); i++)
+            {
+                for (size_t j = 0; j < datasets.size(); j++)
+                {
+                    FillDatasetWithPattern(*datasets[j], ranks[i]);
+                }
+            }
+
+            int const root = 0;
+
+            // Compute the reference result for every dataset (not a fixed count).
+            for (size_t i = 0; i < datasets.size(); i++)
+            {
+                AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[i], barrier, op, ranks);
+            }
+            barrier.Wait();
+
+            ncclGroupStart();
+            // AllReduce: one call per dataset for each rank owned by this process
+            for (size_t i = 0; i < ranks.size(); i++)
+            {
+                int rank = ranks[i];
+                for (size_t j = 0; j < datasets.size(); j++)
+                {
+                    ncclAllReduce(datasets[j]->inputs[rank], datasets[j]->outputs[rank],
+                                  numElements, dataType, op, comms[rank], streams[rank]);
+                }
+            }
+            // Signal end of group call
+            ncclGroupEnd();
+
+            for (size_t i = 0; i < ranks.size(); i++)
+            {
+                HIP_CALL(hipSetDevice(ranks[i]));
+                HIP_CALL(hipStreamSynchronize(streams[ranks[i]]));
+            }
+
+            // Accumulate the verdict across all datasets: a failure must stay
+            // recorded even if a later dataset validates successfully.
+            pass = true;
+            for (size_t i = 0; i < datasets.size(); i++)
+            {
+                for (size_t j = 0; j < ranks.size(); j++)
+                {
+                    if (!ValidateResults(*datasets[i], ranks[j], root))
+                    {
+                        pass = false;
+                        break;
+                    }
+                }
+                // Keep going after a failure so that every process still reaches
+                // the barrier once per dataset and releases its buffers.
+                barrier.Wait();
+                for (size_t j = 0; j < ranks.size(); j++)
+                {
+                    datasets[i]->Release(ranks[j]);
+                }
+            }
+
+            for (size_t i = 0; i < ranks.size(); i++)
+            {
+                TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
+            }
+        }
+    };
+}
+
+#endif
@@ -8,53 +8,53 @@

 namespace CorrectnessTests
 {
-    TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness)
+  TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness) // Runs AllReduce with one forked child process per device
+  {
+    dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
+    std::vector<int> pids(numDevices); // one slot per forked child
+
+    int gpu = -1;
+    for (int i = 0; i < numDevices; i++)
    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestAllReduce(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
+      gpu++;
+      int pid = fork(); // spawn the child that drives device index `gpu`
+      if (pid == 0) // child: run the AllReduce test, then terminate with its verdict
+      {
+        bool pass;
+        TestAllReduce(gpu, *dataset, pass);
+        TerminateChildProcess(pass);
+      }
+      else // parent: remember the child's PID
+      {
+        pids[gpu] = pid;
+      }
    }

-    INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep,
-                            AllReduceMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
+    ValidateProcesses(pids); // NOTE(review): presumably waits on every child and checks its exit status - confirm
+  }
+
+  INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep, // Parameter sweep for the multi-process AllReduce test
+                           AllReduceMultiProcessCorrectnessTest,
+                           testing::Combine(
+                             // Reduction operator
+                             testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+                             // Data types (ncclFloat16 is deliberately left out of the sweep below)
+                             testing::Values(ncclInt8,
+                                             ncclUint8,
+                                             ncclInt32,
+                                             ncclUint32,
+                                             ncclInt64,
+                                             ncclUint64,
+                                             //ncclFloat16,
+                                             ncclFloat32,
+                                             ncclFloat64,
+                                             ncclBfloat16),
+                             // Number of elements (one small and one large size)
+                             testing::Values(1024, 1048576),
+                             // Number of devices (the test forks one process per device)
+                             testing::Values(2,3,4,8),
+                             // In-place or not
+                             testing::Values(false, true),
+                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), // Environment setting: clique-based kernels off and opted in (presumably applied by the fixture - confirm)
+                           CorrectnessTest::PrintToStringParamName());
 } // namespace