Merge pull request #186 from wenkaidu/v2.6.4

Merge with NCCL 2.6.4
2020-04-02 10:42:01 -07:00
commit 3cbe5c8a40
@@ -111,6 +111,7 @@ set(CC_SOURCES
    src/graph/connect.cc
    src/graph/tuning.cc
    src/graph/topo.cc
+    src/graph/xml.cc
    src/collectives/all_reduce.cc
    src/collectives/all_gather.cc
    src/collectives/reduce.cc
@@ -122,6 +123,7 @@ set(CC_SOURCES
    src/misc/utils.cc
    src/misc/ibvwrap.cc
    src/misc/nvmlwrap_stub.cc
+    src/transport/coll_net.cc
    src/transport/net.cc
    src/transport/net_ib.cc
    src/transport/net_socket.cc
@@ -1,6 +1,5 @@
 #
 # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 5
-NCCL_PATCH   := 7
+NCCL_MINOR   := 6
+NCCL_PATCH   := 4
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -1,6 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2015-2020, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -12,9 +11,9 @@ include ../makefiles/version.mk
 INCEXPORTS  := nccl.h nccl_net.h
 LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
                misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
-		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
-                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
+                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc

 ##### lib files
 LIBNAME     := libnccl.so
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,24 +7,32 @@

 #include "channel.h"
 #include "param.h"
+#include "graph.h"

-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", -2);

 ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* channel = comm->channels+channelid;
  channel->id = channelid;

  // Setup intermediate buffering
-  channel->buffSize = ncclParamBuffsize();
+  int buffSize = ncclParamBuffsize();
+  int cpuArch, cpuVendor, cpuModel;
+  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+  channel->buffSize = buffSize != -2 ? buffSize :
+	  cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;

  // Ring index to user rank table.
  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));

  // Communication structures with peers.
-  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
-  for (size_t i=0; i<comm->nRanks; ++i) {
+  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
+  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
+  for (size_t i=0; i<comm->nRanks+1; ++i) {
    channel->peers[i].send.comm = comm;
    channel->peers[i].recv.comm = comm;
  }
@@ -43,9 +51,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
  CUDACHECK(hipFree(channel->ring.devUserRanks));

  // Free transport proxy resources
-  for (int r=0; r<nRanks; r++) {
+  // Note: free all send resources first due to CollNet arrangement
+  for (int r=0; r<nRanks+1; r++) {
    struct ncclPeer* peer = channel->peers+r;
    if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+  }
+  for (int r=0; r<nRanks+1; r++) {
+    struct ncclPeer* peer = channel->peers+r;
    if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
  }

@@ -72,6 +72,10 @@ template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }

+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { } // empty body: this AllGather CollNet variant does no work
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
@@ -135,6 +139,10 @@ template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }

+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { } // empty body: this AllGather CollNet LL variant does no work
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -200,3 +208,7 @@ __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { } // empty body: this AllGather CollNet LL128 variant does no work
@@ -163,6 +163,63 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
  } while(0);
 }

+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) { // CollNet all-reduce: blocks with blockIdx.x < args->nChannels run the up (reduce) phase, the remaining blocks run the down (broadcast) phase
+  const int tid = threadIdx.x;
+  const int nthreads = args->nThreads;
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x; // channel is selected by the thread block index
+  const ssize_t size = args->N;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); // per-step share of the channel buffer, presumably in elements of T -- confirm buffSize is in bytes
+  int chunkSize = args->lastChunkSize; // lastChunkSize doubles as the initial chunk size
+  const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+  const ssize_t loopSize = args->nChannels*chunkSize; // amount covered by one pass over nChannels chunks (computed from the initial chunkSize)
+
+  if (loopSize > size) { // one pass would exceed the data: derive the chunk from size instead
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; // result is a multiple of minChunkSize (DIVUP presumably rounds up -- confirm)
+  }
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+    struct ncclTree* tree = &channel->collTreeUp;
+    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount); // peers passed as (down, up); presumably recv-from then send-to -- confirm against ncclPrimitives
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize; // start of this block's chunk within the current pass
+      int nelem = min(chunkSize, size-offset); // clamp the chunk to what remains after offset
+      if (tree->up == -1) { // no up peer (presumably the tree root -- confirm)
+        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) { // first down peer unset (presumably a leaf -- confirm)
+        prims.send(thisInput+offset, nelem);
+      } else {
+        prims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  }
+
+  if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+    struct ncclTree* tree = &channel->collTreeDn;
+    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); // peer arguments swapped relative to the up phase: (up, down)
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize; // same chunk layout as the up phase
+      int nelem = min(chunkSize, size-offset); // clamp the chunk to what remains after offset
+      if (tree->up == -1) { // no up peer: only sends from the output buffer
+        prims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) { // first down peer unset: only receives into the output buffer
+        prims.recv(thisOutput+offset, nelem);
+      } else {
+        prims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  }
+}
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
@@ -298,6 +355,62 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
  } while(0);
 }

+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) { // LL-protocol CollNet all-reduce: blocks with blockIdx.x < args->nChannels run the up (reduce) phase, the remaining blocks run the down (broadcast) phase
+  const int tid = threadIdx.x;
+  const int nthreads = args->nThreads;
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x; // channel is selected by the thread block index
+  const ssize_t size = args->N;
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); // initial chunk: NCCL_LL_SLICE_LINES 8-byte words expressed in elements of T
+  const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T); // one 8-byte word per thread, expressed in elements of T
+  const ssize_t loopSize = args->nChannels*chunkSize; // amount covered by one pass over nChannels chunks (computed from the initial chunkSize)
+
+  if (loopSize > size) { // one pass would exceed the data: derive the chunk from size instead
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; // result is a multiple of minChunkSize (DIVUP presumably rounds up -- confirm)
+  }
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+    struct ncclTree* tree = &channel->collTreeUp;
+    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); // peers passed as (down, up); presumably recv-from then send-to -- confirm against ncclLLPrimitives
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize; // start of this block's chunk within the current pass
+      int nelem = min(chunkSize, size-offset); // clamp the chunk to what remains after offset
+      if (tree->up == -1) { // no up peer (presumably the tree root -- confirm)
+        LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) { // first down peer unset (presumably a leaf -- confirm)
+        LLprims.send(thisInput+offset, nelem);
+      } else {
+        LLprims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  }
+
+  if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+    struct ncclTree* tree = &channel->collTreeDn;
+    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount); // peer arguments swapped relative to the up phase: (up, down)
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize; // same chunk layout as the up phase
+      int nelem = min(chunkSize, size-offset); // clamp the chunk to what remains after offset
+      if (tree->up == -1) { // no up peer: only sends from the output buffer
+        LLprims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) { // first down peer unset: only receives into the output buffer
+        LLprims.recv(thisOutput+offset, nelem);
+      } else {
+        LLprims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  }
+}
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -437,3 +550,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
    }
  }
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { } // empty body: this AllReduce CollNet LL128 variant does no work
@@ -73,6 +73,10 @@ template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }

+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { } // empty body: this Broadcast CollNet variant does no work
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
@@ -122,6 +126,10 @@ template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }

+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { } // empty body: this Broadcast CollNet LL variant does no work
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -171,3 +179,7 @@ __device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { } // empty body: this Broadcast CollNet LL128 variant does no work
@@ -1,4 +1,3 @@
-#include "hip/hip_runtime.h"
 /*************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
@@ -51,7 +50,8 @@ static inline __device__ void exitIfAbortBarrier(int abort) {

 #define NCCL_FUNC4(coll, op, dtype) \
  NCCL_FUNC5(coll##Tree, op, dtype), \
-  NCCL_FUNC5(coll##Ring, op, dtype)
+  NCCL_FUNC5(coll##Ring, op, dtype), \
+  NCCL_FUNC5(coll##CollNet, op, dtype)

 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -133,24 +133,30 @@ struct Caller<f, f + 1>{
 inline
 __device__
 void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
-  if (c->funcIndex < 240) {
-    if (c->funcIndex % 6 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 2) ncclBroadcastTree_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
-    else ncclBroadcastRing_copy_i8(&c->args);
+  if (c->funcIndex < 360) {
+    if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
+    else ncclBroadcastCollNet_copy_i8(&c->args);
  }
-  else if (c->funcIndex < 480) Caller<240, 480>::call(c);
-  else if (c->funcIndex < 720) {
-    if (c->funcIndex % 6 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 2) ncclAllGatherTree_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
-    else if (c->funcIndex % 6 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
-    else ncclAllGatherRing_copy_i8(&c->args);
+  else if (c->funcIndex < 720) Caller<360, 720>::call(c);
+  else if (c->funcIndex < 1080) {
+    if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
+    else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
+    else ncclAllGatherCollNet_copy_i8(&c->args);
  }
-  else Caller<720, 1200>::call(c);
+  else Caller<1080, 1800>::call(c);
 }

 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
@@ -274,7 +280,8 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \

 #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
-  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
+  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
+  IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)

 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
  IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8) \
@@ -346,14 +346,9 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
 template <typename T>
 __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-// Use UNROLL 4 for 2 SRCs, 2 for the rest
-#define AUTOUNROLL (UNROLL*(2/MINSRCS))
-#else
 // Try to limit consecutive load/stores to 8.
 // Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
 #define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
-#endif

 template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
 __device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
@@ -20,7 +20,8 @@ NCCL_FUNC5(coll, op, dtype) \

 #define NCCL_FUNC4(coll, op, dtype) \
  NCCL_FUNC5(coll##Tree, op, dtype), \
-  NCCL_FUNC5(coll##Ring, op, dtype)
+  NCCL_FUNC5(coll##Ring, op, dtype), \
+  NCCL_FUNC5(coll##CollNet, op, dtype)

 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -283,7 +283,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
    r.recvStep[i] = ROUNDUP(r.recvStep[i], SLICESPERCHUNK*SLICESTEPS);
 #if defined(RCCL_USE_DIRECT_BUFFER)
    r.recvDirectBuff[i] = NULL;
-    if (directBuff && LOAD(&conn->direct)) {
+    if (directBuff && (LOAD(&conn->direct) & NCCL_DIRECT_GPU)) { // load the flags first, then mask; the mask must not be applied to the field's address
      r.recvDirectBuff[i] = directBuff;
      if (tid == 0) STORE(conn->ptrExchange, directBuff);
    }
@@ -307,13 +307,13 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
    }
  }

-  __device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
    s.sendBuff[i] = (T*)LOAD(&conn->buff);
    s.sendStep[i] = LOAD(&conn->step);
    s.sendStep[i] = ROUNDUP(s.sendStep[i], SLICESPERCHUNK*SLICESTEPS);
 #if defined(RCCL_USE_DIRECT_BUFFER)
    s.sendDirectBuff[i] = NULL;
-    if (directBuff && LOAD(&conn->direct)) {
+    if (directBuff && (LOAD(&conn->direct) & NCCL_DIRECT_GPU)) { // load the flags first, then mask; the mask must not be applied to the field's address
      void* volatile* ptr = LOAD(&conn->ptrExchange);
      while ((s.sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
      barrier();
@@ -324,7 +324,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
    if (wid == i) s.sendConnTail = s.sendConnHead = s.sendStep[i]; // Make sure we set this after rounding up
    nsend++;
  }
-  __device__ void loadSendSync() {
+  __device__ __forceinline__ void loadSendSync() {
    if (tid < nsend) {
      s.sendConnHeadPtr = LOAD(&s.sendConn->head);
      s.sendConnHeadCache = LOAD(s.sendConnHeadPtr);
@@ -336,7 +336,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
    }
  }

-  __device__ void saveRecvSync() {
+  __device__ __forceinline__ void saveRecvSync() {
    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
      STORE(&r.recvConn->step, r.recvConnHead);
      STORE(r.recvConn->opCountLoc, opCount+1);
@@ -344,7 +344,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
    }
  }

-  __device__ void saveSendSync() {
+  __device__ __forceinline__ void saveSendSync() {
    if (tid < nsend) {
      STORE(&s.sendConn->step, s.sendConnHead);
      STORE(s.sendConn->opCountLoc, opCount+1);
@@ -53,6 +53,10 @@ template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }

+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { } // empty body: this Reduce CollNet variant does no work
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
@@ -99,6 +103,10 @@ template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }

+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { } // empty body: this Reduce CollNet LL variant does no work
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -145,3 +153,7 @@ __device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { } // empty body: this Reduce CollNet LL128 variant does no work
@@ -67,6 +67,10 @@ template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }

+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { } // empty body: this ReduceScatter CollNet variant does no work
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
@@ -127,6 +131,10 @@ template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }

+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { } // empty body: this ReduceScatter CollNet LL variant does no work
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -189,3 +197,7 @@ __device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { } // empty body: this ReduceScatter CollNet LL128 variant does no work
@@ -108,7 +108,6 @@ void ncclDebugInit() {
    if (debugFn[0] != '\0') {
      FILE *file = fopen(debugFn, "w");
      if (file != NULL) {
-        INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
        ncclDebugFile = file;
      }
    }
@@ -126,7 +125,7 @@ void ncclDebugInit() {
 */
 void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel == -1) ncclDebugInit();
-  if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;
+  if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }

  char hostname[1024];
  getHostName(hostname, 1024, '.');
@@ -136,7 +135,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  char buffer[1024];
  size_t len = 0;
  pthread_mutex_lock(&ncclDebugLock);
-  if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n");
  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
    len = snprintf(buffer, sizeof(buffer),
                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
@@ -7,6 +7,7 @@

 #include "enqueue.h"
 #include "argcheck.h"
+#include "coll_net.h"

 // Only generate inline kernels for LL
 #define NCCL_FUNC5(coll, op, dtype) \
@@ -16,7 +17,8 @@

 #define NCCL_FUNC4(coll, op, dtype) \
  NCCL_FUNC5(coll##Tree, op, dtype), \
-  NCCL_FUNC5(coll##Ring, op, dtype)
+  NCCL_FUNC5(coll##Ring, op, dtype), \
+  NCCL_FUNC5(coll##CollNet, op, dtype)

 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -195,7 +197,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
    hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
  }
  // Start the network proxies as soon as the kernel has been launched. We can't
-  // perform any CUDA call between the two or having a hipFree between the CUDA
+  // perform any CUDA call between the two or having a cudaFree between the CUDA
  // launch and the transportStartProxy call could cause a deadlock.
  // Also, starting the proxies after the CUDA launch seems to be better for
  // performance (latency).
@@ -227,35 +229,23 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
 /* Enqueueing system : computation of kernel and proxy operations parameters */
 /*****************************************************************************/

-// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
-// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
-static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
-  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .84,  .49,  .42,  .60,  .75,  .87,  .94,  .94,  .99,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
-  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .84,  .49,  .42,  .60,  .75,  .87,  .94,  .94,  .99,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
-  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .41,  .27,  .25,  .39,  .46,  .72,  .76,  .87,  .92,  .97,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 }
-};
-
-static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
-  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .25,  .41,  .55,  .56,  .78,  .94,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
-  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .25,  .41,  .55,  .56,  .78,  .94,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
-  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .04,  .08,  .09,  .09,  .11,  .13,  .25,  .40,  .59,  .76,  .86,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 }
-};
-
 static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
  struct ncclComm* comm = info->comm;
-  float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
+  float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
  // Find algorithm / protocol.
  info->algorithm = -1;
  info->protocol = -1;
-  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+  int nAlgos = NCCL_NUM_ALGORITHMS;
+  // Check collNet support
+  int collNetTypeSupport = 0;
+  if (info->comm->collNetSupport)
+    NCCLCHECK(collNetReduceSupport(info->datatype, info->op, &collNetTypeSupport));
+  if (collNetTypeSupport != 1) nAlgos--;
+  for (int a=0; a<nAlgos; a++) {
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      float bw = comm->bandwidths[info->coll][a][p];
-      if (bw == 0) continue;
-      int logSize = log2i(info->nBytes>>6);
-      if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize];
-      else if (a == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[p][logSize];
-      float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw);
-      if (time < minTime) {
+      float time;
+      NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, &time));
+      if (time >= 0 && time < minTime) {
        info->algorithm = a;
        info->protocol = p;
        minTime = time;
@@ -266,15 +256,14 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
    WARN("Error : no algorithm/protocol available");
    return ncclInternalError;
  }
-
-  if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, (int)minTime);
+  //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
  TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);

-  int nc = comm->nChannels;
-  int nt = comm->maxThreads[info->protocol];
+  int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
+  int nt = comm->maxThreads[info->algorithm][info->protocol];
  int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
  while (info->nBytes < nc*nt*threadThreshold) {
-    if (nc >= 2) nc--;
+    if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // do not reduce threads count on VEGA
 #else
@@ -301,7 +290,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
    case ncclCollAllGather:
      info->pattern = ncclPatternRing; break;
    case ncclCollAllReduce:
-      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
+      info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
    default:
      WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
      return ncclInternalError;
@@ -316,6 +305,8 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
    case ncclPatternTreeUpDown:
    case ncclPatternPipelineFrom:
    case ncclPatternPipelineTo:
+    case ncclPatternCollTreeUp:
+    case ncclPatternCollTreeDown:
      info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
    case ncclPatternRing:
      info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
@@ -360,6 +351,13 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
    }
    // Use lastChunkSize as chunkSize
    coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+  } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
+    // Optimize chunkSize / nSteps
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
+    while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
+    // Use lastChunkSize as chunkSize
+    coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
  } else if (info->protocol == NCCL_PROTO_LL) {
    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
    const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
@@ -384,6 +382,8 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
  proxyArgs->chunkSteps = chunkSteps;
  proxyArgs->protocol = info->protocol;
  proxyArgs->opCount = info->comm->opCount;
+  proxyArgs->dtype = info->datatype;
+  proxyArgs->redOp = info->op;
  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
      nLoops, proxyArgs->nsteps, info->comm);
@@ -410,8 +410,11 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
    WARN("Error : mixing different streams within a group call is not supported.");
    return ncclInvalidUsage;
  }
-  for (int bid=0; bid<coll.args.nChannels; bid++) {
-    struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
+
+  int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
+  for (int bid=0; bid<coll.args.nChannels*nSubChannels; bid++) {
+    int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
+    struct ncclChannel* channel = info->comm->channels+channelId;

    if (channel->collCount == NCCL_MAX_OPS) {
      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
@@ -420,6 +423,10 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {

    // Proxy
    proxyArgs.channel = channel;
+    // Adjust pattern for CollNet based on channel index
+    if (nSubChannels == 2) {
+      info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
+    }
    NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));

    info->comm->myParams->gridDim.x++;
@@ -431,7 +438,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {

    memcpy(c, &coll, sizeof(struct ncclColl));

-    c->args.bid = bid;
+    c->args.bid = bid % coll.args.nChannels;
    STORE(&c->active, 1);
    opIndex = (opIndex+1)%NCCL_MAX_OPS;
    c->nextIndex = opIndex;
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -15,7 +15,7 @@
 /******************************************************************/

 ncclResult_t ncclTopoPreset(struct ncclComm* comm,
-    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
    struct ncclTopoRanks* topoRanks) {
  int rank = comm->rank;
  int localRanks = comm->localRanks;
@@ -28,9 +28,14 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
    channel->treeDn.up = -1;
    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
+    channel->collTreeUp.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
+    channel->collTreeDn.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;

    int* ringIntra = ringGraph->intra+c*localRanks;
    int* treeIntra = treeGraph->intra+c*localRanks;
+    int* collNetIntra = collNetGraph->intra+c*localRanks;

    for (int i=0; i<localRanks; i++) {
      if (ringIntra[i] == rank) {
@@ -58,6 +63,16 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
        channel->treeUp.down[0]  = sym ? channel->treeDn.down[0]  : channel->treeDn.up ;
        channel->treeUp.up       = sym ? channel->treeDn.up       : channel->treeDn.down[0];
      }
+      if (collNetIntra[i] == rank) {
+        int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
+
+        // CollTrees are always symmetric, i.e.
+        // up/down go in reverse directions
+        channel->collTreeDn.up      = collNetIntra[prev];
+        channel->collTreeDn.down[0] = collNetIntra[next];
+        channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
+        channel->collTreeUp.up      = channel->collTreeDn.up;
+      }
    }
    topoRanks->ringPrev[c] = channel->ring.prev;
    topoRanks->ringNext[c] = channel->ring.next;
@@ -175,6 +190,40 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* tr
  return ncclSuccess;
 }

+// Finalize the CollNet trees of `rank` on every channel.
+// The first half of the channels takes its master from the "send" GPU index of
+// the CollNet graph, the second half from the "recv" GPU index (always 0).
+// On each channel, the master gets the virtual id `nranks` as up peer and the
+// last GPU of the intra-node chain gets no down peer. Always returns ncclSuccess.
+ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) {
+  const int nranks = comm->nRanks;
+  const int localRanks = comm->localRanks;
+  const int halfChannels = comm->nChannels/2;
+  const int depth = nranks/comm->nNodes;
+
+  // First half of the channels: the send GPU index depends on the topo pattern.
+  const int sendIndex = (collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE) ? 0 : 1;
+  const int sendEndIndex = (sendIndex+localRanks-1)%localRanks;
+  for (int c=0; c<halfChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    const int* intra = collNetGraph->intra+c*localRanks;
+    if (intra[sendIndex] == rank) {
+      // Master: the root of the collTree is identified by id nranks.
+      channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+    }
+    if (intra[sendEndIndex] == rank) {
+      // Bottom of the intra-node chain: nothing below us.
+      channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+    }
+    channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
+    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
+  }
+
+  // Second half of the channels: the recv GPU index is always 0.
+  const int recvIndex = 0;
+  const int recvEndIndex = (recvIndex+localRanks-1)%localRanks;
+  for (int c=0; c<halfChannels; c++) {
+    struct ncclChannel* channel = comm->channels+halfChannels+c;
+    const int* intra = collNetGraph->intra+c*localRanks;
+    if (intra[recvIndex] == rank) {
+      // Master: the root of the collTree is identified by id nranks.
+      channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+    }
+    if (intra[recvEndIndex] == rank) {
+      // Bottom of the intra-node chain: nothing below us.
+      channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+    }
+    channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
+    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
+  }
+  return ncclSuccess;
+}
+
 // Legacy naming
 NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
 NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
@@ -1,5 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -42,7 +43,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
  NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
  basePath->count = 0;
  basePath->width = LOC_WIDTH;
-  basePath->type = LINK_LOC;
+  basePath->type = PATH_LOC;

  while (nodeList.count) {
    nextNodeList.count = 0;
@@ -58,7 +59,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
        }
        struct ncclTopoLinkList* remPath;
        NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
-        int width = std::min(path->width, link->width);
+        float width = std::min(path->width, link->width);
        if (remPath->width < width) {
          // Find reverse link
          for (int l=0; l<remNode->nlinks; l++) {
@@ -68,8 +69,8 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
            }
          }
          if (remPath->list[0] == NULL) {
-            WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d",
-                 remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+            WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
+                 remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
            return ncclInternalError;
          }
          // Copy the rest of the path
@@ -77,9 +78,17 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
          remPath->count = path->count + 1;
          remPath->width = width;

-          // Consider the path is QPI when going through the CPU
-          // Also don't consider LINK_NET as we only care about the NIC->GPU path.
-          int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+          // Start with path type = link type. PATH and LINK types are supposed to match.
+          // Don't consider LINK_NET as we only care about the NIC->GPU path.
+          int type = link->type == LINK_NET ? 0 : link->type;
+          // Differentiate between one and multiple PCI switches
+          if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
+          // Consider a path going through the CPU as PATH_PHB
+          if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
+          // Ignore Power CPU in an NVLink path
+          if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
+              link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
+
          remPath->type = std::max(path->type, type);

          // Add to the list for the next iteration if not already in the list
@@ -117,9 +126,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
        sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
        offset = strlen(line);
      }
-      INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+      INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width);
 #else
-      sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+      sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]);
      offset = strlen(line);
 #endif
    }
@@ -171,7 +180,7 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int

  // Update path characteristics
  srcNode->paths[t2][i2].count = l;
-  srcNode->paths[t2][i2].type = LINK_QPI;
+  srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type);
  srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
  return ncclSuccess;
 }
@@ -194,6 +203,131 @@ static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType)
  }
 }

+// Translation table from the old numeric level values to the PATH_* types.
+static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };
+
+// Resolve a path-level setting from the environment, once.
+//   level      : in/out cache. -1 means "not resolved yet"; on return it holds
+//                the resolved level, or -2 when the environment sets nothing.
+//   disableEnv : optional variable name; a value of 1 forces level 0.
+//   levelEnv   : variable holding either a path type name or an old-style number.
+// Always returns ncclSuccess.
+ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
+  // Already resolved by a previous call: keep the cached value.
+  if (*level != -1) return ncclSuccess;
+
+  int resolved = -1;
+
+  // The disable variable takes precedence over the level variable.
+  if (disableEnv) {
+    const char* disableStr = getenv(disableEnv);
+    if (disableStr) {
+      int disableValue = strtol(disableStr, NULL, 0);
+      if (disableValue == 1) resolved = 0;
+    }
+  }
+
+  if (resolved == -1) {
+    const char* levelStr = getenv(levelEnv);
+    if (levelStr) {
+      // New style: the value is the name of a path type.
+      for (int i=0; i<PATH_NET && resolved == -1; i++) {
+        if (strcmp(levelStr, topoPathTypeStr[i]) == 0) resolved = i;
+      }
+      // Old style numbering, clamped to the last entry of the table.
+      if (resolved == -1 && levelStr[0] >= '0' && levelStr[0] <= '9') {
+        const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
+        int oldLevel = strtol(levelStr, NULL, 0);
+        resolved = levelsOldToNew[oldLevel > maxOldLevel ? maxOldLevel : oldLevel];
+      }
+    }
+  }
+
+  if (resolved >= 0) {
+    INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[resolved]);
+    *level = resolved;
+  } else {
+    *level = -2;
+  }
+  return ncclSuccess;
+}
+
+// Cached user P2P level; -1 until ncclGetLevel() resolves it.
+int ncclTopoUserP2pLevel = -1;
+
+// Decide whether P2P may be used between the GPUs with ids id1 and id2.
+// Writes 1 to *p2p when the path type from id1 to id2 does not exceed the
+// allowed level, 0 otherwise (including when id2 is not in the topology).
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) {
+  *p2p = 0;
+
+  // Locate both GPUs in the topology.
+  int srcIndex, dstIndex;
+  NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &srcIndex));
+  if (ncclTopoIdToIndex(system, GPU, id2, &dstIndex) == ncclInternalError) {
+    // Second GPU not found: P2P stays disabled.
+    return ncclSuccess;
+  }
+  struct ncclTopoLinkList* path = system->nodes[GPU].nodes[srcIndex].paths[GPU]+dstIndex;
+
+  // Default policy: P2P whenever possible, except through ARM CPUs and
+  // Intel BDW CPUs where it is limited to PATH_PXB.
+  int arch, vendor, model;
+  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+  const bool isArm = (arch == NCCL_TOPO_CPU_ARCH_ARM);
+  const bool isIntelBdw = (arch == NCCL_TOPO_CPU_ARCH_X86 &&
+      vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
+      model == NCCL_TOPO_CPU_TYPE_BDW);
+  int p2pLevel = (isArm || isIntelBdw) ? PATH_PXB : PATH_SYS;
+
+  // An explicit user setting replaces the default policy.
+  NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
+  if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel;
+
+  // Compare the path type with the allowed level.
+  if (path->type <= p2pLevel) *p2p = 1;
+
+  return ncclSuccess;
+}
+
+NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
+int ncclTopoUserGdrLevel = -1; // cached user GDR level; -1 until ncclGetLevel() resolves it
+
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) { // Decide whether GPU Direct RDMA is used between GPU busId and NIC netDev; writes 0 or 1 to *useGdr
+  *useGdr = 0; // every early return below leaves GDR disabled
+
+  // Look up the NET node (by netDev) and the GPU node (by busId) in the topology
+  int n, g;
+  NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
+  struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+
+  // Check that both the NIC and the GPU support it
+  if (net->net.gdrSupport == 0) return ncclSuccess;
+  if (gpu->gpu.gdrSupport == 0) return ncclSuccess;
+
+  if (read) { // For reads (sends) only enable under certain conditions
+    int gdrReadParam = ncclParamNetGdrRead();
+    if (gdrReadParam == 0) return ncclSuccess; // explicitly disabled
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    return ncclSuccess; // HIP builds: GDR is never enabled for reads
+#else
+    if (gdrReadParam < 0) { // negative value (presumably the -2 default, i.e. unset): only allow reads when the GPU has an NVLink peer
+      int nvlink = 0;
+      // Since we don't know whether there are other communicators,
+      // it's better to keep things local if we have a single GPU.
+      if (system->nodes[GPU].count == 1) nvlink = 1;
+      for (int i=0; i<system->nodes[GPU].count; i++) {
+        if (i == g) continue;
+        if (gpu->paths[GPU][i].type == PATH_NVL) {
+          nvlink = 1;
+          break;
+        }
+      }
+      if (!nvlink) return ncclSuccess;
+    }
+#endif
+  }
+
+  // Check if we are close enough that it makes sense to enable GDR:
+  // the GPU->NIC path type must not exceed netGdrLevel (PATH_PXB unless overridden by NCCL_NET_GDR_LEVEL)
+  int netGdrLevel = PATH_PXB;
+  NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
+  if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
+  int distance = gpu->paths[NET][n].type;
+  if (distance > netGdrLevel) {
+    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
+    return ncclSuccess;
+  }
+
+  *useGdr = 1;
+  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
  // Precompute paths between GPUs/NICs.

@@ -210,26 +344,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
    // Compute paths to GPU g
    NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));

+    // Update path when we don't want to / can't use GPU Direct P2P
+    for (int p=0; p<system->nodes[GPU].count; p++) {
+      int p2p;
+      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p));
+      if (p2p == 0) {
+        // Divert all traffic through the CPU
+        int cpu;
+        NCCLCHECK(getLocalCpu(system, g, &cpu));
+        NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+      }
+    }
+
    if (peerInfos == NULL) continue;
-    // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
-    struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
+    // Remove GPUs we can't talk to because of containers.
+    struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank;
    for (int p=0; p<system->nodes[GPU].count; p++) {
      if (p == g) continue;
-      struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
-      int p2p;
-      NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
-      if (p2p == 0) {
-        int shm;
-        NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
-        if (shm == 1) {
-          // We cannot use GPU Direct, so we need all traffic to go through a CPU
-          int cpu;
-          NCCLCHECK(getLocalCpu(system, g, &cpu));
-          NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
-        } else {
-          // We cannot communicate with that peer.
-          system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
-        }
+      struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank;
+      int shm;
+      NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+      if (shm == 0) {
+        // Mark this peer as inaccessible. We'll trim it later.
+        system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
      }
    }
  }
@@ -239,11 +376,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
    struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
    NCCLCHECK(ncclTopoSetPaths(netNode, system));

-    if (peerInfos == NULL) continue;
    for (int g=0; g<system->nodes[GPU].count; g++) {
-      if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
-        // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
-        // to go through a CPU
+      // Update path when we dont want to / can't use GPU Direct RDMA.
+      int gdr;
+      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
+      if (gdr == 0) {
+        // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
        int localCpu;
        NCCLCHECK(getLocalCpu(system, g, &localCpu));
        NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
@@ -251,7 +389,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
      }
    }
  }
-
  return ncclSuccess;
 }

@@ -270,7 +407,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
        domains[g] = std::min(domains[g], domains[p]);
      }
    }
-    if (gpu->rank == comm->rank) myDomain = domains[g];
+    if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
  }

  int ngpus = system->nodes[GPU].count;
@@ -288,98 +425,19 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
      free(ids);
      return ncclInternalError;
    }
-
-    // Remove GPUs I can't access (even indirectly) from my view of the node
-    for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
-      for (int n=0; n<system->nodes[t].count; n++) {
-        struct ncclTopoNode* node = system->nodes[t].nodes+n;
-        if (node == gpu) continue;
-        for (int l=0; l<node->nlinks; l++) {
-          while (l<node->nlinks && node->links[l].remNode == gpu) {
-            if (l<node->nlinks-1)
-              memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
-            node->nlinks--;
-          }
-          if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
-            node->links[l].remNode--;
-          }
-        }
-      }
-    }
-    if (g != system->nodes[GPU].count-1)
-      memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
-    system->nodes[GPU].count--;
+    NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
  }

  comm->localRanks = system->nodes[GPU].count;
  if (system->nodes[GPU].count == comm->nRanks) {
-    // Trim network
-    ncclTopoRemovePathType(system, NET);
-    system->nodes[NET].count = 0;
-    for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
-      for (int n=0; n<system->nodes[t].count; n++) {
-        struct ncclTopoNode* node = system->nodes[t].nodes+n;
-        for (int l=0; l<node->nlinks; l++) {
-          struct ncclTopoLink* link = &(node->links[l]);
-          if (link->remNode->type == NET) {
-            // Remove the link
-            for (int i=l; i<(node->nlinks-1); i++) {
-              memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink));
-            }
-            node->nlinks--;
-            l--;  // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop
-          }
-        }
-      }
-    }
+    for (int n=system->nodes[NET].count-1; n>=0; n--)
+      NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
  }
  free(domains);
  free(ids);
  return ncclSuccess;
 }

-static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
-  int nvlSpeed = 0;
-  int nvlPeers = 0;
-  int pciSpeed = 0;
-  for (int l=0; l<node->nlinks; l++) {
-    if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
-    if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2;
-    if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
-  }
-  *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
-  // Compute max speed to try to accelerate the search.
-  system->maxSpeed = LOC_WIDTH;
-
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
-  }
-  if (system->nodes[NET].count) {
-    // Try to assign one NIC per GPU
-    int netMaxSpeed = 0;
-    int netMaxSpeedCount = 0;
-    for (int n=0; n<system->nodes[NET].count; n++) {
-      int maxSpeed = 0;
-      struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-      for (int g=0; g<system->nodes[GPU].count; g++) {
-        maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width);
-      }
-      if (maxSpeed > netMaxSpeed) {
-        netMaxSpeed = maxSpeed;
-        netMaxSpeedCount = 1;
-      } else if (maxSpeed == netMaxSpeed) {
-        netMaxSpeedCount++;
-      }
-    }
-    system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH);
-  }
-  return ncclSuccess;
-}
-
 void ncclTopoFree(struct ncclTopoSystem* system) {
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
  free(system);
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -8,29 +8,125 @@
 #include "core.h"
 #include "graph.h"
 #include "topo.h"
+#include "xml.h"
+#include <math.h>

-static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
-  if (path->count == 0) return ncclSuccess;
-
-  *node = NULL;
-  if (width > 0) {
-    if (path->type > graph->type) return ncclSuccess;
-    graph->type = std::max(graph->type, path->type);
-    graph->nHops += path->count;
-  } else {
-    graph->type = typeSave;
-    graph->nHops -= path->count;
+// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
+// max speed.
+static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { // Returns the largest path width from gpu to any node of the given type (0.0 if none)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  float nvLinkWidth = VEGA_XGMI_WIDTH;
+#else
+  float nvLinkWidth = gpu->gpu.cudaCompCap > 60 ? VOLTA_NVLINK_WIDTH : PASCAL_NVLINK_WIDTH;
+#endif
+  float maxWidth = 0.0;
+  for (int i=0; i<system->nodes[type].count; i++) {
+    struct ncclTopoLinkList* path = gpu->paths[type]+i;
+    float width = path->width;
+    if (path->count == 0) continue; // empty path: ignored
+    if (path->type == PATH_NVL) width = std::min(nvLinkWidth, width); // NVL paths are capped at nvLinkWidth
+    maxWidth = std::max(maxWidth, width);
   }
+  return maxWidth;
+}
+// Compute system->maxWidth before the search starts.
+// A single GPU without any NET node gets LOC_WIDTH. Otherwise maxWidth is the
+// largest getMaxWidth() over all GPUs, measured towards NET nodes when there
+// are any and towards GPUs when there are none. Always returns ncclSuccess.
+ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
+  const int netCount = system->nodes[NET].count;
+  const int gpuCount = system->nodes[GPU].count;
+
+  if (netCount == 0 && gpuCount == 1) {
+    system->maxWidth = LOC_WIDTH;
+    return ncclSuccess;
+  }
+
+  const int peerType = netCount ? NET : GPU;
+  float best = 0.0;
+  for (int g=0; g<gpuCount; g++) {
+    best = std::max(best, getMaxWidth(system, system->nodes[GPU].nodes+g, peerType));
+  }
+  system->maxWidth = best;
+  return ncclSuccess;
+}

-  for (int i=0; i<path->count; i++) {
-    if (path->list[i]->width < width) {
-      // Can't follow this path, rewind and exit
-      for (int j=0; j<i; j++) path->list[j]->width += width;
+// Find the link of node2 that points back to node1.
+// On success stores it in *revLink and returns ncclSuccess; when node2 has no
+// link to node1, emits a warning and returns ncclInternalError.
+static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) {
+  for (int l=0; l<node2->nlinks; l++) {
+    struct ncclTopoLink* link = node2->links+l;
+    if (link->remNode == node1) {
+      *revLink = link;
       return ncclSuccess;
     }
-    path->list[i]->width -= width;
   }
-  *node = path->list[path->count-1]->remNode;
+  // Node ids are 64-bit: print them with %lx like the other topology messages.
+  WARN("Could not find rev link for %d/%lx -> %d/%lx\n", node1->type, node1->id, node2->type, node2->id);
+  return ncclInternalError;
+}
+
+// This is unfortunately needed since manipulating floats often results in rounding errors.
+// Subtracts b from a in place, rounding the result to 3 decimals. Arguments are
+// parenthesized so that expression arguments expand with the intended precedence;
+// note that `a` is still evaluated twice.
+#define SUB_ROUND(a, b) ((a) = roundf(((a)-(b))*1000)/1000)
+
+static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float speed, int* steps) { // Subtract `speed` from the width of the first maxSteps links of path; *steps receives the number of links actually updated
+  float pciSpeed = speed; // speed charged on LINK_PCI links; may be replaced by INTEL_P2P_OVERHEAD(speed) below
+  for (int step=0; step<path->count; step++) { // scan the whole path, not just maxSteps
+    struct ncclTopoNode* node = path->list[step]->remNode;
+    if (node->type == CPU) {
+      // Account for P2P inefficiency through Intel CPU RC
+      if (path->type == PATH_PHB && start->type == GPU &&
+          node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 &&
+          node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+        pciSpeed = INTEL_P2P_OVERHEAD(speed);
+      }
+    }
+  }
+
+  struct ncclTopoNode* node = start; // node the current link starts from
+  for (int step=0; step<maxSteps; step++) {
+    struct ncclTopoLink* link = path->list[step];
+    struct ncclTopoLink* revLink = NULL; // reverse link, looked up only when revSpeed is charged
+    float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed;
+    float revSpeed = 0;
+    if (link->remNode->type == GPU && start->type != GPU) { // link into a GPU on a path not starting at a GPU: also charge fwSpeed/8 on the reverse link
+      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+      revSpeed += fwSpeed/8;
+    }
+    if (link->remNode->type == CPU && link->type == LINK_NVL) { // NVL link into a CPU: also charge the full fwSpeed on the reverse link
+      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+      revSpeed += fwSpeed;
+    }
+    if (link->width < fwSpeed || (revSpeed && revLink->width < revSpeed)) { *steps = step; return ncclSuccess; } // not enough width: stop before touching this link and report how far we got
+    SUB_ROUND(link->width, fwSpeed); // a negative speed gives width back
+    if (revSpeed) SUB_ROUND(revLink->width, revSpeed);
+    node = link->remNode;
+  }
+  *steps = maxSteps; // all requested links were updated
+  return ncclSuccess;
+}
+
+// Try to go from node type1/index1 to node type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1).
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) { // *node receives the destination node, or NULL when the path cannot be followed
+  // First handle easy cases: no source node (type1 == -1) or an empty path return the destination directly
+  *node = system->nodes[type2].nodes+index2;
+  if (type1 == -1) return ncclSuccess;
+  struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1;
+  struct ncclTopoLinkList* path = node1->paths[type2]+index2;
+  if (path->count == 0 ) return ncclSuccess;
+
+  // Now check link type
+  *node = NULL;
+  int intra = type1 == GPU && type2 == GPU; // GPU-to-GPU paths use the graph's intra speed/type, everything else the inter ones
+  float speed = intra ? graph->speedIntra : graph->speedInter;
+  int type = intra ? graph->typeIntra : graph->typeInter;
+
+  if (mult == 1 && (path->type > type)) return ncclSuccess; // path type not allowed by the graph: leave *node NULL
+
+  speed *= mult; // negative speed when undoing
+
+  // Check there is enough bandwidth on paths.
+  int step = 0;
+  NCCLCHECK(followPath(path, node1, path->count, speed, &step));
+  if (step < path->count) goto rewind;
+
+  // Enough bandwidth : return destination node.
+  graph->nHops += mult*path->count;
+  *node = system->nodes[type2].nodes+index2;
+  return ncclSuccess;
+
+rewind:
+  // Not enough bandwidth : rewind and exit. Only the `step` links already updated are restored; *node stays NULL.
+  NCCLCHECK(followPath(path, node1, step, -speed, &step));
   return ncclSuccess;
  }

@@ -81,22 +177,42 @@ static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
  return 0;
 }

-static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
-  for (int n=0; n<system->nodes[NET].count; n++) {
-    if (system->nodes[NET].nodes[n].used & flag) {
-      *netPaths=system->nodes[NET].nodes[n].paths[GPU];
+static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* index) { // Find the index of the GPU node whose gpu.rank equals rank; ncclInternalError if none
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+      *index = g;
       return ncclSuccess;
     }
   }
+  WARN("Could not find gpu rank %d\n", rank); // *index is left untouched on failure
   return ncclInternalError;
  }

+// Find the index of the NET node whose id equals `id`.
+// On success stores the index in *index and returns ncclSuccess; otherwise
+// emits a warning, leaves *index untouched and returns ncclInternalError.
+static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
+  const int netCount = system->nodes[NET].count;
+  int n = 0;
+  while (n < netCount && system->nodes[NET].nodes[n].id != id) n++;
+  if (n >= netCount) {
+    WARN("Could not find net id %lx\n", id);
+    return ncclInternalError;
+  }
+  *index = n;
+  return ncclSuccess;
+}
+
+// Return, through *netPaths, the GPU paths of the NET node whose id is stored
+// at graph->inter[2*nChannels]. Fails if that id is not found among the NET nodes.
+static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) {
+  const int netId = graph->inter[graph->nChannels*2];
+  int netIndex;
+  NCCLCHECK(getNetIndex(system, netId, &netIndex));
+  struct ncclTopoNode* net = system->nodes[NET].nodes+netIndex;
+  *netPaths = net->paths[GPU];
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
  const uint64_t flag = 1ULL<<(graph->nChannels);
  int ngpus = system->nodes[GPU].count;
  struct ncclTopoLinkList* paths = gpu->paths[GPU];
  struct ncclTopoLinkList* netPaths = NULL;
-  if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths));
+  if (sortNet) NCCLCHECK(getNetPaths(system, graph, &netPaths));

  struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
  memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));
@@ -131,9 +247,13 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
  return ncclSuccess;
 }

-ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);

-#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should get contain all search within a second or so.
+// Try to keep all searches within one second
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19)
+#define NCCL_SEARCH_TIMEOUT (1<<18)
+#define NCCL_SEARCH_TIMEOUT_TREE (1<<17)
+#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10)

 #define FORCED_ORDER_PCI 1
 #define FORCED_ORDER_REPLAY 2
@@ -143,7 +263,7 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
  if (graph->nChannels == 0) return ncclInternalError;
  int ngpus = system->nodes[GPU].count;
  int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
-  for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].rank == nextRank) {
+  for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
    *g = i;
    return ncclSuccess;
  }
@@ -151,44 +271,37 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
  return ncclSuccess;
 }

-ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time);

-ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
-  int typeSave = graph->type;
+// Try GPU index g as the next node of the channel currently being built.
+// (type, index) identify the node we arrive from; callers in this file pass
+// NET/n, GPU/g, or -1/-1 when the search starts directly from a GPU.
+// ncclTopoFollowPath is called with multiplier 1 before recursing and -1
+// afterwards (presumably to reserve, then release, the path -- confirm in
+// ncclTopoFollowPath). When it hands back a NULL gpu, g is skipped.
+ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
+  // One bit per channel : bit graph->nChannels marks nodes used by the channel being built.
  const uint64_t flag = 1ULL<<(graph->nChannels);
-  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-  if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave));
+  struct ncclTopoNode* gpu;
+  NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
  if (gpu) {
+    // Toggle the channel bit on for the duration of the recursion, then back off.
    gpu->used ^= flag;
-    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
+    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time));
    gpu->used ^= flag;
-    if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave));
+    NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu));
  }
  return ncclSuccess;
 }

+// Decide whether graph should replace refGraph as the best solution so far.
+// Sets *copy to 1 when graph wins; *copy is left untouched otherwise, so the
+// caller has to initialize it. Always returns ncclSuccess.
 ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
-  // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels
-  // since it would likely impact the rings algorithms too.
-  if (graph->speedIntra > graph->speedInter && graph->nChannels < refGraph->nChannels) return ncclSuccess;
+  // 1. Never keep a solution with fewer than minChannels channels
+  //    (constraint to get the same nChannels between Rings and Trees)
+  if (graph->nChannels < graph->minChannels) return ncclSuccess;

-  // 1. Try to get better bandwidth
+  // 2. Prefer the higher total intra bandwidth (nChannels * speedIntra)
  if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess;
  if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) {
    *copy = 1;
    return ncclSuccess;
  }
-  // 2. Give an advantage when all channels are the same
-  if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) {
-    *copy = 1;
-    return ncclSuccess;
-  }
-  // 3. Less hops
-  if (graph->nHops < refGraph->nHops) *copy = 1;
+  // 3. At equal bandwidth, fewer hops win (but not at the price of going cross NICs)
+  if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
  return ncclSuccess;
 }

-ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
  if ((*time) <= 0) return ncclSuccess;
  (*time)--;

@@ -196,55 +309,43 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
  if (step == ngpus) {
    // Determine whether we found a better solution or not
    int copy = 0;
-    int sameChannels = graph->sameChannels;
-    if (graph->nChannels > 0) {
-      int* intra = graph->intra+graph->nChannels*ngpus;
-      for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0;
-    }
    graph->nChannels++;
    NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
    if (copy) {
      memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
-      if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1;
+      if (graph->nChannels == graph->maxChannels) *time = -1;
    }
-    if (graph->nChannels < MAXCHANNELS/2) {
-      NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time));
+    if (graph->nChannels < graph->maxChannels) {
+      NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time));
    }
    graph->nChannels--;
-    graph->sameChannels = sameChannels;
    return ncclSuccess;
  }
-  graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
+  graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
+  int g = gpu - system->nodes[GPU].nodes;
  if (step == backToNet) {
    // first get back to NIC
    if (system->nodes[NET].count) {
-      int maxWidth = 0;
-      struct ncclTopoLinkList* paths = gpu->paths[NET];
+      int startNetIndex;
+      NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
+      struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
      for (int n=0; n<system->nodes[NET].count; n++) {
-        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
-        maxWidth = std::max(paths[n].width, maxWidth);
-      }
-      for (int n=0; n<system->nodes[NET].count; n++) {
-        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
-        if (paths[n].width == maxWidth) {
-          struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-          int typeSave = graph->type;
-          NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
-          if (net) {
-            graph->inter[graph->nChannels*2+1] = net->id;
-            NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time));
-            NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave));
-          }
+        struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+        if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
+        NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
+        if (net) {
+          graph->inter[graph->nChannels*2+1] = net->id;
+          NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
+          NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
        }
      }
    }
  } else if (step < system->nodes[GPU].count-1) {
    // Go to next GPU
-    struct ncclTopoLinkList* paths = gpu->paths[GPU];
    int next[NCCL_TOPO_MAX_NODES];
    int count;
    if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
-      next[0] = step+1;
+      next[0] = (busIdToCudaDev(gpu->id)+1)%system->nodes[GPU].count;
      count = 1;
    } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
@@ -253,64 +354,64 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
      NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
    }
    for (int i=0; i<count; i++) {
-      int g = next[i];
-      int nvlink = graph->nvlink;
-      graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
-      int speed = graph->speedIntra;
-      if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
-      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
-      graph->nvlink = nvlink;
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, step+1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
    }
  } else if (step == backToFirstRank) {
    // Find first GPU and loop back to it
-    int g;
-    int rank = graph->intra[graph->nChannels*ngpus];
-    for (g=0; g<ngpus; g++) {
-      if (system->nodes[GPU].nodes[g].rank == rank) break;
-    }
-    if (g == ngpus) {
-      WARN("Could not find GPU with rank %d\n", rank);
-      return ncclInternalError;
-    }
-    struct ncclTopoLinkList* paths = gpu->paths[GPU];
-    struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
-    int typeSave = graph->type;
-    NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
+    int p;
+    NCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels*ngpus], &p));
+    struct ncclTopoNode* firstGpu;
+    NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
    if (firstGpu) {
-      NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
-      NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave));
+      NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, time));
+      NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
    }
  } else {
    // Next path
-    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time));
+    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
  }
  return ncclSuccess;
 }

-ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
-  const uint64_t flag = 1ULL<<(graph->nChannels);
+ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
  const int speed = graph->speedInter;
  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    struct ncclTopoNode* gpu;
-    if (net->used == 0) {
-      graph->inter[graph->nChannels*2] = net->id;
-      for (int i=0; i<system->nodes[NET].count; i++) {
-        if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
-      }
-      struct ncclTopoLinkList* paths = net->paths[GPU];
+    if (graph->collNet && net->net.collSupport == 0) continue;
+    if (net->net.width < speed) continue;
+    if (net->net.maxChannels == 0) continue;

-      // First try the PCI order to set a reference
-      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed));
-      // Then try to replay the last channel
-      if (graph->nChannels > 0) {
-        int g;
-        NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
-        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed));
+    graph->inter[graph->nChannels*2] = net->id;
+    for (int i=0; i<system->nodes[NET].count; i++) {
+      if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
+          (system->nodes[NET].nodes[i].net.port == net->net.port)) {
+        system->nodes[NET].nodes[i].net.width -= speed;
+      }
+    }
+    net->net.maxChannels--;
+
+    // First try to replay the last channel
+    if (graph->nChannels > 0) {
+      int g;
+      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
+    }
+    if (graph->nChannels == 0 || graph->sameChannels == 0) {
+      if (graph->nChannels == 0) {
+        // Always try the PCI order first to set a reference
+        struct ncclTopoLinkList* paths = net->paths[GPU];
+        // find the first GPU that is closest to NIC
+        int f = 0;
+        for (int i = 0; i<system->nodes[GPU].count; i++)
+          if (paths[i].count < paths[f].count) f = i;
+        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, NET, n, f));
      }

      // Then try the most local GPUs
-      int maxWidth = 0, minHops = 0xfffffff;
+      float maxWidth = 0;
+      int minHops = 0xfffffff;
+      struct ncclTopoLinkList* paths = net->paths[GPU];
      for (int g=0; g<system->nodes[GPU].count; g++) {
        if (paths[g].width > maxWidth) {
          maxWidth = paths[g].width;
@@ -329,14 +430,19 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
              gpu = system->nodes[GPU].nodes+g;
              int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
              if (tryGpuBidir == gpuUsed) {
-                NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed));
+                NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
              }
            }
          }
        }
      }
-      for (int i=0; i<system->nodes[NET].count; i++) {
-        if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+    }
+
+    net->net.maxChannels++;
+    for (int i=0; i<system->nodes[NET].count; i++) {
+      if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
+          (system->nodes[NET].nodes[i].net.port == net->net.port)) {
+        system->nodes[NET].nodes[i].net.width += speed;
      }
    }
  }
@@ -375,17 +481,152 @@ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, in
  return ncclSuccess;
 }

-ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
+// Search for one more channel on top of the graph->nChannels already placed.
+// With NET nodes present the search starts from the NICs (ncclTopoSearchRecNet).
+// Otherwise it starts from a GPU : PCI order for the first channel, a replay of
+// the previous channel for the following ones, then every GPU in turn unless
+// sameChannels forces all channels to be identical.
+// Better solutions are copied into saveGraph further down the recursion;
+// *time is the remaining search budget. Any error from the search is returned.
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) {
  int backToNet, backToFirstRank;
  NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
  if (system->nodes[NET].count) {
    // Start from NET
-    ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time);
+    // The result must be checked : dropping it would hide failures of the NET search.
+    NCCLCHECK(ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time));
  } else {
-    // Start from GPU 0
-    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
-    if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
-    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
+    // Intra-node only.
+    if (graph->nChannels == 0) {
+      // Try PCI order first
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
+    } else {
+      // Also try to replay previous channel
+      int g;
+      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g));
+    }
+    if (graph->sameChannels == 0 || graph->nChannels == 0) {
+      // Finally, try all other possibilities unless we are forced to use the same channels
+      for (int g=0; g<system->nodes[GPU].count; g++) {
+        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+/************************************/
+/* User defined graph from XML file */
+/************************************/
+
+// Names used for the "typeintra"/"typeinter" attributes of a graph XML node and
+// the PATH_* value each one maps to. The { NULL, 0 } entry terminates the table.
+struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
+// Fill channel c of graph from a <channel> XML node.
+// The "dev" attribute is read from every child before its name is looked at.
+// Each "net" child fills the next of the 2 inter[] slots of the channel; each
+// "gpu" child is translated from its dev to a rank and appended to the ngpus
+// intra[] slots of the channel. Children with another name are ignored.
+// Returns ncclSystemError when a gpu dev is unknown, or when the channel lists
+// more net/gpu entries than the channel has slots for (which would otherwise
+// overwrite the neighbouring channel or run past the arrays).
+ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  int ngpus = system->nodes[GPU].count;
+  int* inter = graph->inter+2*c;
+  int* intra = graph->intra+ngpus*c;
+  int n=0, g=0;
+  for (int s=0; s<xmlChannel->nSubs; s++) {
+    struct ncclXmlNode* sub = xmlChannel->subs[s];
+    int dev;
+    NCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
+    if (strcmp(sub->name, "net") == 0) {
+      // The file is user provided : do not trust it to hold at most 2 NICs per channel.
+      if (n >= 2) {
+        WARN("XML Import Channel : too many net entries in channel %d.", c);
+        return ncclSystemError;
+      }
+      inter[n++] = dev;
+    } else if (strcmp(sub->name, "gpu") == 0) {
+      if (g >= ngpus) {
+        WARN("XML Import Channel : too many gpu entries in channel %d (max %d).", c, ngpus);
+        return ncclSystemError;
+      }
+      // Translate the device number into a rank ; the last matching GPU wins.
+      int rank = -1;
+      for (int i=0; i<ngpus; i++) {
+        if (system->nodes[GPU].nodes[i].gpu.dev == dev) rank = system->nodes[GPU].nodes[i].gpu.rank;
+      }
+      if (rank == -1) {
+        WARN("XML Import Channel : dev %d not found.", dev);
+        return ncclSystemError;
+      }
+      intra[g++] = rank;
+    }
+  }
+  return ncclSuccess;
+}
+// Load graph from one <graph> XML node, provided that node describes it.
+// The node is silently skipped (ncclSuccess, graph untouched) when its "id"
+// differs from graph->id, or when it is a cross-NIC graph while graph->crossNic
+// is 0. Otherwise all graph attributes are overwritten from the XML and each
+// child node is imported as the channel of the same index.
+ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  // Not the graph we are looking for ?
+  int xmlId;
+  NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &xmlId));
+  if (xmlId != graph->id) return ncclSuccess;
+
+  // A cross-NIC graph can only be imported when cross-NIC is permitted.
+  int xmlCrossNic;
+  NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &xmlCrossNic));
+  if (xmlCrossNic == 1 && graph->crossNic == 0) return ncclSuccess;
+  graph->crossNic = xmlCrossNic;
+
+  // Numeric attributes
+  NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
+  NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
+  NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra));
+  NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter));
+
+  // Path types are stored by name and converted through kvDictLinkType
+  const char* typeIntraName;
+  NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &typeIntraName));
+  NCCLCHECK(kvConvertToInt(typeIntraName, &graph->typeIntra, kvDictLinkType));
+  const char* typeInterName;
+  NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &typeInterName));
+  NCCLCHECK(kvConvertToInt(typeInterName, &graph->typeInter, kvDictLinkType));
+
+  NCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels));
+
+  // Child number c becomes channel number c
+  int c = 0;
+  while (c < xmlGraph->nSubs) {
+    NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[c], c, system, graph));
+    c++;
+  }
+  return ncclSuccess;
+}
+// Offer every child of the <graphs> node to ncclTopoGetGraphFromXmlSub, in
+// order, and stop at the first error.
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  int child = 0;
+  while (child < xmlGraphs->nSubs) {
+    NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[child], system, graph));
+    child++;
+  }
+  return ncclSuccess;
+}
+
+/* And the reverse : graph->xml */
+// Append a <channel> node describing channel c of graph under parent.
+// Children are written in this order : a "net" node for inter[0] (only when the
+// system has NET nodes), one "gpu" node per GPU of the channel with the rank
+// translated back to a device number, then a "net" node for inter[1].
+// Returns ncclInternalError when a rank of the channel matches no GPU.
+ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
+  const int ngpus = system->nodes[GPU].count;
+  const int hasNet = system->nodes[NET].count;
+  struct ncclTopoNode* gpus = system->nodes[GPU].nodes;
+  int* channelInter = graph->inter+2*c;
+  int* channelIntra = graph->intra+ngpus*c;
+
+  struct ncclXmlNode* xmlChannel;
+  NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
+
+  struct ncclXmlNode* xmlEntry;
+  // NIC the channel starts from
+  if (hasNet) {
+    NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &xmlEntry));
+    NCCLCHECK(xmlSetAttrInt(xmlEntry, "dev", channelInter[0]));
+  }
+
+  // GPUs, in channel order
+  for (int step=0; step<ngpus; step++) {
+    NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &xmlEntry));
+    const int rank = channelIntra[step];
+    // rank -> dev ; the last matching GPU wins
+    int dev = -1;
+    for (int i=0; i<ngpus; i++) {
+      if (gpus[i].gpu.rank == rank) dev = gpus[i].gpu.dev;
+    }
+    if (dev == -1) {
+      WARN("XML Export Channel : rank %d not found.", rank);
+      return ncclInternalError;
+    }
+    NCCLCHECK(xmlSetAttrInt(xmlEntry, "dev", dev));
+  }
+
+  // NIC the channel ends on
+  if (hasNet) {
+    NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &xmlEntry));
+    NCCLCHECK(xmlSetAttrInt(xmlEntry, "dev", channelInter[1]));
+  }
+  return ncclSuccess;
+}
+// Append a <graph> node describing graph under parent : its attributes first
+// (path types written by name through kvDictLinkType), then one <channel>
+// child per channel.
+ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
+  struct ncclXmlNode* node;
+  NCCLCHECK(xmlAddNode(xml, parent, "graph", &node));
+
+  // Numeric attributes
+  NCCLCHECK(xmlSetAttrInt(node, "id", graph->id));
+  NCCLCHECK(xmlSetAttrInt(node, "pattern", graph->pattern));
+  NCCLCHECK(xmlSetAttrInt(node, "crossnic", graph->crossNic));
+  NCCLCHECK(xmlSetAttrInt(node, "nchannels", graph->nChannels));
+  NCCLCHECK(xmlSetAttrFloat(node, "speedintra", graph->speedIntra));
+  NCCLCHECK(xmlSetAttrFloat(node, "speedinter", graph->speedInter));
+
+  // Path types, by name
+  const char* typeIntraName;
+  NCCLCHECK(kvConvertToStr(graph->typeIntra, &typeIntraName, kvDictLinkType));
+  NCCLCHECK(xmlSetAttr(node, "typeintra", typeIntraName));
+  const char* typeInterName;
+  NCCLCHECK(kvConvertToStr(graph->typeInter, &typeInterName, kvDictLinkType));
+  NCCLCHECK(xmlSetAttr(node, "typeinter", typeInterName));
+
+  NCCLCHECK(xmlSetAttrInt(node, "samechannels", graph->sameChannels));
+
+  // Channels
+  int c = 0;
+  while (c < graph->nChannels) {
+    NCCLCHECK(ncclTopoGetXmlFromChannel(graph, c, system, xml, node));
+    c++;
+  }
+  return ncclSuccess;
+}
+// Build the XML description of ngraphs graphs : a top-level "graphs" node
+// carrying a "version" attribute (NCCL_GRAPH_XML_VERSION), with one child
+// written by ncclTopoGetXmlFromGraph per entry of graphs[].
+ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml) {
+  // NOTE(review): presumably resets xml to an empty tree -- confirm against xmlAddNode.
+  xml->maxIndex = 0;
+  struct ncclXmlNode* xmlGraphs;
+  NCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs));
+  NCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", NCCL_GRAPH_XML_VERSION));
+  for (int g=0; g<ngraphs; g++) {
+    NCCLCHECK(ncclTopoGetXmlFromGraph(graphs[g], system, xml, xmlGraphs));
  }
  return ncclSuccess;
 }
@@ -456,11 +697,16 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) {
  for (i=0; i<ngpus; i++) {
    struct ncclTopoNode* node = system->nodes[GPU].nodes+i;
    if (node->paths[GPU] == NULL) continue;
-    int sum = ngpus*(ngpus-1)/2 - node->rank;
+    int sum = ngpus*(ngpus-1)/2 - node->gpu.rank;
    int count = 0;
    for (int n = 0; n<ngpus; n++) {
-      if (node->paths[GPU][n].type != LINK_NVL) continue;
-      sum -= system->nodes[GPU].nodes[n].rank;
+      struct ncclTopoLink* link;
+      for (link = node->links; link->remNode; link++) {
+        if (link->remNode->gpu.rank == n) break;
+      }
+      if (!link->remNode) continue;
+      if (link->type != LINK_NVL) continue;
+      sum -= system->nodes[GPU].nodes[n].gpu.rank;
      count ++;
    }
    if(count != ngpus-2 || sum < 0 || sum > ngpus-1) {
@@ -492,28 +738,39 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) {
  return;
 }

+// Candidate per-channel speeds tried by ncclTopoCompute, sorted from the
+// fastest to the slowest ; the search walks this table by index.
+float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+// Number of entries in speedArray
+#define NSPEEDS (sizeof(speedArray)/sizeof(float))
+
 ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
  graph->speedIntra = graph->speedInter = 0;
  if (graph->crossNic == 2) graph->crossNic = 0;
-  graph->nvlink = 0;
-  graph->type = LINK_LOC;
+  graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
+  graph->typeInter = PATH_PIX;
  graph->nChannels = 0;
  graph->sameChannels = 1;

-  char* str = getenv("NCCL_GRAPH");
+  char* str = getenv("NCCL_GRAPH_FILE");
+  if (str) {
+    struct ncclXml* xml;
+    NCCLCHECK(ncclCalloc(&xml, 1));
+    NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
+    NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph));
+    free(xml);
+    if (graph->nChannels > 0) return ncclSuccess;
+  }
+
  if (!str) parseChordalRing(system, &str);
  if (str) {
    NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
    for (int i=0; i<graph->nChannels*ngpus; i++) {
      // Translate gpu numbers into ranks
-      graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
+      graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].gpu.rank;
    }
    // TODO : let user specify NICs
    graph->inter[0] = graph->inter[1] = 0;
    graph->speedIntra = graph->speedInter = system->maxWidth;
-    graph->nvlink = 0;
    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
      // Reverse the loop
      for (int c=0; c<graph->nChannels; c++) {
@@ -531,22 +788,24 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph

  struct ncclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
-  int bestSpeed = 0;

  // First try crossnic, then decrease speed and finally increase speedIntra.
-  tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
-  int maxSpeed = system->maxSpeed;
  tmpGraph.pattern = graph->pattern;
+  int pass = 1;
+  int speedIndex = 0;
+  while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
+  tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
+  int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;

 search:
-  int time = NCCL_SEARCH_TIMEOUT;
-  int stepSpeed = system->maxWidth/4;
-  tmpGraph.nvlink = 1;
+  int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS :
+    tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT;
  tmpGraph.nChannels = 0;
-  tmpGraph.sameChannels = 1;
-  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));
+  globalTimeout -= time;
+
+  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
 #if 0
-  printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+  printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
@@ -555,13 +814,34 @@ search:
    printf("\n");
  }
 #endif
-  if (time == -1) goto done;
-  // We already have a solution and we timed out so lower speed will just timeout as well
-  if (time == 0 && graph->nChannels > 0) goto done;
-  if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;
+  // Optimal solution, stop here
+  if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;

-  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
-    // First pass, we don't have a solution yet ; try to go slower.
+  if (pass == 1) {
+    // First pass, we don't have a solution yet ; try other options
+
+    // Try having different channels
+    if (tmpGraph.sameChannels == 1) {
+      tmpGraph.sameChannels = 0;
+      goto search;
+    }
+    tmpGraph.sameChannels = 1;
+
+    if (time != -1) globalTimeout += time;
+    else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
+    if (globalTimeout < 0) goto done;
+
+    int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
+    if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
+      tmpGraph.typeIntra += 1;
+      goto search;
+    }
+    tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
+    if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) {
+      tmpGraph.typeInter += 1;
+      goto search;
+    }
+    tmpGraph.typeInter = PATH_PIX;

    // Try a simpler tree
    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
@@ -574,50 +854,61 @@ search:
    }
    tmpGraph.pattern = graph->pattern;

-    if (tmpGraph.type < LINK_QPI) {
-      tmpGraph.type += 1;
-      goto search;
-    }
-    tmpGraph.type = graph->type;
-
    if (crossNic && tmpGraph.crossNic == 0) {
      // Try again with crossNic if permitted
      tmpGraph.crossNic = crossNic;
      goto search;
    }
-    tmpGraph.crossNic = graph->crossNic;
+    tmpGraph.crossNic = 0;
+
+    // Decrease speed until we find a solution
+    if ((speedIndex < NSPEEDS-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
+      tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
+      goto search;
+    }
+    speedIndex = 0;
+    while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
+    tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];

-    // Try to reduce speed per channel
-    tmpGraph.speedIntra = tmpGraph.speedInter -= stepSpeed;
-    if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= stepSpeed) goto search;
  }

 done:
-  // We have a solution now. See if we can increase speedIntra
-  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+  // We have a solution. Start from that solution and move to pass 2.
+  if (pass == 1) {
+    time = -1;
+    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
+    speedIndex = 0;
+    while (speedArray[speedIndex] > graph->speedInter && speedIndex < NSPEEDS-1) speedIndex++;
+    tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
+    tmpGraph.minChannels = graph->nChannels;
+    pass = 2;
+  }
+
+  // 3. See if we can increase speedIntra for trees (2 nodes or collnet)
+  if (pass == 2) {
+    if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
+        tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 &&
+        speedIndex > 0) {
+      tmpGraph.speedIntra = speedArray[--speedIndex];
+      goto search;
+    }
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
  }
-  if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
-    // Try to increase the intra speed only but keeping nChannels the same
-    tmpGraph.speedIntra += stepSpeed;
-    maxSpeed = tmpGraph.speedIntra * graph->nChannels;
-    if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
-  }

-  if (graph->nChannels == 0) {
+  if (graph->nChannels == 0 && graph->collNet == 0) {
    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
-    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
+    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
    graph->inter[0] = graph->inter[1] = 0;
-    graph->speedIntra = graph->speedInter = stepSpeed;
-    graph->nvlink = 0;
+    graph->speedIntra = graph->speedInter = 0.1;
+    graph->typeIntra = graph->typeInter = PATH_SYS;
    graph->nChannels = 1;
  }
  return ncclSuccess;
 }

 ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
-  INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
+  INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels);
  int ngpus = system->nodes[GPU].count;

  char line[1024];
@@ -641,6 +932,18 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
  return ncclSuccess;
 }

+// When the NCCL_GRAPH_DUMP_FILE environment variable is set, convert the
+// ngraphs graphs to XML and write them to the file it names. Does nothing and
+// returns ncclSuccess when the variable is not set. Errors from the allocation,
+// the conversion or the file dump are returned to the caller.
+ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
+  char* str = getenv("NCCL_GRAPH_DUMP_FILE");
+  if (str) {
+    struct ncclXml* xml;
+    NCCLCHECK(ncclCalloc(&xml, 1));
+    // No NCCLCHECK while xml is allocated : its early return would leak the buffer.
+    ncclResult_t ret = ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml);
+    if (ret == ncclSuccess) ret = ncclTopoDumpXmlToFile(str, xml);
+    free(xml);
+    NCCLCHECK(ret);
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
  *dev = graph->inter[(channelId%graph->nChannels)*2+dir];
  return ncclSuccess;
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -10,27 +10,28 @@

 #include "graph.h"
 #include "core.h"
+#include <sched.h>

-#define LOC_WIDTH 5000
-#define PASCAL_NVLINK_WIDTH 18
-#define VOLTA_NVLINK_WIDTH 21
-#define PCI_WIDTH 12           // PCI Gen3 x16
-#define QPI_WIDTH 8
-#define SKL_QPI_WIDTH 12
-#define SKL_PCI_WIDTH 12
-#define SKL_CPUPCI_WIDTH 12
-#define P9_WIDTH 32
-#define NET_WIDTH 12           // 100Gbit
-#define ROME_QPI_WIDTH 18
-#define ROME_PCI_WIDTH 18
-#define ROME_CPUPCI_WIDTH 18
+#define LOC_WIDTH 5000.0
+#define PASCAL_NVLINK_WIDTH 18.0
+#define VOLTA_NVLINK_WIDTH 21.0
+#define PCI_WIDTH 12.0           // PCI Gen3 x16
+#define QPI_WIDTH 6.0
+#define SKL_QPI_WIDTH 9.0
+#define P9_WIDTH 32.0
+#define ARM_WIDTH 6.0
+#define NET_WIDTH 12.0           // 100Gbit
+#define VEGA_XGMI_WIDTH 20.0
+#define ROME_QPI_WIDTH 18.0
+#define ROME_PCI_WIDTH 18.0
+#define ROME_CPUPCI_WIDTH 18.0

-// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, to GPU
-// to GPU traffic consumed more PCI bandwidth.
+// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
+// to GPU traffic consumes more PCI bandwidth.
 #define INTEL_P2P(speed) (speed*9/12)
 #define INTEL_P2P_OVERHEAD(speed) (speed*12/9)

-#define NCCL_TOPO_NODE_TYPES 6
+#define NCCL_TOPO_NODE_TYPES 7
 #define GPU 0
 #define PCI 1
 #define NVS 2
@@ -39,37 +40,72 @@
 #define NET 5
 extern const char* topoNodeTypeStr[];

+// We want link types and path types to match as much as possible
 #define LINK_LOC 0
 #define LINK_NVL 1
 #define LINK_PCI 2
-#define LINK_QPI 3
-#define LINK_NET 4
+// Skipping 3 for PATH_PXB
+// Skipping 4 for PATH_PHB
+#define LINK_SYS 5
+#define LINK_NET 6
 extern const char* topoLinkTypeStr[];

+#define PATH_LOC 0
+#define PATH_NVL 1
+#define PATH_PIX 2
+#define PATH_PXB 3
+#define PATH_PHB 4
+#define PATH_SYS 5
+#define PATH_NET 6
+extern const char* topoPathTypeStr[];
+
 struct ncclTopoNode;
 struct ncclTopoLink {
  int type;
-  int width;
+  float width;
  struct ncclTopoNode* remNode;
 };
 #define NCCL_TOPO_MAX_LINKS 32
 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
-#define SELECT_PATH 1
-#define SELECT_LAST 2
-
-#define NET_GDR_MASK 0x70000000

 struct ncclTopoLinkList {
  struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
  int count;
-  int width;
+  float width;
  int type;
 };

+#define NCCL_TOPO_CPU_INTEL_BDW 1
+#define NCCL_TOPO_CPU_INTEL_SKL 2
+
+#define NCCL_TOPO_UNDEF (-1)
+
 struct ncclTopoNode {
  int type;
  int64_t id;
-  int rank;
+  // Type specific data
+  union {
+    struct {
+      int dev; // NVML dev number
+      int rank;
+      int cudaCompCap;
+      int gdrSupport;
+    }gpu;
+    struct {
+      uint64_t asic;
+      int port;
+      float width;
+      int gdrSupport;
+      int collSupport;
+      int maxChannels;
+    }net;
+    struct {
+      int arch;
+      int vendor;
+      int model;
+      cpu_set_t affinity;
+    }cpu;
+  };
  int nlinks;
  struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
  // Pre-computed paths to GPUs and NICs
@@ -85,60 +121,29 @@ struct ncclTopoNodeSet {

 struct ncclTopoSystem {
  struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
-  int maxSpeed;
-  int maxWidth;
-  int searchInitDone;
+  float maxWidth;
 };

-static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
+ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
+ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
+ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
+
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
+
+static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
+  *index = -1;
  for (int i=0; i<system->nodes[type].count; i++) {
    if (system->nodes[type].nodes[i].id == id) {
-      *node = system->nodes[type].nodes+i;
+      *index = i;
      return ncclSuccess;
    }
  }
-  if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
-    WARN("Error : tried to create too many nodes of type %d\n", type);
-    return ncclInternalError;
-  }
-  struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
-  system->nodes[type].count++;
-  n->type = type;
-  n->id = id;
-  if (type == GPU) {
-    // Create link to itself (used in some corner cases)
-    n->nlinks=1;
-    n->links[0].type = LINK_LOC;
-    n->links[0].remNode = n;
-    n->links[0].width = LOC_WIDTH;
-  }
-  *node = n;
-  return ncclSuccess;
+  return ncclInternalError;
 }

-static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
-  // Aggregate links into higher width for NVLink
-  struct ncclTopoLink* link;
-  for (link = node->links; link->remNode; link++) {
-    if (link->remNode == remNode && link->type == type) break;
-  }
-  if (link->remNode == NULL) node->nlinks++;
-  link->type = type;
-  link->remNode = remNode;
-  link->width += width;
-
-  // Sort links in BW descending order
-  struct ncclTopoLink linkSave;
-  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
-  while (link != node->links) {
-    if ((link-1)->width >= linkSave.width) break;
-    memcpy(link, link-1, sizeof(struct ncclTopoLink));
-    link--;
-  }
-  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
-
 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -53,12 +53,12 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
 }

 static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
-static const char* ncclAlgoStr[] = { "Tree", "Ring" };
+static const char* ncclAlgoStr[] = { "Tree", "Ring", "CollNet" };
 static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };

 // Latencies in us, Bandwidths in GB/s
 // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
-static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 37.9, 37.9, 40.4 }, { 20.5, 20.5, 27.9 } };
+static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 37.9, 37.9, 40.4 }, { 20.5, 20.5, 27.9 }, { 37.9, 37.9, 40.4 } };

 // NVLink, PCI, Network
 #define NCCL_HW_NVLINK 0
@@ -67,29 +67,32 @@ static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 37.9
 // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
 static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
 { /* NVLINK */
-  { /* Tree (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 }, /* Ring (LL/LL128/Simple)*/ { 2.3, 2.3, 2.7 } },
+  { /* Tree (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 }, /* Ring (LL/LL128/Simple)*/ { 2.3, 2.3, 2.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 } },
  /* PCI */
-  { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 } },
+  { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
  /* NET */
-  { /* Tree (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 4.5 } }
+  { /* Tree (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 } }
 };

 // LL128 max BW for the different collectives
 static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };

-ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
-  int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
-  comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
-  comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
-  comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
-
-  INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);
+ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
+  int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
+  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
+    getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);

  if (comm->nRanks <= 1) return ncclSuccess;

-  struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
-  int intraHw[2], hw[2];
-  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
+  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
+  int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;

  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
@@ -98,11 +101,11 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
      comm->nRanks;

    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;
+      if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
-        float busBw = graphs[a]->nChannels * speed * 0.6;
+        float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
+        float busBw = graphs[a]->nChannels * speed;

        // Various model refinements
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL)    busBw *= 1.0/5.0;
@@ -110,9 +113,12 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
+        if (a == NCCL_ALGO_COLLNET) busBw *= .9;
+        if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
+        if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0;  // CollNet does not support LL128

        // Convert bus BW to algorithm BW
-        float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
+        float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
        comm->bandwidths[coll][a][p] = busBw * ratio;

        comm->latencies[coll][a][p] = baseLat[a][p];
@@ -128,11 +134,16 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
          } else {
            comm->latencies[coll][a][p] += nsteps*lat;
          }
-        } else {
+        } else if (a == NCCL_ALGO_TREE) {
          float intraLat = hwLat[intraHw[a]][a][p];
          float interLat = hwLat[NCCL_HW_NET][a][p];
          comm->latencies[coll][a][p] +=
            2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+        } else {
+          float intraLat = hwLat[intraHw[a]][a][p];
+          float interLat = hwLat[NCCL_HW_NET][a][p];
+          comm->latencies[coll][a][p] +=
+            2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
        }
      }
    }
@@ -141,7 +152,7 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
  // Protocols/Algorithms enable/disable, and user overrides.
  // All are enabled except ll128 which is enabled by default only in certain cases.
  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
-  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };
+  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 };

  const char *protoStr = getenv("NCCL_PROTO");
  if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
@@ -152,30 +163,32 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
    int pEnable = protoEnable[p];
    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
      // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
-      pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
+      pEnable = (graphs[a]->typeInter <= LINK_PCI) && graphs[a]->typeIntra == LINK_NVL && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
    }
    if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
  }

  if (comm->rank == 0) {
    char line[1024];
-    int offset = 0;
    sprintf(line, "Latency/AlgBw |");
-    offset = strlen(line);
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
-        offset = strlen(line);
+        sprintf(line+strlen(line), " %7s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
+      }
+    }
+    INFO(NCCL_TUNING, "%s", line);
+    sprintf(line, " Max NThreads |");
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
      }
    }
    INFO(NCCL_TUNING, "%s", line);
    for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
      sprintf(line, "%13s |", ncclFuncStr[c]);
-      offset = strlen(line);
      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-          sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
-          offset = strlen(line);
+          sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
        }
      }
      INFO(NCCL_TUNING, "%s", line);
@@ -202,12 +215,41 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
    }
  }

-  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
+  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld/%ld/%ld",
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
-      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE],
+      comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL],
+      comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128],
+      comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE]);
+  return ncclSuccess;
+}
+
+// Trees do not perfectly stick to the model for medium sizes. Applying a static correction
+// factor is not ideal but works quite well. One entry per power of two, 64 B to 128 MB (22 entries).
+// Bandwidth multipliers for the tree algorithm, indexed as [protocol][logSize].
+// ncclTopoGetAlgoTime computes logSize as log2i(nBytes>>6) and only applies a
+// factor when logSize < 22. Rows presumably follow the protocol numbering
+// (LL, LL128, Simple) -- confirm against the NCCL_PROTO_* definitions.
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
+  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .84,  .49,  .42,  .60,  .75,  .87,  .94,  .94,  .99,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
+  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .84,  .49,  .42,  .60,  .75,  .87,  .94,  .94,  .99,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
+  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .41,  .27,  .25,  .39,  .46,  .72,  .76,  .87,  .92,  .97,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 }
+};
+
+// Same layout as the tree table, applied by ncclTopoGetAlgoTime to the ring algorithm.
+static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
+  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .25,  .41,  .55,  .56,  .78,  .94,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
+  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .25,  .41,  .55,  .56,  .78,  .94,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 },
+  {  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  .04,  .08,  .09,  .09,  .11,  .13,  .25,  .40,  .59,  .76,  .86,  1.0 ,  1.0 ,  1.0 ,  1.0 ,  1.0 }
+};
+
+// Estimates the time of the operation described by info for the given
+// algorithm/protocol pair and stores it in *time.
+// A bandwidth of 0 in the comm tables yields *time = -1.0.
+// Otherwise the estimate is latency + nBytes / (1000 * bandwidth), where the
+// bandwidth of tree and ring is first scaled by the size-dependent correction
+// tables (only for size buckets below 22).
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) {
+  float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
+  if (bw == 0) {
+    *time = -1.0;
+    return ncclSuccess;
+  }
+  const int logSize = log2i(info->nBytes>>6);
+  if (logSize < 22) {
+    if (algorithm == NCCL_ALGO_TREE) {
+      bw *= treeCorrectionFactor[protocol][logSize];
+    } else if (algorithm == NCCL_ALGO_RING) {
+      bw *= ringCorrectionFactor[protocol][logSize];
+    }
+  }
+  *time = info->comm->latencies[info->coll][algorithm][protocol] + (info->nBytes) / (1000 * bw);
  return ncclSuccess;
 }
@@ -0,0 +1,819 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include "core.h"
+#include "nvmlwrap.h"
+#include "xml.h"
+
+/*******************/
+/* XML File Parser */
+/*******************/
+
+// Reads exactly one character from file into *c.
+// When nothing can be read (EOF or read error), warns and returns ncclInternalError.
+ncclResult_t xmlGetChar(FILE* file, char* c) {
+  const size_t nRead = fread(c, 1, 1, file);
+  if (nRead != 0) return ncclSuccess;
+  WARN("XML Parse : Unexpected EOF");
+  return ncclInternalError;
+}
+
+// Reads an attribute value from file; the caller has already consumed the '='.
+//   value : destination buffer; at most MAX_STR_LEN bytes are written, including
+//           the terminating NUL (assumes the buffer is at least MAX_STR_LEN bytes,
+//           like the name buffers handled by xmlGetToken -- confirm in xml.h).
+//   last  : receives the first character following the value.
+// The value must be enclosed in single or double quotes and ends at the matching
+// quote character. When INT_OK is set, an unquoted run of digits is accepted too.
+// Returns ncclInternalError on EOF, on a missing quote, or if the value is too long.
+ncclResult_t xmlGetValue(FILE* file, char* value, char* last) {
+  char c;
+  NCCLCHECK(xmlGetChar(file, &c));
+  if (c != '"' && c != '\'') {
+#if INT_OK
+    int o = 0;
+    do {
+      if (o == MAX_STR_LEN-1) {
+        value[o] = '\0';
+        WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN);
+        return ncclInternalError;
+      }
+      value[o++] = c;
+      NCCLCHECK(xmlGetChar(file, &c));
+    } while (c >= '0' && c <= '9');
+    value[o] = '\0';
+    *last = c;
+    return ncclSuccess;
+#else
+    WARN("XML Parse : Expected (double) quote.");
+    return ncclInternalError;
+#endif
+  }
+  // Remember which quote opened the value so that only the same one closes it.
+  const char quote = c;
+  int o = 0;
+  while (1) {
+    NCCLCHECK(xmlGetChar(file, &c));
+    if (c == quote) break;
+    if (o == MAX_STR_LEN-1) {
+      // Refuse to write past the end of the destination buffer.
+      value[o] = '\0';
+      WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN);
+      return ncclInternalError;
+    }
+    value[o++] = c;
+  }
+  value[o] = '\0';
+  NCCLCHECK(xmlGetChar(file, last));
+  return ncclSuccess;
+}
+
+// Reads one token (an element name or an attribute key) from file into name.
+// The token ends at ' ', '>', '/', '\n' or '\r'; that delimiter is stored in *last.
+// If an '=' is met, the token is an attribute key: its value is then read into
+// value through xmlGetValue, which also sets *last. Passing value == NULL means
+// that no attribute is expected at this point, and an '=' is reported as an error.
+// Tokens that do not fit in MAX_STR_LEN bytes are rejected.
+ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
+  int len = 0;
+  for (;;) {
+    char ch;
+    NCCLCHECK(xmlGetChar(file, &ch));
+    if (ch == '=') {
+      name[len] = '\0';
+      if (value != NULL) return xmlGetValue(file, value, last);
+      WARN("XML Parse : Unexpected value with name %s\n", name);
+      return ncclInternalError;
+    }
+    if (len == MAX_STR_LEN-1) {
+      name[len] = '\0';
+      WARN("Error : name %s too long (max %d)", name, MAX_STR_LEN);
+      return ncclInternalError;
+    }
+    const bool isDelimiter = (ch == ' ' || ch == '>' || ch == '/' || ch == '\n' || ch == '\r');
+    if (isDelimiter) {
+      name[len] = '\0';
+      *last = ch;
+      return ncclSuccess;
+    }
+    name[len++] = ch;
+  }
+}
+
+// Shift the 3-chars string by one char and append c at the end
+#define SHIFT_APPEND(s, c) do { (s)[0]=(s)[1]; (s)[1]=(s)[2]; (s)[2]=(c); } while(0)
+// Consumes the remainder of an XML comment, up to and including "-->".
+//   start : characters already read after the leading "!--" of the comment.
+//   next  : the delimiter character that ended the previous token read.
+// Returns ncclInternalError if the file ends before "-->" is found.
+ncclResult_t xmlSkipComment(FILE* file, char* start, char next) {
+  // Sliding window over the last three characters seen.
+  // Start from something neutral with \0 at the end.
+  char end[4] = "...";
+
+  // Inject all trailing chars from previous reads. We don't need
+  // to check for --> here because there cannot be a > in the name.
+  const size_t startLen = strlen(start);
+  for (size_t i=0; i<startLen; i++) SHIFT_APPEND(end, start[i]);
+  SHIFT_APPEND(end, next);
+
+  // Stop when we find "-->"
+  while (strcmp(end, "-->") != 0) {
+    // This must be a char: fread() stores a single byte, so an int would keep
+    // its other bytes uninitialized and give an endian-dependent value.
+    char c;
+    if (fread(&c, 1, 1, file) != 1) {
+      WARN("XML Parse error : unterminated comment");
+      return ncclInternalError;
+    }
+    SHIFT_APPEND(end, c);
+  }
+  return ncclSuccess;
+}
+
+// Parses the next XML tag from file into node.
+// On return node->type is one of:
+//   NODE_TYPE_NONE   : end of file reached before any tag (still returns ncclSuccess)
+//   NODE_TYPE_CLOSE  : a closing tag </name>
+//   NODE_TYPE_OPEN   : an opening tag <name attr="value" ...>
+//   NODE_TYPE_SINGLE : an opening tag terminated by "/>"
+// Comments (<!-- ... -->) are skipped. Malformed input returns ncclInternalError.
+ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
+  node->type = NODE_TYPE_NONE;
+  char c = ' ';
+  // Skip spaces and line breaks before the tag; EOF here is not an error.
+  while (c == ' ' || c == '\n' || c == '\r') {
+    if (fread(&c, 1, 1, file) == 0) return ncclSuccess;
+  }
+  if (c != '<') {
+    WARN("XML Parse error : expecting '<', got '%c'", c);
+    return ncclInternalError;
+  }
+  // Read XML element name
+  NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+
+  // Check for comments
+  if (strncmp(node->name, "!--", 3) == 0) {
+    NCCLCHECK(xmlSkipComment(file, node->name+3, c));
+    // The comment is consumed; parse whatever follows it into the same node.
+    return xmlGetNode(file, node);
+  }
+
+  // Check for closing tag
+  if (node->name[0] == '\0' && c == '/') {
+    node->type = NODE_TYPE_CLOSE;
+    // Re-read the name, we got '/' in the first call
+    NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+    if (c != '>') {
+      WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
+      return ncclInternalError;
+    }
+    return ncclSuccess;
+  }
+
+  node->type = NODE_TYPE_OPEN;
+
+  // Get Attributes
+  // An attribute is only parsed when the previous token ended with a space.
+  int a = 0;
+  while (c == ' ') {
+    // Once a reaches MAX_ATTR_COUNT it is no longer incremented, so every extra
+    // attribute is read into index MAX_ATTR_COUNT. This presumably relies on attrs
+    // having MAX_ATTR_COUNT+1 entries -- confirm in xml.h.
+    NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
+    if (a == MAX_ATTR_COUNT) {
+      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
+      // Actually we need to still consume the extra attributes so we have an extra one.
+    } else a++;
+  }
+  node->nAttrs = a;
+  if (c == '/') {
+    node->type = NODE_TYPE_SINGLE;
+    // Consume what follows the '/' into a scratch buffer; c becomes the next delimiter.
+    char str[MAX_STR_LEN];
+    NCCLCHECK(xmlGetToken(file, str, NULL, &c));
+  }
+  if (c != '>') {
+    WARN("XML Parse : expected >, got '%c'", c);
+    return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+// Callback invoked by xmlLoadSub for a recognized element. It receives the file
+// being parsed, the XML store and the node that was just read.
+typedef ncclResult_t (*xmlHandlerFunc_t)(FILE*, struct ncclXml*, struct ncclXmlNode*);
+
+// Associates an element name with the callback that loads it.
+struct xmlHandler {
+  const char * name;      // Element name, compared with strcmp in xmlLoadSub
+  xmlHandlerFunc_t func;  // Called when an element with that name is found
+};
+
+// Parses the children of head (or the top-level elements when head is NULL).
+//   handlers/nHandlers : elements whose name matches a handler are stored in xml,
+//                        attached to head, and passed to the handler, which is
+//                        expected to load their own content.
+// Elements without a matching handler are parsed and discarded.
+// Returns when the closing tag of head is found, or at end of file when head is
+// NULL. Returns ncclInternalError on malformed input or when xml is full.
+// NOTE(review): a closing tag met while head is NULL is not handled specially and
+// falls through to the handler lookup.
+ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
+  // A self-closed element (<name ... />) has no children to parse.
+  if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
+  while (1) {
+    if (xml->maxIndex == MAX_NODES) {
+      WARN("Error : XML parser is limited to 1024 nodes\n");
+      return ncclInternalError;
+    }
+    // Parse into the next free slot; the slot is only kept (maxIndex++) if a handler matches.
+    struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
+    memset(node, 0, sizeof(struct ncclXmlNode));
+    NCCLCHECK(xmlGetNode(file, node));
+    if (node->type == NODE_TYPE_NONE) {
+      // End of file: only valid at the top level.
+      if (head) {
+        WARN("XML Parse : unterminated %s", head->name);
+        return ncclInternalError;
+      } else {
+        // All done
+        return ncclSuccess;
+      }
+    }
+    if (head && node->type == NODE_TYPE_CLOSE) {
+      // The closing tag must match the element being parsed.
+      if (strcmp(node->name, head->name) != 0) {
+        WARN("XML Mismatch : %s / %s", head->name, node->name);
+        return ncclInternalError;
+      }
+      return ncclSuccess;
+    }
+    int found = 0;
+    for (int h=0; h<nHandlers; h++) {
+      if (strcmp(node->name, handlers[h].name) == 0) {
+        // NOTE(review): no bound on head->nSubs is checked here -- confirm the size of subs in xml.h.
+        if (head) head->subs[head->nSubs++] = node;
+        node->parent = head;
+        node->nSubs = 0;
+        xml->maxIndex++;
+        NCCLCHECK(handlers[h].func(file, xml, node));
+        found = 1;
+        break;
+      }
+    }
+    if (!found) {
+      if (nHandlers) INFO(NCCL_GRAPH, "Ignoring element %s", node->name);
+      // Consume the content of the unknown element. maxIndex was not incremented,
+      // so the nested call parses into the same slot as node.
+      NCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0));
+    }
+  }
+}
+
+/**************/
+/* XML Writer */
+/**************/
+
+// Writes node and, recursively, all of its sub-nodes to file as XML text.
+//   indent : number of spaces printed before the tag; children use indent+2.
+// A node without sub-nodes is written as a self-closed tag.
+ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) {
+  // Opening tag with its attributes
+  for (int col=0; col<indent; col++) fprintf(file, " ");
+  fprintf(file, "<%s", node->name);
+  for (int attr=0; attr<node->nAttrs; attr++) {
+    fprintf(file, " %s=\"%s\"", node->attrs[attr].key, node->attrs[attr].value);
+  }
+
+  // Leaf node : close it right away
+  if (node->nSubs == 0) {
+    fprintf(file, "/>\n");
+    return ncclSuccess;
+  }
+
+  // Children, then the closing tag at the same indentation as the opening one
+  fprintf(file, ">\n");
+  for (int sub=0; sub<node->nSubs; sub++) {
+    NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[sub]));
+  }
+  for (int col=0; col<indent; col++) fprintf(file, " ");
+  fprintf(file, "</%s>\n", node->name);
+  return ncclSuccess;
+}
+
+// Writes the XML tree rooted at xml->nodes to the file xmlTopoFile, replacing
+// any existing content. If the file cannot be opened, a warning is issued and
+// ncclSuccess is still returned (dumping is best effort).
+// Returns the error from the dump itself, if any.
+ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) {
+  FILE* file = fopen(xmlTopoFile, "w");
+  if (file == NULL) {
+    WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
+    return ncclSuccess;
+  }
+  // Capture the result instead of using NCCLCHECK so the file is closed on failure too.
+  ncclResult_t res = ncclTopoDumpXmlRec(0, file, xml->nodes);
+  fclose(file);
+  return res;
+}
+
+/****************************************/
+/* Parser rules for our specific format */
+/****************************************/
+
+// Handler for a GPU link element: consumes its content without recognizing
+// any child element (no handlers are passed down).
+ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  struct xmlHandler* const noHandlers = NULL;
+  NCCLCHECK(xmlLoadSub(file, xml, head, noHandlers, 0));
+  return ncclSuccess;
+}
+
+// Handler for a <gpu> element: loads its children, recognizing only the GPU link
+// element, named "xgmi" when built for HIP and "nvlink" otherwise.
+ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  const char* const linkTag = "xgmi";
+#else
+  const char* const linkTag = "nvlink";
+#endif
+  struct xmlHandler handlers[] = { { linkTag, ncclTopoXmlLoadNvlink } };
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+  return ncclSuccess;
+}
+
+// Handler for a <net> element: consumes its content without recognizing
+// any child element (no handlers are passed down).
+ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  struct xmlHandler* const noHandlers = NULL;
+  NCCLCHECK(xmlLoadSub(file, xml, head, noHandlers, 0));
+  return ncclSuccess;
+}
+
+// Handler for a <nic> element: loads its children, recognizing <net> elements.
+ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  struct xmlHandler handlers[] = {
+    { "net", ncclTopoXmlLoadNet }
+  };
+  const int nHandlers = sizeof(handlers)/sizeof(handlers[0]);
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, nHandlers));
+  return ncclSuccess;
+}
+
+// Handler for a <pci> element: loads its children, recognizing nested <pci>
+// elements as well as <gpu> and <nic> elements.
+ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  struct xmlHandler handlers[] = {
+    { "pci", ncclTopoXmlLoadPci },
+    { "gpu", ncclTopoXmlLoadGpu },
+    { "nic", ncclTopoXmlLoadNic}
+  };
+  const int nHandlers = sizeof(handlers)/sizeof(handlers[0]);
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, nHandlers));
+  return ncclSuccess;
+}
+
+// Handler for a <cpu> element: loads its children, recognizing <pci> and <nic> elements.
+ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  struct xmlHandler handlers[] = {
+    { "pci", ncclTopoXmlLoadPci },
+    { "nic", ncclTopoXmlLoadNic }
+  };
+  const int nHandlers = sizeof(handlers)/sizeof(handlers[0]);
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, nHandlers));
+  return ncclSuccess;
+}
+
+// Handler for the <system> root element.
+// Rejects the file with ncclInvalidUsage unless its "version" attribute equals
+// NCCL_TOPO_XML_VERSION, logs the topology name when there is one, then loads
+// the children, recognizing <cpu> elements.
+ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  int version;
+  NCCLCHECK(xmlGetAttrInt(head, "version", &version));
+  if (version != NCCL_TOPO_XML_VERSION) {
+    WARN("XML Topology has wrong version %d, %d needed", version, NCCL_TOPO_XML_VERSION);
+    return ncclInvalidUsage;
+  }
+
+  const char* name;
+  NCCLCHECK(xmlGetAttr(head, "name", &name));
+  if (name == NULL) INFO(NCCL_GRAPH, "Loading unnamed topology");
+  else INFO(NCCL_GRAPH, "Loading topology %s", name);
+
+  struct xmlHandler handlers[] = {
+    { "cpu", ncclTopoXmlLoadCpu }
+  };
+  const int nHandlers = sizeof(handlers)/sizeof(handlers[0]);
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, nHandlers));
+  return ncclSuccess;
+}
+
+// Loads the XML topology file xmlTopoFile into xml, starting from an empty store.
+// If the file cannot be opened, a warning is issued and ncclSuccess is still
+// returned. Parse errors are returned to the caller.
+ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml) {
+  FILE* file = fopen(xmlTopoFile, "r");
+  if (file == NULL) {
+    WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
+    return ncclSuccess;
+  }
+  struct xmlHandler handlers[] = { { "system", ncclTopoXmlLoadSystem } };
+  xml->maxIndex = 0;
+  // Capture the result instead of using NCCLCHECK so the file is closed on parse errors too.
+  ncclResult_t res = xmlLoadSub(file, xml, NULL, handlers, 1);
+  fclose(file);
+  return res;
+}
+
+/**********************/
+/* XML creation       */
+/* from autodetection */
+/**********************/
+
+// Sizes, including the terminating NUL, of a full PCI bus id and of its
+// leading "domain:bus" part.
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+// Copies size characters from src to dst, converting each one to lower case.
+// No terminating NUL is added.
+static void memcpylower(char* dst, const char* src, const size_t size) {
+  // tolower() is only defined for values representable as unsigned char (or EOF),
+  // so go through unsigned char instead of passing a possibly negative char.
+  for (size_t i=0; i<size; i++) dst[i] = (char)tolower((unsigned char)src[i]);
+}
+// Resolves the canonical sysfs path of the PCI device identified by busId.
+// The lower-cased bus id is patched into a /sys/class/pci_bus/... template,
+// which is then resolved with realpath().
+// On success *path holds the buffer returned by realpath(busPath, NULL).
+// Returns ncclSystemError (with *path set to NULL) if the path cannot be resolved.
+static ncclResult_t getPciPath(const char* busId, char** path) {
+  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+  // Offsets of the two placeholders inside the template
+  const size_t busOffset = sizeof("/sys/class/pci_bus/")-1;
+  const size_t devOffset = sizeof("/sys/class/pci_bus/0000:00/../../")-1;
+  memcpylower(busPath+busOffset, busId, BUSID_REDUCED_SIZE-1);
+  memcpylower(busPath+devOffset, busId, BUSID_SIZE-1);
+
+  char* resolved = realpath(busPath, NULL);
+  *path = resolved;
+  if (resolved != NULL) return ncclSuccess;
+  WARN("Could not find real path of %s", busPath);
+  return ncclSystemError;
+}
+
+// Reads the content of the file path/fileName into strValue.
+//   strValue : destination buffer of at least MAX_STR_LEN bytes.
+// At most MAX_STR_LEN bytes are read and the last byte read is replaced by the
+// terminating NUL (presumably the trailing newline of sysfs files).
+// If nothing could be read, strValue is set to the empty string and the problem
+// is only logged: the function always returns ncclSuccess.
+ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
+  char filePath[PATH_MAX];
+  filePath[0] = '\0';
+  // Bounded formatting: a path that does not fit is handled like an unreadable file.
+  int pathLen = snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName);
+  int offset = 0;
+  if (pathLen >= 0 && (size_t)pathLen < sizeof(filePath)) {
+    FILE* file = fopen(filePath, "r");
+    if (file != NULL) {
+      while (feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
+        int len = fread(strValue+offset, 1, MAX_STR_LEN-offset, file);
+        offset += len;
+      }
+      fclose(file);
+    }
+  }
+  if (offset == 0) {
+    strValue[0] = '\0';
+    INFO(NCCL_GRAPH, "Topology detection : could not read %s, ignoring", filePath);
+  } else {
+    strValue[offset-1] = '\0';
+  }
+  return ncclSuccess;
+}
+
+// Reads the file path/fileName and, if it yields a non-empty string, stores
+// that string as attribute attrName of pciNode.
+ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) {
+  char strValue[MAX_STR_LEN];
+  NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
+  const bool hasValue = (strValue[0] != '\0');
+  if (hasValue) {
+    NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue));
+  }
+  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
+  return ncclSuccess;
+}
+
+// Fills in the attributes of a <cpu> XML node that are not already set:
+//   affinity         : read from /sys/devices/system/node/node<numaid>/cpumap
+//   arch             : from the architecture this code is compiled for
+//   vendor           : x86_64 only, from CPUID leaf 0
+//   familyid/modelid : x86_64 only, from CPUID leaf 1
+// Existing attributes are left untouched.
+// Returns ncclInternalError if the affinity must be set but the node has no
+// "numaid" attribute.
+ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index));
+  if (index == -1) {
+    const char* numaId;
+    NCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId));
+    if (numaId == NULL) {
+      WARN("GetXmlFromCpu : could not find CPU numa ID.");
+      return ncclInternalError;
+    }
+    // Set affinity
+    // numaId comes from the XML, so its length is not limited to a few digits:
+    // size the buffer for any attribute value and format with a bound.
+    char cpumaskPath[sizeof("/sys/devices/system/node/node") + MAX_STR_LEN];
+    snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId);
+    NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
+  }
+
+  NCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index));
+  if (index == -1) {
+    // Fill CPU type / vendor / model
+#if defined(__PPC__)
+    NCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64"));
+#elif defined(__aarch64__)
+    NCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64"));
+#elif defined(__x86_64__)
+    NCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64"));
+#endif
+  }
+
+#if defined(__x86_64__)
+  NCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index));
+  if (index == -1) {
+    union {
+      struct {
+        // CPUID 0 String register order
+        uint32_t ebx;
+        uint32_t edx;
+        uint32_t ecx;
+      };
+      char vendor[12];
+    } cpuid0;
+
+    // CPUID overwrites eax as well, so eax is declared as an in/out operand.
+    uint32_t leaf0 = 0;
+    asm volatile("cpuid" : "+a" (leaf0), "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : : "memory");
+    char vendor[13];
+    strncpy(vendor, cpuid0.vendor, 12);
+    vendor[12] = '\0';
+    NCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor));
+  }
+
+  NCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index));
+  if (index == -1) {
+    union {
+      struct {
+        unsigned steppingId:4;
+        unsigned modelId:4;
+        unsigned familyId:4;
+        unsigned processorType:2;
+        unsigned resv0:2;
+        unsigned extModelId:4;
+        unsigned extFamilyId:8;
+        unsigned resv1:4;
+      };
+      uint32_t val;
+    } cpuid1;
+    // Only eax is used, but CPUID also writes ebx, ecx and edx: they must be
+    // declared as clobbered or the compiler may keep live values in them.
+    cpuid1.val = 1;
+    asm volatile("cpuid" : "+a" (cpuid1.val) : : "ebx", "ecx", "edx", "memory");
+    // NOTE(review): the extended family id is shifted by 4 here; kept as is since
+    // consumers of "familyid" may rely on this encoding -- confirm against the
+    // CPUID specification.
+    int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4);
+    int modelId = cpuid1.modelId + (cpuid1.extModelId << 4);
+    NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId));
+    NCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId));
+  }
+#endif
+  return ncclSuccess;
+}
+
+// Return through pciNode the "pci" node whose "busid" attribute equals busId,
+// adding a new parentless "pci" node to xml when no such node exists. The
+// "busid" attribute is (re)written on the returned node in both cases.
+ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) {
+  NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));
+  const int alreadyKnown = (*pciNode != NULL);
+  if (!alreadyKnown) {
+    NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
+  }
+  NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
+  return ncclSuccess;
+}
+
+// Check whether a string is in BDF format or not.
+// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits.
+// There can be trailing chars.
+
+// Returns 1 when c is a hexadecimal digit (0-9, a-f, A-F), 0 otherwise.
+int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }
+
+// Returns 1 when the first 12 chars of bdf match "BBBB:BB:DD.F", 0 otherwise.
+int checkBDFFormat(char* bdf) {
+  // Positions of the hex digits within "BBBB:BB:DD.F".
+  static const int hexPos[] = { 0, 1, 2, 3, 5, 6, 8, 9, 11 };
+  // A string shorter than the 12-char pattern cannot match; stop at its
+  // terminator instead of reading past it.
+  for (int i=0; i<12; i++) {
+    if (bdf[i] == '\0') return 0;
+  }
+  if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0;
+  // Every digit position must be tested individually.
+  for (unsigned int i=0; i<sizeof(hexPos)/sizeof(hexPos[0]); i++) {
+    if (isHex(bdf[hexPos[i]]) == 0) return 0;
+  }
+  return 1;
+}
+
+// Fill a "pci" node from sysfs and link it into the topology tree.
+//  pciNode : node to complete; its "busid" attribute locates the device
+//  xml     : node pool in which a missing parent node is looked up or created
+// The attributes "class", "link_speed" and "link_width" are read from sysfs
+// only when absent. When the node has no parent yet, one is derived from the
+// device's resolved sysfs path: the upstream PCI device (a "pci" node), or a
+// "cpu" node keyed by the device's numa_node when the upstream path component
+// is not a PCI bus ID. The parent is then filled in turn ("pci" parents
+// recursively, "cpu" parents through ncclTopoGetXmlFromCpu).
+// Returns ncclInternalError when no parent can be determined or when the
+// parent cannot accept another child.
+// NOTE(review): `path` is presumably still leaked when one of the NCCLCHECK
+// calls below returns early -- confirm against the macro definition.
+ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) {
+  // Fill info, then parent
+  const char* busId;
+  NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+  char* path = NULL;
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(pciNode, "class", &index));
+  if (index == -1) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
+  }
+  NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
+  if (index == -1) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+    char deviceSpeedStr[MAX_STR_LEN];
+    float deviceSpeed = 0.0f;
+    NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr));
+    int haveDeviceSpeed = (sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed) == 1);
+    char portSpeedStr[MAX_STR_LEN];
+    float portSpeed = 0.0f;
+    NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr));
+    int havePortSpeed = (sscanf(portSpeedStr, "%f GT/s", &portSpeed) == 1);
+    // Keep the slower of the two ends. A speed that could not be parsed is
+    // never preferred over one that could (and is never compared while
+    // uninitialized).
+    int usePortSpeed = havePortSpeed && (!haveDeviceSpeed || portSpeed < deviceSpeed);
+    NCCLCHECK(xmlSetAttr(pciNode, "link_speed", usePortSpeed ? portSpeedStr : deviceSpeedStr));
+  }
+  NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index));
+  if (index == -1) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+    char strValue[MAX_STR_LEN];
+    NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue));
+    int deviceWidth = strtol(strValue, NULL, 0);
+    NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue));
+    int portWidth = strtol(strValue, NULL, 0);
+    NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth)));
+  }
+  struct ncclXmlNode* parent = pciNode->parent;
+  if (parent == NULL) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+
+    // Save that for later in case next step is a CPU
+    char numaIdStr[MAX_STR_LEN];
+    NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr));
+
+    // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI
+    // switch, or stop if we reach a CPU root complex.
+    int slashCount = 0;
+    int parentOffset;
+    for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) {
+      if (path[parentOffset] == '/') {
+        slashCount++;
+        // Cut the path here; `start` then locates the preceding component.
+        path[parentOffset] = '\0';
+        int start = parentOffset - 1;
+        while (start>0 && path[start] != '/') start--;
+        // Check whether the parent path looks like "BBBB:BB:DD.F" or not.
+        if (checkBDFFormat(path+start+1) == 0) {
+          // This is a CPU root complex. Create a CPU tag and stop there.
+          struct ncclXmlNode* topNode;
+          NCCLCHECK(xmlFindTag(xml, "system", &topNode));
+          NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
+          if (parent == NULL) {
+            NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
+            NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
+          }
+        } else if (slashCount == 2) {
+          // Continue on the upper PCI switch
+          for (int i = strlen(path)-1; i>0; i--) {
+            if (path[i] == '/') {
+              NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1));
+              if (parent == NULL) {
+                NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
+                NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1));
+              }
+              break;
+            }
+          }
+        }
+      }
+      if (parent) break;
+    }
+    // The walk can end without a parent (e.g. a path with too few
+    // components); fail cleanly instead of dereferencing NULL.
+    if (parent == NULL) {
+      WARN("Could not find the parent of PCI device %s", busId);
+      free(path);
+      return ncclInternalError;
+    }
+    // subs[] holds MAX_SUBS entries.
+    if (parent->nSubs >= MAX_SUBS) {
+      WARN("Error : too many XML subnodes (max %d)", MAX_SUBS);
+      free(path);
+      return ncclInternalError;
+    }
+    pciNode->parent = parent;
+    parent->subs[parent->nSubs++] = pciNode;
+  }
+  if (strcmp(parent->name, "pci") == 0) {
+    NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
+  } else if (strcmp(parent->name, "cpu") == 0) {
+    NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml));
+  }
+  free(path);
+  return ncclSuccess;
+}
+
+// Create or complete the "gpu" sub-node of a "pci" node.
+//  pciNode    : "pci" node of the GPU
+//  nvmlDev    : NVML handle of the GPU, or NULL to fall back on the HIP runtime
+//  xml        : node pool in which new nodes are allocated
+//  gpuNodeRet : receives the "gpu" node
+// Fills, when absent: "dev" (device index), "sm" (major*10+minor), one link
+// sub-node per directly connected peer ("xgmi" on HIP builds, "nvlink"
+// otherwise) and the "tclass" attribute of each link sub-node.
+// NOTE: when the device index cannot be determined ("dev" == -1) the function
+// returns ncclSuccess early and leaves *gpuNodeRet unwritten.
+ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) {
+  struct ncclXmlNode* gpuNode = NULL;
+  NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode));
+  if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode));
+
+  int index = -1;
+
+  int dev = -1;
+  NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
+  if (index == -1) {
+    if (nvmlDev == NULL) {
+      //WARN("No NVML, trying to use CUDA instead");
+      const char* busId;
+      NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+      if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1;
+    } else {
+      NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
+    }
+    NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
+  }
+  NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
+  if (dev == -1) return ncclSuccess;
+
+  NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
+  if (index == -1) {
+    int cudaMajor, cudaMinor;
+    if (nvmlDev == NULL) {
+      hipDeviceProp_t devProp;
+      CUDACHECK(hipGetDeviceProperties(&devProp, dev));
+      cudaMajor = devProp.major; cudaMinor = devProp.minor;
+    } else {
+      NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
+    }
+    NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor));
+  }
+  int sm;
+  NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm));
+
+  // NOTE(review): this looks for an "nvlink" child of pciNode, while the link
+  // nodes below are added under gpuNode -- confirm this is intended.
+  struct ncclXmlNode* nvlNode = NULL;
+  NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode));
+  if (nvlNode == NULL) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    // XGMI detection: one "xgmi" node per peer device reported one hop away.
+    const char* busId;
+    NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+    if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) return ncclInternalError;
+    int deviceCnt;
+    CUDACHECK(hipGetDeviceCount(&deviceCnt));
+    for (int i=0; i<deviceCnt; i++) {
+      if (i != dev) {
+        uint32_t link_type, hops;
+        if (hipExtGetLinkTypeAndHopCount(dev, i, &link_type, &hops) == hipSuccess) {
+          if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) {
+            char busIdStr[] = "00000000:00:00.0";
+            CUDACHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), i));
+            // Lower-case copy; the copy stops right after the terminator.
+            char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+            for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+              lowerId[c] = tolower(busIdStr[c]);
+              if (busIdStr[c] == 0) break;
+            }
+            NCCLCHECK(xmlGetSubKv(gpuNode, "xgmi", &nvlNode, "target", lowerId));
+            if (nvlNode == NULL) {
+              NCCLCHECK(xmlAddNode(xml, gpuNode, "xgmi", &nvlNode));
+              NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
+              NCCLCHECK(xmlSetAttrInt(nvlNode, "count", 1));
+            }
+          }
+        }
+      }
+    }
+#else
+    // NVML NVLink detection
+    int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : 6;
+
+    if (maxNvLinks > 0 && nvmlDev == NULL) {
+      WARN("No NVML device handle. Skipping nvlink detection.\n");
+      maxNvLinks = 0;
+    }
+
+    for (int l=0; l<maxNvLinks; ++l) {
+      // Check whether we can use this NVLink for P2P
+      unsigned canP2P;
+      if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+      // Make sure the Nvlink is up. The previous call should have trained the link.
+      nvmlEnableState_t isActive;
+      if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+      // Try to figure out what's on the other side of the NVLink
+      nvmlPciInfo_t remoteProc;
+      if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+
+      // Make a lower case copy of the bus ID for calling ncclDeviceType
+      // PCI system path is in lower case
+      char* p = remoteProc.busId;
+      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+        lowerId[c] = tolower(p[c]);
+        if (p[c] == 0) break;
+      }
+
+      // One "nvlink" node per remote target; "count" is the number of links.
+      NCCLCHECK(xmlGetSubKv(gpuNode, "nvlink", &nvlNode, "target", lowerId));
+      if (nvlNode == NULL) {
+        NCCLCHECK(xmlAddNode(xml, gpuNode, "nvlink", &nvlNode));
+        NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
+        NCCLCHECK(xmlSetAttrInt(nvlNode, "count", 1));
+      } else {
+        int count;
+        NCCLCHECK(xmlGetAttrInt(nvlNode, "count", &count));
+        NCCLCHECK(xmlSetAttrInt(nvlNode, "count", count+1));
+      }
+    }
+#endif
+  }
+  // Fill target classes
+  for (int s=0; s<gpuNode->nSubs; s++) {
+    struct ncclXmlNode* sub = gpuNode->subs[s];
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    if (strcmp(sub->name, "xgmi") != 0) continue;
+#else
+    if (strcmp(sub->name, "nvlink") != 0) continue;
+#endif
+    int index;
+    NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index));
+    if (index == -1) {
+      const char* busId;
+      NCCLCHECK(xmlGetAttr(sub, "target", &busId));
+      char* path;
+      NCCLCHECK(getPciPath(busId, &path));
+      // getPciPath hands back a realpath() allocation: release it whether or
+      // not the attribute could be set.
+      ncclResult_t setRes = ncclTopoSetAttrFromSys(sub, path, "class", "tclass");
+      free(path);
+      NCCLCHECK(setRes);
+    }
+  }
+  *gpuNodeRet = gpuNode;
+  return ncclSuccess;
+}
+
+// Describe the GPU at PCI address busId in xml: get (or create) its "pci"
+// node, fill that node and its ancestors from sysfs, then fill the "gpu"
+// sub-node, which is returned through gpuNode.
+ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {
+  struct ncclXmlNode* pciNode;
+  NCCLCHECK(ncclTopoGetPciNode(xml, busId, &pciNode));
+  NCCLCHECK(ncclTopoGetXmlFromSys(pciNode, xml));
+
+  NCCLCHECK(wrapNvmlSymbols());
+  NCCLCHECK(wrapNvmlInit());
+
+  // A failed handle lookup is not an error: the GPU is then described
+  // without an NVML device (NULL handle).
+  nvmlDevice_t nvmlDev;
+  const ncclResult_t handleRes = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
+  if (handleRes != ncclSuccess) {
+    nvmlDev = NULL;
+  }
+  NCCLCHECK(ncclTopoGetXmlFromGpu(pciNode, nvmlDev, xml, gpuNode));
+  return ncclSuccess;
+}
+
+// Returns the subsystem name of a path, i.e. the end of the path
+// where sysPath/subsystem points to.
+//  sysPath : sysfs directory of the device
+//  subSys  : output buffer; receives the last component of the resolved
+//            path, or the empty string when the path cannot be resolved.
+//            The copy is unbounded -- presumably callers pass a PATH_MAX
+//            buffer (TODO confirm).
+// Always returns ncclSuccess.
+ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
+  char subSysPath[PATH_MAX];
+  // Bounded formatting; a truncated name is treated as unresolvable.
+  int len = snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath);
+  char* path = NULL;
+  if (len >= 0 && (size_t)len < sizeof(subSysPath)) path = realpath(subSysPath, NULL);
+  if (path == NULL) {
+    subSys[0] = '\0';
+  } else {
+    // Keep what follows the last '/'.
+    const char* lastSlash = strrchr(path, '/');
+    strcpy(subSys, lastSlash ? lastSlash+1 : path);
+    free(path);
+  }
+  return ncclSuccess;
+}
+
+// Add the network device netName to the topology.
+//  xml     : node pool
+//  pciPath : sysfs path of the device, or NULL when it has none
+//  netName : value of the "name" attribute of the "net" node
+//  netNode : receives the "net" node
+// When a "net" node with that name already exists it is returned as is.
+// Otherwise the device is attached, through a "nic" node, to the "pci" node
+// named after the last component of pciPath, or to the first "cpu" node when
+// pciPath is NULL or its subsystem is not "pci".
+// Returns ncclInternalError when the bus ID does not fit its buffer or when
+// there is no node to attach the device to.
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) {
+  NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
+  if (*netNode != NULL) return ncclSuccess;
+
+  const char* pciSysPath = pciPath;
+  if (pciSysPath) {
+    char subSystem[PATH_MAX];
+    NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
+    // This is not a PCI device (virtual, usb, ...).
+    if (strcmp(subSystem, "pci") != 0) {
+      INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
+      pciSysPath = NULL;
+    }
+  }
+
+  struct ncclXmlNode* parent = NULL;
+  if (pciSysPath) {
+    // The bus ID is the last component of the path. strrchr cannot run off
+    // the front of the string when the path contains no '/'.
+    const char* lastSlash = strrchr(pciSysPath, '/');
+    const char* lastComponent = lastSlash ? lastSlash+1 : pciSysPath;
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    if (strlen(lastComponent) >= sizeof(busId)) {
+      WARN("Topology detection: PCI bus ID %s of network %s is too long", lastComponent, netName);
+      return ncclInternalError;
+    }
+    strcpy(busId, lastComponent);
+    NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
+    if (parent == NULL) {
+      NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
+      NCCLCHECK(xmlSetAttr(parent, "busid", busId));
+      NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
+    }
+  } else {
+    // Virtual NIC, no PCI device, attach to first CPU
+    NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
+    if (parent == NULL) {
+      WARN("Topology detection: no CPU node to attach network %s to", netName);
+      return ncclInternalError;
+    }
+  }
+
+  struct ncclXmlNode* nicNode = NULL;
+  NCCLCHECK(xmlGetSub(parent, "nic", &nicNode));
+  if (nicNode == NULL) {
+    NCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode));
+  }
+
+  // We know that this net does not exist yet (we searched for it at the
+  // beginning of this function), so we can add it.
+  NCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode));
+  NCCLCHECK(xmlSetAttr(*netNode, "name", netName));
+  return ncclSuccess;
+}
+
+/**************************************************/
+/* Parser rules for the user-defined graph search */
+/**************************************************/
+
+// Handler for "gpu" elements of a graph file (registered under that tag by
+// ncclTopoXmlGraphLoadChannel). Delegates to xmlLoadSub with an empty handler
+// list, i.e. no sub-element tag is given a handler.
+ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+  return ncclSuccess;
+}
+
+// Handler for "net" elements of a graph file (registered under that tag by
+// ncclTopoXmlGraphLoadChannel). Delegates to xmlLoadSub with an empty handler
+// list, i.e. no sub-element tag is given a handler.
+ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+  return ncclSuccess;
+}
+
+// Handler for "channel" elements of a graph file. Delegates to xmlLoadSub,
+// registering handlers for the "net" and "gpu" sub-element tags.
+ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  struct xmlHandler handlers[] = { { "net", ncclTopoXmlGraphLoadNet }, { "gpu", ncclTopoXmlGraphLoadGpu } };
+  // The last argument is the number of entries in handlers.
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2));
+  return ncclSuccess;
+}
+
+// Handler for "graph" elements of a graph file. Delegates to xmlLoadSub,
+// registering a handler for the "channel" sub-element tag.
+ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+  struct xmlHandler handlers[] = { { "channel", ncclTopoXmlGraphLoadChannel } };
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+  return ncclSuccess;
+}
+
+// Handler for the top-level "graphs" element of a graph file.
+// The element must carry a "version" attribute equal to
+// NCCL_GRAPH_XML_VERSION, otherwise ncclInvalidUsage is returned. The
+// optional "name" attribute is only logged. Sub-elements are then loaded
+// through xmlLoadSub with a handler for the "graph" tag.
+ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
+  int fileVersion;
+  NCCLCHECK(xmlGetAttrInt(head, "version", &fileVersion));
+  if (fileVersion != NCCL_GRAPH_XML_VERSION) {
+    WARN("XML Graph has wrong version %d, %d needed", fileVersion, NCCL_GRAPH_XML_VERSION);
+    return ncclInvalidUsage;
+  }
+
+  const char* topoName;
+  NCCLCHECK(xmlGetAttr(head, "name", &topoName));
+  if (topoName == NULL) {
+    INFO(NCCL_GRAPH, "Loading graphs");
+  } else {
+    INFO(NCCL_GRAPH, "Loading graphs for topology %s", topoName);
+  }
+
+  struct xmlHandler graphHandlers[] = { { "graph", ncclTopoXmlGraphLoadGraph } };
+  NCCLCHECK(xmlLoadSub(file, xmlGraph, head, graphHandlers, 1));
+  return ncclSuccess;
+}
+
+// Load a graph description from the file xmlGraphFile into xml.
+//  xmlGraphFile : path of the XML file to read
+//  xml          : node pool to fill; it is reset (maxIndex = 0) first
+// Returns ncclSystemError when the file cannot be opened, otherwise the
+// result of loading it with a handler for the top-level "graphs" tag.
+ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml) {
+  FILE* file = fopen(xmlGraphFile, "r");
+  if (file == NULL) {
+    WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno));
+    return ncclSystemError;
+  }
+  struct xmlHandler handlers[] = { { "graphs", ncclTopoXmlGraphLoadGraphs } };
+  xml->maxIndex = 0;
+  // Close the file before propagating a load error so the FILE is not leaked.
+  ncclResult_t loadRes = xmlLoadSub(file, xml, NULL, handlers, 1);
+  fclose(file);
+  NCCLCHECK(loadRes);
+  return ncclSuccess;
+}
@@ -0,0 +1,237 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef XML_H_
+#define XML_H_
+
+// A few constraints to make the implementation easy
+// MAX_STR_LEN    : size in bytes of the name/key/value buffers of ncclXmlNode
+// MAX_ATTR_COUNT : attributes per node (ncclXmlNode.attrs has one extra slot)
+// MAX_SUBS       : children per node (size of ncclXmlNode.subs)
+// MAX_NODES      : nodes per document (size of ncclXml.nodes)
+#define MAX_STR_LEN 256
+#define MAX_ATTR_COUNT 16
+#define MAX_SUBS 32
+#define MAX_NODES 1024
+
+// Node type constants, presumably the values stored in ncclXmlNode.type
+// (not set by any code visible in this header -- TODO confirm).
+#define NODE_TYPE_NONE 0
+#define NODE_TYPE_OPEN 1
+#define NODE_TYPE_CLOSE 2
+#define NODE_TYPE_SINGLE 3
+
+// One XML element: a tag name, a list of key/value attributes and links to
+// its parent and children. All strings live in fixed-size buffers.
+struct ncclXmlNode {
+  // Tag name; this is what xmlFindTag/xmlGetSub compare against.
+  char name[MAX_STR_LEN];
+  struct {
+    char key[MAX_STR_LEN];
+    char value[MAX_STR_LEN];
+  } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
+  // Number of entries of attrs in use.
+  int nAttrs;
+  // Presumably one of the NODE_TYPE_* values -- TODO confirm.
+  int type;
+  // Parent node, or NULL when the node has none.
+  struct ncclXmlNode* parent;
+  // Children; the first nSubs entries are valid.
+  struct ncclXmlNode* subs[MAX_SUBS];
+  int nSubs;
+};
+
+// An XML document stored as a fixed pool of nodes.
+struct ncclXml {
+  // Node storage; xmlAddNode hands out entries in increasing index order.
+  struct ncclXmlNode nodes[MAX_NODES];
+  // Number of nodes in use; the xmlFindTag* functions scan [0, maxIndex).
+  int maxIndex;
+};
+
+/* File functions */
+#define NCCL_TOPO_XML_VERSION 1
+ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml);
+ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml);
+#define NCCL_GRAPH_XML_VERSION 1
+ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml);
+
+/* Auto-detect functions */
+ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
+
+/**************/
+/* XML Struct */
+/* Functions  */
+/**************/
+
+// Find the position of attribute attrName in node.
+// *index receives the index of the first attribute whose key matches (keys
+// are compared over at most MAX_STR_LEN-1 chars), or -1 when there is none.
+// Always returns ncclSuccess.
+static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) {
+  const int count = node->nAttrs;
+  int match = -1;
+  int a = 0;
+  while (a < count && match == -1) {
+    if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) match = a;
+    a++;
+  }
+  *index = match;
+  return ncclSuccess;
+}
+
+// Look up attribute attrName of node. *value receives a pointer to the value
+// stored inside the node, or NULL when the attribute does not exist (which
+// is not an error).
+static ncclResult_t xmlGetAttr(struct ncclXmlNode* node, const char* attrName, const char** value) {
+  int attrIdx;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &attrIdx));
+  if (attrIdx == -1) {
+    *value = NULL;
+  } else {
+    *value = node->attrs[attrIdx].value;
+  }
+  return ncclSuccess;
+}
+
+// Same as xmlGetAttr, but a missing attribute is an error: a warning is
+// logged and ncclInternalError is returned.
+static ncclResult_t xmlGetAttrStr(struct ncclXmlNode* node, const char* attrName, const char** value) {
+  NCCLCHECK(xmlGetAttr(node, attrName, value));
+  if (*value != NULL) return ncclSuccess;
+
+  WARN("Attribute %s of node %s not found", attrName, node->name);
+  return ncclInternalError;
+}
+// Read attribute attrName of node as an integer. The value is parsed by
+// strtol with base 0 (so decimal, 0x-prefixed hex and 0-prefixed octal are
+// accepted). A missing attribute is an error.
+static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName, int* value) {
+  const char* text;
+  NCCLCHECK(xmlGetAttrStr(node, attrName, &text));
+  const long parsed = strtol(text, NULL, 0);
+  *value = parsed;
+  return ncclSuccess;
+}
+
+// Read attribute attrName of node as a float, parsed by strtof.
+// A missing attribute is an error.
+static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
+  const char* text;
+  NCCLCHECK(xmlGetAttrStr(node, attrName, &text));
+  const float parsed = strtof(text, NULL);
+  *value = parsed;
+  return ncclSuccess;
+}
+
+// Find the first node of xml (in allocation order) named tagName.
+// *node receives the node, or NULL when there is none. Always returns
+// ncclSuccess.
+static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) {
+  struct ncclXmlNode* found = NULL;
+  for (int i=0; found == NULL && i<xml->maxIndex; i++) {
+    struct ncclXmlNode* candidate = &xml->nodes[i];
+    if (strcmp(candidate->name, tagName) == 0) found = candidate;
+  }
+  *node = found;
+  return ncclSuccess;
+}
+
+// Find the first node of xml (in allocation order) that is named tagName and
+// whose attribute attrName exists and equals attrValue.
+// *node receives the node, or NULL when there is none.
+static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) {
+  *node = NULL;
+  for (int i=0; i<xml->maxIndex; i++) {
+    struct ncclXmlNode* candidate = &xml->nodes[i];
+    if (strcmp(candidate->name, tagName) != 0) continue;
+
+    const char* candidateValue;
+    NCCLCHECK(xmlGetAttr(candidate, attrName, &candidateValue));
+    if (candidateValue == NULL) continue;
+    if (strcmp(candidateValue, attrValue) != 0) continue;
+
+    *node = candidate;
+    break;
+  }
+  return ncclSuccess;
+}
+
+// Set attribute attrName of node to the string value, adding the attribute
+// when it does not exist yet. Key and value are truncated to MAX_STR_LEN-1
+// chars and always NUL-terminated.
+// Returns ncclInternalError when the node has no free attribute slot.
+static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index == -1) {
+    // attrs[] holds MAX_ATTR_COUNT+1 entries; never write past the last one.
+    if (node->nAttrs > MAX_ATTR_COUNT) {
+      WARN("Error : too many attributes in node %s (max %d)", node->name, MAX_ATTR_COUNT+1);
+      return ncclInternalError;
+    }
+    index = node->nAttrs++;
+    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    // strncpy does not terminate the destination when the source fills it.
+    node->attrs[index].key[MAX_STR_LEN-1] = '\0';
+  }
+  strncpy(node->attrs[index].value, value, MAX_STR_LEN);
+  node->attrs[index].value[MAX_STR_LEN-1] = '\0';
+  return ncclSuccess;
+}
+
+// Set attribute attrName of node to the decimal representation of value,
+// adding the attribute when it does not exist yet. The key is truncated to
+// MAX_STR_LEN-1 chars and always NUL-terminated.
+// Returns ncclInternalError when the node has no free attribute slot.
+static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index == -1) {
+    // attrs[] holds MAX_ATTR_COUNT+1 entries; never write past the last one.
+    if (node->nAttrs > MAX_ATTR_COUNT) {
+      WARN("Error : too many attributes in node %s (max %d)", node->name, MAX_ATTR_COUNT+1);
+      return ncclInternalError;
+    }
+    index = node->nAttrs++;
+    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    // strncpy does not terminate the destination when the source fills it.
+    node->attrs[index].key[MAX_STR_LEN-1] = '\0';
+  }
+  snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
+  return ncclSuccess;
+}
+
+// Set attribute attrName of node to value formatted with "%g", adding the
+// attribute when it does not exist yet. The key is truncated to
+// MAX_STR_LEN-1 chars and always NUL-terminated.
+// Returns ncclInternalError when the node has no free attribute slot.
+static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index == -1) {
+    // attrs[] holds MAX_ATTR_COUNT+1 entries; never write past the last one.
+    if (node->nAttrs > MAX_ATTR_COUNT) {
+      WARN("Error : too many attributes in node %s (max %d)", node->name, MAX_ATTR_COUNT+1);
+      return ncclInternalError;
+    }
+    index = node->nAttrs++;
+    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    // strncpy does not terminate the destination when the source fills it.
+    node->attrs[index].key[MAX_STR_LEN-1] = '\0';
+  }
+  snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
+  return ncclSuccess;
+}
+
+// Find the first child of node named subName.
+// *sub receives the child, or NULL when there is none. Always returns
+// ncclSuccess.
+static ncclResult_t xmlGetSub(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub) {
+  struct ncclXmlNode* found = NULL;
+  for (int s=0; found == NULL && s<node->nSubs; s++) {
+    struct ncclXmlNode* child = node->subs[s];
+    if (strcmp(child->name, subName) == 0) found = child;
+  }
+  *sub = found;
+  return ncclSuccess;
+}
+
+// Find the first child of node that is named subName and whose attribute
+// attrName exists and equals attrValue.
+// *sub receives the child, or NULL when there is none.
+static ncclResult_t xmlGetSubKv(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const char* attrValue) {
+  *sub = NULL;
+  for (int s=0; s<node->nSubs; s++) {
+    struct ncclXmlNode* child = node->subs[s];
+    if (strcmp(child->name, subName) != 0) continue;
+
+    const char* childValue;
+    NCCLCHECK(xmlGetAttr(child, attrName, &childValue));
+    if (childValue == NULL) continue;
+    if (strcmp(childValue, attrValue) != 0) continue;
+
+    *sub = child;
+    break;
+  }
+  return ncclSuccess;
+}
+// Same as xmlGetSubKv with an integer attribute value: attrValue is
+// formatted in decimal and compared as a string.
+static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const int attrValue) {
+  // A 32-bit int needs up to 11 chars ("-2147483648") plus the terminator;
+  // a smaller buffer would truncate the value and compare the wrong string.
+  char strValue[12];
+  snprintf(strValue, sizeof(strValue), "%d", attrValue);
+  NCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue));
+  return ncclSuccess;
+}
+
+// Allocate a new node named subName from xml's pool and return it in *sub.
+//  parent : node to attach the new node to, or NULL for a parentless node
+// The new node starts with no attribute and no child. The name is truncated
+// to MAX_STR_LEN-1 chars and always NUL-terminated.
+// Returns ncclInternalError, without allocating anything, when the pool is
+// full or when parent already has MAX_SUBS children.
+static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) {
+  if (xml->maxIndex == MAX_NODES) {
+    WARN("Error : too many XML nodes (max %d)", MAX_NODES);
+    return ncclInternalError;
+  }
+  // Check the parent before consuming a pool entry: subs[] holds MAX_SUBS
+  // entries.
+  if (parent && parent->nSubs >= MAX_SUBS) {
+    WARN("Error : too many XML subnodes (max %d)", MAX_SUBS);
+    return ncclInternalError;
+  }
+  struct ncclXmlNode* s = xml->nodes+xml->maxIndex++;
+  s->nSubs = 0;
+  s->nAttrs = 0;
+  *sub = s;
+  s->parent = parent;
+  if (parent) parent->subs[parent->nSubs++] = s;
+  strncpy(s->name, subName, MAX_STR_LEN);
+  // strncpy does not terminate the destination when the source fills it.
+  s->name[MAX_STR_LEN-1] = '\0';
+  return ncclSuccess;
+}
+
+// Dictionary for STR -> INT conversions. No dictionary size information,
+// there needs to be a last element with str == NULL.
+struct kvDict {
+  // String form of the entry; NULL marks the end of the dictionary.
+  const char* str;
+  // Integer form of the entry.
+  int value;
+};
+
+// Convert str to an int using dict (terminated by an entry with str == NULL).
+// An entry matches when str starts with the entry's string; the first
+// matching entry wins. Returns ncclInternalError, after a warning, when no
+// entry matches.
+static ncclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) {
+  for (struct kvDict* entry = dict; entry->str != NULL; entry++) {
+    const size_t prefixLen = strlen(entry->str);
+    if (strncmp(str, entry->str, prefixLen) != 0) continue;
+    *value = entry->value;
+    return ncclSuccess;
+  }
+  WARN("KV Convert to int : could not find value of '%s' in dictionary", str);
+  return ncclInternalError;
+}
+// Convert value to its string form using dict (terminated by an entry with
+// str == NULL). *str receives the string of the first entry whose value
+// matches. Returns ncclInternalError, after a warning, when no entry matches.
+static ncclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) {
+  for (struct kvDict* entry = dict; entry->str != NULL; entry++) {
+    if (entry->value != value) continue;
+    *str = entry->str;
+    return ncclSuccess;
+  }
+  WARN("KV Convert to str : could not find value %d in dictionary", value);
+  return ncclInternalError;
+}
+
+#endif
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALIGN_H_
+#define NCCL_ALIGN_H_
+
+// Integer division of x by y, rounded up. y is evaluated twice.
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+
+// x rounded up to the next multiple of y. y is evaluated three times.
+#define ROUNDUP(x, y) \
+    (DIVUP((x), (y))*(y))
+
+// Rounds the variable `size` up, in place, to the next multiple of align.
+// `size` must be an lvalue and is not parenthesized in the expansion. The
+// expansion is a complete statement: it already ends with a ';'.
+#define ALIGN_SIZE(size, align) \
+  size = ((size + (align) - 1) / (align)) * (align);
+
+#endif
@@ -10,6 +10,7 @@

 #include "nccl.h"
 #include "checks.h"
+#include "align.h"
 #include <sys/mman.h>

 static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
@@ -61,4 +62,19 @@ static bool hasFineGrainVramPcie() {
  else
    return false;
 }
+
+// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
+// allocated on separate pages as those pages will be marked DONTFORK
+// and if they are shared, that could cause a crash in a child process
+//  ptr  : receives the page-aligned, zero-filled buffer
+//  size : requested size in bytes; the allocation is rounded up to a whole
+//         number of pages
+// Returns ncclSystemError when the page size cannot be queried or when the
+// allocation fails; *ptr is then left untouched.
+static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
+  long page = sysconf(_SC_PAGESIZE);
+  if (page <= 0) return ncclSystemError;
+  size_t page_size = (size_t)page;
+  void* p;
+  // Keep the rounded size in a size_t: an int would truncate large requests.
+  size_t size_aligned = ROUNDUP(size, page_size);
+  int ret = posix_memalign(&p, page_size, size_aligned);
+  if (ret != 0) return ncclSystemError;
+  memset(p, 0, size);
+  *ptr = p;
+  return ncclSuccess;
+}
+
 #endif
@@ -57,7 +57,7 @@
  ncclResult_t res = call; \
  if (res != ncclSuccess) { \
    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
    return res; \
  } \
 } while (0);
@@ -66,7 +66,7 @@
  res = call; \
  if (res != ncclSuccess) { \
    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
    goto label; \
  } \
 } while (0);
@@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COLL_NET_H_
+#define COLL_NET_H_
+
+#include "nccl.h"
+#include "nccl_net.h"
+
+extern ncclCollNet_t* ncclCollNet;
+typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+
+// Translation to external API
+// Each collNetXxx() wrapper below forwards its arguments unchanged to the
+// corresponding entry of the ncclCollNet function table and propagates its
+// result through NCCLCHECK. The wrappers dereference ncclCollNet without
+// checking it, so they must only be called when collNetSupport() returns 1.
+static const char* collNetName() { return ncclCollNet->name; }
+static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; }
+static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
+static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
+static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
+static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
+  NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
+static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
+static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+
+// Returns 1 when a collNet function table is set (ncclCollNet != NULL), 0 otherwise.
+static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; }
+
+#endif
@@ -1,4 +1,3 @@
-#include "hip/hip_runtime.h"
 /*************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
@@ -32,7 +31,8 @@

 #define DECL_COLL3(coll, op, dtype) \
  DECL_COLL4(coll##Ring, op, dtype) \
-  DECL_COLL4(coll##Tree, op, dtype)
+  DECL_COLL4(coll##Tree, op, dtype) \
+  DECL_COLL4(coll##CollNet, op, dtype)

 #define DECL_COLL2(coll, op) \
  DECL_COLL3(coll, op, i8) \
@@ -24,8 +24,6 @@ struct cudaLaunchParams {
 #endif
 #endif

-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
 #define CACHE_LINE_SIZE 64
 #define MEM_ALIGN 4096
 #define CUDA_IPC_MIN 2097152UL
@@ -95,14 +93,11 @@ struct ncclComm {
  // Channels for collectives
  int nChannels;

-  // Only nvlink is used for inter-GPU communication
-  int nvlink;
-
  // Algorithm/Protocols thresholds
  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-  int maxThreads[NCCL_NUM_PROTOCOLS];
+  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];

  // An internal CUDA stream for NCCL kernel CGMD launches
  int groupCudaStream;
@@ -140,6 +135,9 @@ struct ncclComm {
  // Global proxy thread
  pthread_t proxyThread;
  struct ncclProxyState proxyState;
+
+  // Whether this communicator uses collNet
+  int collNetSupport;
 };

 #endif
@@ -55,9 +55,10 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
 #define NCCL_NUM_FUNCTIONS 5
 typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;

-#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
+#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
 #define NCCL_ALGO_TREE 0
 #define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET 2

 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
 #define NCCL_PROTO_LL 0
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -19,7 +19,7 @@ static int hexToInt(char c) {

 #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))

-ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
+ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
  uint32_t cpumasks[CPU_SET_N_U32];
  int m = CPU_SET_N_U32-1;
  cpumasks[m] = 0;
@@ -29,11 +29,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file

 // Let code temporarily downgrade WARN into INFO
 extern thread_local int ncclDebugNoWarn;
-#define NOWARN(a, ret) do { \
-  ncclDebugNoWarn = 1; \
-  ret = a; \
-  ncclDebugNoWarn = 0; \
-} while (0)

 #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
@@ -10,6 +10,7 @@

 #include "nccl.h"
 #include "rccl_bfloat16.h"
+#include "align.h"
 #include <stdint.h>

 // Convert volatile access to atomic
@@ -24,14 +25,6 @@
 #define NCCL_MAX_OPS 2048
 #define NCCL_STEPS 8

-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
 union ncclLLFifoLine {
  /* Flags have to be *after* data, because otherwise, an incomplete receive
     from the network may receive the flag but not the data.
@@ -84,6 +77,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)

+#define NCCL_DIRECT_GPU 0x01
+#define NCCL_DIRECT_NIC 0x10
+
 struct ncclConnInfo {
  // Regular comm mechanism
  char *buff;         // Local for recv, remote for send
@@ -190,6 +186,8 @@ struct ncclChannel {
      struct ncclRing ring;
      struct ncclTree treeUp;
      struct ncclTree treeDn;
+      struct ncclTree collTreeUp;
+      struct ncclTree collTreeDn;

      int id;
      int nthreads;
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -14,17 +14,6 @@
 #include <ctype.h>
 #include <stdio.h>

-enum ncclPathDist {
-  PATH_PIX  = 0,
-  PATH_PXB  = 1,
-  PATH_PHB  = 2,
-  PATH_NODE = 3,
-  PATH_SYS  = 4,
-  PATH_ARRAY_SIZE = 5
-};
-
-extern const char* pathDists[PATH_ARRAY_SIZE];
-
 ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);

 struct ncclTopoSystem;
@@ -36,32 +25,47 @@ ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
 void ncclTopoFree(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
-ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);

 // Query topology
-ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
-ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
-ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
 ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
-ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
-ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p);
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
+
+// Set CPU affinity
+ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
+
+#define NCCL_TOPO_CPU_ARCH_X86 1
+#define NCCL_TOPO_CPU_ARCH_POWER 2
+#define NCCL_TOPO_CPU_ARCH_ARM 3
+#define NCCL_TOPO_CPU_VENDOR_INTEL 1
+#define NCCL_TOPO_CPU_VENDOR_AMD 2
+#define NCCL_TOPO_CPU_TYPE_BDW 1
+#define NCCL_TOPO_CPU_TYPE_SKL 2
+ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);

 #define NCCL_TOPO_MAX_NODES 256

+// Init search. Needs to be done before calling ncclTopoCompute
+ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
+
 #define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
 #define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Split tree (send/recv from different ranks) flowing in both directions
 #define NCCL_TOPO_PATTERN_TREE 3            // Simple tree (send/recv from same rank) flowing in both directions
 #define NCCL_TOPO_PATTERN_RING 4            // Ring
 struct ncclTopoGraph {
  // Input / output
+  int id; // ring : 0, tree : 1, collnet : 2
  int pattern;
  int crossNic;
+  int collNet;
+  int minChannels;
+  int maxChannels;
  // Output
  int nChannels;
-  int speedIntra;
-  int speedInter;
-  int type;
-  int nvlink;
+  float speedIntra;
+  float speedInter;
+  int typeIntra;
+  int typeInter;
  int sameChannels;
  int nHops;
  int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
@@ -70,6 +74,7 @@ struct ncclTopoGraph {
 ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);

 ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs);

 struct ncclTopoRanks {
  int ringRecv[MAXCHANNELS];
@@ -83,12 +88,16 @@ struct ncclTopoRanks {
 };

 ncclResult_t ncclTopoPreset(struct ncclComm* comm,
-    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
    struct ncclTopoRanks* topoRanks);

 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
    struct ncclTopoRanks** allTopoRanks, int* rings);

-ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
+ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+
+ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
+#include "info.h"
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time);

 #endif
@@ -18,7 +18,9 @@ typedef enum {
  ncclPatternPipelineTo,
  ncclPatternTreeUp,
  ncclPatternTreeDown,
-  ncclPatternTreeUpDown
+  ncclPatternTreeUpDown,
+  ncclPatternCollTreeUp,
+  ncclPatternCollTreeDown
 } ncclPattern_t;

 // Used to pass NCCL call information between functions
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,6 +8,7 @@
 #define NCCL_NET_H_

 #include "nccl.h"
+#include <stdint.h>

 #define NCCL_NET_HANDLE_MAXSIZE 64

@@ -20,43 +21,17 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

 typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Return the device path in /sys. NCCL will call free on this path.
-  ncclResult_t (*pciPath)(int dev, char** path);
-  // Return whether this device supports host pointers and/or CUDA pointers
-  // as data from the current GPU. Supported types should be composed with
-  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
-  ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
-  // Finalize connection establishment after remote peer has called connectHandle
-  ncclResult_t (*accept)(void* listenComm, void** recvComm);
-  // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request);
-  // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*flush)(void* recvComm, void* data, int size);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v1_t;
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  int maxComms;   // Maximum number of comms we can create
+}ncclNetProperties_v3_t;
+
+typedef ncclNetProperties_v3_t ncclNetProperties_t;

 typedef struct {
  // Name of the network (mainly for logs)
@@ -65,12 +40,8 @@ typedef struct {
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
-  // Return the device path in /sys. NCCL will call free on this path.
-  ncclResult_t (*pciPath)(int dev, char** path);
-  // Return whether this device supports host pointers and/or CUDA pointers
-  // as data from the current GPU. Supported types should be composed with
-  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
-  ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
@@ -99,10 +70,52 @@ typedef struct {
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v2_t;
+} ncclNet_v3_t;

-typedef ncclNet_v2_t ncclNet_t;
+typedef ncclNet_v3_t ncclNet_t;

-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
+
+typedef struct {
+  // Name of the collective network (mainly used in logs).
+  const char* name;
+  // Initialize the collective network; logFunction is the logging callback to use.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return in *ndev the number of adapters capable of doing collective operations.
+  // If *ndev is set to 0, all other function pointers might be left NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Fill *props with the properties of device dev.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. The handles must have been created
+  // using listen() above. rank is the caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Report whether a reduction operation on a data type is supported:
+  // *supported is 1 when supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/deregister memory. type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Start an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU.
+  ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
+  // Test whether a request is complete. If size is not NULL, *size receives the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free the collective comm and listen comm objects.
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v3_t;
+
+typedef ncclCollNet_v3_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3

 #endif // end include guard
@@ -17,7 +17,7 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 // Translation to external API
 static const char* ncclNetName() { return ncclNet->name; }
 static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
-static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
+static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; }
 static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
@@ -31,33 +31,43 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }

+// Test whether the current GPU supports GPU Direct RDMA.
+// Probes the first net device advertising NCCL_PTR_CUDA: a GPU buffer is
+// allocated and registered on both ends of a loopback connection.
+// *gdrSupport is set to 1 if the registrations succeed, 0 otherwise.
+// Returns ncclSuccess unless a step other than the first registration fails;
+// every acquired resource is released on all paths.
 #define GPU_BUF_SIZE (2*1024*1024)
-static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) {
-  int support;
-  NCCLCHECK(ncclNet->ptrSupport(dev, &support));
-  *supportedTypes = support & ~NCCL_PTR_CUDA;
-  // The network supports GPU Direct RDMA ; verify the GPU supports it as well.
-  if (support & NCCL_PTR_CUDA) {
+static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
+  int netDevs;
+  NCCLCHECK(ncclNetDevices(&netDevs));
+  *gdrSupport = 0;
+  for (int dev=0; dev<netDevs; dev++) {
+    // Find a net device which is GDR-capable
+    ncclNetProperties_t props;
+    NCCLCHECK(ncclNet->getProperties(dev, &props));
+    if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    if (!hasFineGrainVramPcie()) continue;
+#endif
+
+    // Allocate memory on the GPU and try to register it on the NIC.
     void *lComm = NULL, *sComm = NULL, *rComm = NULL;
     ncclNetHandle_t handle;
     void* gpuPtr = NULL;
     void* mHandle = NULL;
-    ncclResult_t res;
-    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup);
-    NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
-    NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
-    CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup);
-    NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res);
-    if (res != ncclSuccess) goto cleanup;
-    NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
-    NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup);
-    NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
-    *supportedTypes |= NCCL_PTR_CUDA;
-cleanup:
-    if (gpuPtr) hipFree(gpuPtr);
-    if (rComm) ncclNetCloseRecv(rComm);
-    if (sComm) ncclNetCloseSend(sComm);
-    if (lComm) ncclNetCloseListen(lComm);
+    ncclResult_t res = ncclSuccess;
+    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup);
+    NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
+    NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
+    CUDACHECKGOTO(hipExtMallocWithFlags(&gpuPtr, GPU_BUF_SIZE, hipDeviceMallocFinegrained), res, cleanup);
+    // A failed first registration only means "no GDR", so it is not treated
+    // as an error; warnings are downgraded while probing.
+    ncclDebugNoWarn = NCCL_NET;
+    if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+      NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
+      NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup);
+      NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
+      *gdrSupport = 1;
+    }
+cleanup:
+    // Single exit path: restore the warning level and release whatever was
+    // acquired, keeping the first error encountered.
+    ncclDebugNoWarn = 0;
+    if (gpuPtr != NULL && hipFree(gpuPtr) != hipSuccess && res == ncclSuccess) res = ncclUnhandledCudaError;
+    if (rComm != NULL) { ncclResult_t r = ncclNetCloseRecv(rComm); if (res == ncclSuccess) res = r; }
+    if (sComm != NULL) { ncclResult_t r = ncclNetCloseSend(sComm); if (res == ncclSuccess) res = r; }
+    if (lComm != NULL) { ncclResult_t r = ncclNetCloseListen(lComm); if (res == ncclSuccess) res = r; }
+    if (res != ncclSuccess) return res;
+    // Only the first GDR-capable device is probed.
+    break;
   }
   return ncclSuccess;
 }
@@ -283,6 +283,7 @@ static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char*
 }

 static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+  static int shownIfName = 0;
  int nIfs = 0;
  // Allow user to force the INET socket family selection
  int sock_family = envSocketFamily();
@@ -290,6 +291,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
  char* env = getenv("NCCL_SOCKET_IFNAME");
  if (env && strlen(env) > 1) {
    // Specified by user : find or fail
+    if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
    nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
  } else {
    // Try to automatically pick the right one
@@ -53,6 +53,8 @@ struct ncclProxyArgs {
  int nsteps;
  uint64_t opCount;
  int protocol;
+  ncclDataType_t dtype;
+  ncclRedOp_t redOp;
  int state;   // add component before this line -- it is left out during initialization

  // Internal state
@@ -80,7 +82,7 @@ struct ncclProxyState {

 struct ncclTransportComm {
  ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
-  ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
+  ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
  ncclResult_t (*free)(void*);
  ncclResult_t (*proxy)(struct ncclProxyArgs*);
 };
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -14,7 +14,7 @@ int ncclCudaCompCap();

 // PCI Bus ID <-> int64 conversion functions
 ncclResult_t int64ToBusId(int64_t id, char* busId);
-ncclResult_t busIdToInt64(char* busId, int64_t* id);
+ncclResult_t busIdToInt64(const char* busId, int64_t* id);

 ncclResult_t getBusId(int cudaDev, int64_t *busId);

@@ -37,4 +37,6 @@ static long log2i(long n) {
 return l;
 }

+int busIdToCudaDev(int64_t busId);
+
 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -12,11 +12,10 @@
 #include "transport.h"
 #include "group.h"
 #include "net.h"
+#include "coll_net.h"
 #include "enqueue.h"
 #include "graph.h"
 #include "argcheck.h"
-#include "cpuset.h"
-#include <sched.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <hip/hip_runtime.h>
@@ -27,6 +26,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include "graph/topo.h"

 #define STR2(v) #v
 #define STR(v) STR2(v)
@@ -46,6 +46,7 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
 NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);

 ncclNet_t* ncclNet = NULL;
+ncclCollNet_t* ncclCollNet = NULL;

 // Returns ncclInternalError if anything fails, causing that network to be ignored.
 ncclResult_t initNet(ncclNet_t* net) {
@@ -56,7 +57,15 @@ ncclResult_t initNet(ncclNet_t* net) {
  return ncclSuccess;
 }

-ncclResult_t initNetPlugin(ncclNet_t** net) {
+// Probe a CollNet plugin: run its init() hook, then query its device count.
+// Returns ncclInternalError if either call fails, ncclSystemError if the
+// plugin reports no device, and ncclSuccess otherwise.
+ncclResult_t initCollNet(ncclCollNet_t* collnet) {
+  ncclResult_t initStatus = collnet->init(ncclDebugLog);
+  if (initStatus != ncclSuccess) return ncclInternalError;
+
+  int deviceCount;
+  ncclResult_t devicesStatus = collnet->devices(&deviceCount);
+  if (devicesStatus != ncclSuccess) return ncclInternalError;
+
+  return (deviceCount > 0) ? ncclSuccess : ncclSystemError;
+}
+
+ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) {
  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
  if (netPluginLib == NULL) {
    // dlopen does not guarantee to set errno, but dlerror only gives us a
@@ -72,13 +81,17 @@ ncclResult_t initNetPlugin(ncclNet_t** net) {
  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
  if (extNet == NULL) {
    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
-    goto cleanup;
-  }
-  if (initNet(extNet) == ncclSuccess) {
+  } else if (initNet(extNet) == ncclSuccess) {
    *net = extNet;
+    // Check for CollNet
+    ncclCollNet_t* extCollNet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL));
+    if (extCollNet == NULL) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol.");
+    } else if (initCollNet(extCollNet) == ncclSuccess) {
+      *collnet = extCollNet;
+    }
    return ncclSuccess;
  }
-cleanup:
  if (netPluginLib != NULL) dlclose(netPluginLib);
  return ncclSuccess;
 }
@@ -87,7 +100,7 @@ ncclResult_t initNet() {
  // Always initialize bootstrap network
  NCCLCHECK(bootstrapNetInit());

-  NCCLCHECK(initNetPlugin(&ncclNet));
+  NCCLCHECK(initNetPlugin(&ncclNet, &ncclCollNet));
  if (ncclNet != NULL) return ncclSuccess;
  if (initNet(&ncclNetIb) == ncclSuccess) {
    ncclNet = &ncclNetIb;
@@ -98,6 +111,8 @@ ncclResult_t initNet() {
  return ncclSuccess;
 }

+NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
+
 pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
 static bool initialized = false;
 static ncclResult_t ncclInit() {
@@ -106,6 +121,7 @@ static ncclResult_t ncclInit() {
  if (!initialized) {
    initEnv();
    initNet();
+    INFO(NCCL_INIT, "Using network %s", ncclNetName());
    initialized = true;
  }
  pthread_mutex_unlock(&initLock);
@@ -321,6 +337,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  else
    comm->hostDevComm.collTraceThread = 0;
 #endif
+  comm->collNetSupport = 0;

  *comret = comm;
  return ncclSuccess;
@@ -334,7 +351,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  // Copy userRanks and peers
  for (int r=0; r<comm->nChannels; r++) {
    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
-    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
+    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks+1));
  }

  // Duplicate the dev comm on the device
@@ -374,19 +391,11 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
  info->shmDev = statbuf.st_dev;

  info->busId = comm->busId;
-  int netDevs;

-  NCCLCHECK(ncclNetDevices(&netDevs));
-  for (int n=0; n<netDevs; n++) {
-    int ptrSupport;
-    NCCLCHECK(ncclNetPtrSupport(n, &ptrSupport));
-    if (ptrSupport & NCCL_PTR_CUDA) info->gdrSupport |= (1 << n);
-  }
+  NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
  return ncclSuccess;
 }

-static ncclResult_t setCpuAffinity(int cudaDev);
-
 template <int type>
 static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
  for (int t=0; t<NTRANSPORTS; t++) {
@@ -395,14 +404,8 @@ static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopo
    int ret = 0;
    NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
    if (ret) {
-      cpu_set_t affinitySave;
-      sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-      int cudaDev;
-      CUDACHECK(hipGetDevice(&cudaDev));
-      setCpuAffinity(cudaDev);
      connector->transportComm = transportComm;
      NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
-      sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
      return ncclSuccess;
    }
  }
@@ -509,7 +512,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
  struct ncclConnector* conn;
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
-    if (peer == -1) continue;
+    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].recv;
    if (conn->connected) { ++nSkippedRecv; continue; }
    memset(&connect, 0, sizeof(connect));
@@ -518,7 +521,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
-    if (peer == -1) continue;
+    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].send;
    if (conn->connected) { ++nSkippedSend; continue; }
    memset(&connect, 0, sizeof(connect));
@@ -527,29 +530,148 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
-    if (peer == -1) continue;
+    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].send;
    if (conn->connected) {++nSkippedSend; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
    conn->connected = 1;
  }
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
-    if (peer == -1) continue;
+    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].recv;
    if (conn->connected) {++nSkippedRecv; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
    conn->connected = 1;
  }
  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
  return ncclSuccess;
 }

+extern struct ncclTransport collNetTransport;
+
+// Set up and connect one direction (send or recv) of this channel to the
+// CollNet root, which is modeled as the extra peer at index nranks.
+// All ranks must participate in collNetSetup call
+// type: 0 for send, 1 for recv
+// return: 0 - unsupported (or any step failed), 1 - supported
+// Errors are never returned as raw ncclResult_t codes: every failure path
+// releases the temporary buffers and yields 0.
+static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int collNetChannels, int rank, int nranks,  int masterRank, int masterPeer, int nMasters, int type) {
+  // All locals are declared before the first goto so that jumping to the
+  // cleanup label never crosses an initialization.
+  int rankInCollNet = -1;
+  int supported = 0;
+  int isMaster = (rank == masterRank) ? 1 : 0;
+  int ret = 1; // canConnect() verdict; only masters query it
+  ncclResult_t res = ncclSuccess;
+  struct {
+    int collNetRank;
+    ncclConnect connect;
+  } sendrecvExchange;
+  struct {
+    int isMaster;
+    ncclConnect connect;
+  } *allConnects = NULL;
+  ncclConnect *masterConnects = NULL;
+  struct ncclConnect myConnect;
+  struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
+  struct ncclPeer* root = channel->peers+nranks;
+  struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send;
+  struct ncclTransportComm* transportComm = (type == 1) ? &(collNetTransport.recv) : &(collNetTransport.send);
+
+  // Non-master ranks never run setup() but still publish myConnect in the
+  // AllGather below: make sure no uninitialized bytes are exchanged.
+  memset(&myConnect, 0, sizeof(myConnect));
+  memset(&sendrecvExchange, 0, sizeof(sendrecvExchange));
+
+  // check if we can connect to collnet, whose root is the nranks-th rank
+  peerInfo->rank = nranks;
+  if (isMaster) {
+    NCCLCHECKGOTO(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo), res, cleanup);
+  }
+
+  // send master receives connect info from peer recv master
+  if (isMaster && type == 0) {
+    NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
+    rankInCollNet = sendrecvExchange.collNetRank;
+    INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
+  }
+
+  // select
+  conn->transportComm = transportComm;
+  // setup
+  if (isMaster && ret > 0) {
+    NCCLCHECKGOTO(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->buffSize, channel->id), res, cleanup);
+  }
+  // prepare connect handles
+  NCCLCHECKGOTO(ncclCalloc(&masterConnects, nMasters), res, cleanup);
+  if (type == 1) {  // recv side: AllGather
+    // all ranks must participate
+    NCCLCHECKGOTO(ncclCalloc(&allConnects, nranks), res, cleanup);
+    allConnects[rank].isMaster = isMaster;
+    memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
+    NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
+    // consolidate: pack the masters' handles, in rank order, into masterConnects
+    int c = 0;
+    for (int r = 0; r < nranks; r++) {
+      if (!allConnects[r].isMaster) continue;
+      if (c >= nMasters) {
+        // masterConnects only holds nMasters entries
+        WARN("CollNet : more than %d ranks report being a master", nMasters);
+        goto cleanup;
+      }
+      memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
+      if (r == rank) rankInCollNet = c;
+      c++;
+    }
+  } else { // send side : copy in connect info received from peer recv master
+    if (isMaster) {
+      if (rankInCollNet < 0 || rankInCollNet >= nMasters) {
+        // The index comes from the peer: never use it unchecked
+        WARN("CollNet : invalid collNetRank %d received from rank %d (collNetNranks %d)", rankInCollNet, masterPeer, nMasters);
+        goto cleanup;
+      }
+      memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
+    }
+  }
+  // connect
+  if (isMaster && ret > 0) {
+    NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
+  }
+  // recv side sends connect info to send side
+  if (isMaster && type == 1) {
+    sendrecvExchange.collNetRank = rankInCollNet;
+    memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
+    NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
+    INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
+  }
+  if (ret > 0) {
+    supported = 1;
+  }
+cleanup:
+  // free(NULL) is a no-op, so no guards are needed
+  free(allConnects);
+  free(masterConnects);
+  return supported;
+}
+
+// Agree across all ranks on the outcome of CollNet setup.
+// Every rank contributes collNetSetupFail through an AllGather; if any rank
+// failed, the CollNet resources of every channel are released and
+// comm->collNetSupport is cleared, otherwise it is set to 1.
+// Returns ncclSuccess, or the error of the first failing step.
+static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) {
+  int nranks = comm->nRanks;
+  ncclResult_t res = ncclSuccess;
+  // AllGather collNet setup results
+  int* allGatherFailures = NULL;
+  NCCLCHECK(ncclCalloc(&allGatherFailures, nranks));
+  allGatherFailures[rank] = collNetSetupFail;
+  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int)), res, done);
+  for (int i=0; i<nranks; i++) {
+    if (allGatherFailures[i] != 0) {
+      collNetSetupFail = 1;
+      break;
+    }
+  }
+  if (collNetSetupFail) {
+    if (rank == 0) WARN("Cannot initialize CollNet, using %s instead", ncclNetName());
+    // Set support to 0 first so the flag is consistent even if a free() fails below
+    comm->collNetSupport = 0;
+    // Free collNet resources
+    for (int r=0; r<comm->nChannels; r++) {
+      struct ncclChannel* channel = comm->channels+r;
+      struct ncclPeer* peer = channel->peers+nranks;
+      struct ncclConnector* conns[2] = { &peer->send, &peer->recv };
+      for (int i=0; i<2; i++) {
+        void* resources = conns[i]->transportResources;
+        // Clear the pointer before freeing so that an error return can never
+        // leave a dangling pointer behind (avoid double free)
+        conns[i]->transportResources = NULL;
+        if (resources && conns[i]->transportComm) NCCLCHECKGOTO(conns[i]->transportComm->free(resources), res, done);
+      }
+    }
+  } else {
+    comm->collNetSupport = 1;
+  }
+done:
+  free(allGatherFailures);
+  return res;
+}
+
 NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);

 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
  // We use 3 AllGathers
@@ -575,7 +697,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(fillInfo(comm, myInfo, commHash));
  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));

-  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
  for (int i = 0; i < nranks; i++) {
    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
@@ -594,60 +716,82 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
  // Recompute paths after trimming
  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
-  // Compute max speed to accelerate search
-  NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
+  // Init search
+  NCCLCHECK(ncclTopoSearchInit(comm->topo));
  // Print final topology
  NCCLCHECK(ncclTopoPrint(comm->topo));

  // Get rings and trees
-  struct ncclTopoGraph treeGraph;
-  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
-  treeGraph.crossNic = ncclParamCrossNic();
-  // We communicate only half the data between node with trees on 2 nodes.
-  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
-  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
  struct ncclTopoGraph ringGraph;
+  ringGraph.id = 0;
  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
  ringGraph.crossNic = ncclParamCrossNic();
+  ringGraph.collNet = 0;
+  ringGraph.minChannels = 1;
+  ringGraph.maxChannels = MAXCHANNELS/2;
  NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));

+  struct ncclTopoGraph treeGraph;
+  treeGraph.id = 1;
+  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+  treeGraph.crossNic = ncclParamCrossNic();
+  treeGraph.collNet = 0;
+  treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
+  treeGraph.maxChannels = ringGraph.nChannels;
+  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+
+  struct ncclTopoGraph collNetGraph;
+  collNetGraph.id = 2;
+  collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+  collNetGraph.collNet = 1;
+  collNetGraph.crossNic = ncclParamCrossNic();
+  collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
+  NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
+
+  if (comm->rank == ncclParamGraphDumpFileRank()) {
+    struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
+    NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
+  }
+
  // AllGather3 - begin
+  struct ncclGraphInfo {
+    int sameChannels;
+    float speedIntra;
+    float speedInter;
+    int typeIntra;
+  };

  struct {
    int cudaCompCap;
    int fullCudaCompCap;
-    int nvlink;
    int nChannels;
-    struct {
-      int sameChannels;
-      int speedIntra;
-      int speedInter;
-      int nvlink;
-    } tree;
-    struct {
-      int sameChannels;
-      int speedIntra;
-      int speedInter;
-      int nvlink;
-    } ring;
+    struct ncclGraphInfo tree;
+    struct ncclGraphInfo ring;
+    struct ncclGraphInfo collNet;
    struct ncclTopoRanks topoRanks;
  } *allGather3Data;

  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
-  allGather3Data[rank].nvlink = treeGraph.nvlink;
-  allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
+  allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
+    std::min(treeGraph.nChannels, ringGraph.nChannels);
  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
  allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
  allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
-  allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
+  allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
  allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
  allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
-  allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
+  allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
+  allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
+  allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
+  allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
+  allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;

-  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
+  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));

  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));

@@ -675,9 +819,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
  }

-  comm->nvlink = 1;
-  for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;
-
  int nChannelsOrig = comm->nChannels;
  struct ncclTopoRanks** allTopoRanks;
  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
@@ -688,11 +829,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
    treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
    treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
-    treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
+    treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
    ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
    ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
-    ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
+    ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+    collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
+    collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
+    collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
+    collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
  }

  if (comm->nChannels < nChannelsOrig) {
@@ -705,6 +850,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+  if (comm->nNodes > 1 &&
+      ncclParamCollNetEnable() == 1 &&
+      collNetSupport()) {
+    NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
+  }

  free(allTopoRanks);
  free(nodesFirstRank);
@@ -714,7 +864,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm

  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);

-  NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph));
+  NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));

  char line[1024];
  line[0]='\0';
@@ -728,21 +878,58 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  line[1023] = '\0';
  INFO(NCCL_INIT, "Trees%s", line);

+  // Set affinity to a CPU local to our GPU, so that all memory we allocate
+  // on the host is local.
+  cpu_set_t affinitySave;
+  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank));
+  ncclResult_t ret;
+
  // Connect with prev/next for each ring
  struct ncclConnect *connect;
-  NCCLCHECK(ncclCalloc(&connect, 2));
+  NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
-    NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
+    NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
    if (comm->nRanks == 1) continue;
-    NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
-    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
-    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
+    NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
+    NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
+    NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
+  }
+
+  // Check if we can setup CollNet
+  if (comm->nNodes > 1 &&
+      ncclParamCollNetEnable() == 1 &&
+      collNetSupport()) {
+    int logicChannels = comm->nChannels/2;
+    int collNetSetupFail = 0;
+    const int recvIndex = 0;  // recv GPU index is always 0
+    const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;  // send GPU index depends on topo pattern
+    for (int c=0; c<logicChannels; c++) {
+      struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
+      struct ncclChannel* channelSend = comm->channels+c;
+      NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
+      NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
+      const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
+      const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
+      if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
+        collNetSetupFail = 1;
+      if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
+        collNetSetupFail = 1;
+    }
+    // Verify CollNet setup across ranks
+    NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
  }
  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
  free(connect);
  free(rings);

+  // We should have allocated all buffers, collective fifos, ... we can
+  // restore the affinity.
+affinity_restore:
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  if (ret != ncclSuccess) return ret;
+
  // Compute intra ranks (using AllGather1 data)
  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
  for (int i = 0; i < nranks; i++) {
@@ -771,98 +958,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  return ncclSuccess;
 }

-static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
-  CPU_ZERO_S(sizeof(cpu_set_t), mask);
-  char* cudaPath;
-  NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath));
-  char path[PATH_MAX];
-  strncpy(path, cudaPath, PATH_MAX-1);
-  snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
-  path[PATH_MAX-1] = '\0';
-  int fd;
-  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
-  char affinityStr[sizeof(cpu_set_t)*2 + 1];
-  int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
-  if (r > 0) {
-    affinityStr[r] = '\0';
-    NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
-  }
-  close(fd);
-  free(cudaPath);
-  return ncclSuccess;
-}
-
-NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
-
-static ncclResult_t setCpuAffinity(int cudaDev) {
-  // Query the CPU affinity set we were provided
-  cpu_set_t mask;
-  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
-
-#ifdef ENABLE_TRACE
-  {
-    char affinityStr[sizeof(cpu_set_t)*2];
-    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
-    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
-  }
-#endif
-
-  // Find the CPUs that are local to the supplied GPU
-  cpu_set_t gpuMask;
-  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
-
-#ifdef ENABLE_TRACE
-  {
-    char affinityStr[sizeof(cpu_set_t)*2];
-    NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
-    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
-  }
-#endif
-
-  cpu_set_t finalMask;
-  if (ncclParamIgnoreCpuAffinity())
-    // Ignore the CPU affinity set and use the GPU one instead
-    finalMask = gpuMask;
-  else
-    // Use a subset of the GPU affinity set
-    CPU_AND(&finalMask, &mask, &gpuMask);
-
-  // If there is a non empty set, use it to set affinity
-  if (CPU_COUNT(&finalMask)) {
-    char affinityStr[sizeof(cpu_set_t)*2];
-    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
-    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
-    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
-  }
-  return ncclSuccess;
-}
-
 ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
-  cpu_set_t affinitySave;
-  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-
-  // Make sure all host memory allocation are close to the GPU
-  CUDACHECK(hipSetDevice(cudaDev));
-  NCCLCHECK(setCpuAffinity(cudaDev));
  ncclResult_t res;

+  CUDACHECK(hipSetDevice(cudaDev));
  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);

-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-
  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);

  return ncclSuccess;
 cleanup:
  if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap);
  *newcomm = NULL;
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return res;
 }

@@ -20,7 +20,7 @@ ncclResult_t wrapNvmlShutdown(void) {
 }

 ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
-  return ncclSuccess;
+  return ncclSystemError;
 }

 ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
@@ -29,7 +29,7 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
 }

 ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
-  return ncclSuccess;
+  return ncclSystemError;
 }

 ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
@@ -38,17 +38,16 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min
 }

 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
-  return ncclSuccess;
+  return ncclSystemError;
 }

 ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
-  return ncclSuccess;
+  return ncclSystemError;
 }

 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
    nvmlNvLinkCapability_t capability, unsigned int *capResult) {
-  *capResult = 0;
-  return ncclSuccess;
+  return ncclSystemError;
 }

 ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -26,7 +26,7 @@ ncclResult_t int64ToBusId(int64_t id, char* busId) {
  return ncclSuccess;
 }

-ncclResult_t busIdToInt64(char* busId, int64_t* id) {
+ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
  const int size = strlen(busId);
  char* hexStr;
  NCCLCHECK(ncclCalloc(&hexStr, size));
@@ -101,6 +101,7 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {

  struct ncclPeer* peerComm = args->channel->peers+peer;
  struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+  if (connector->transportComm == NULL) return ncclInternalError;
  if (connector->transportComm->proxy == NULL) return ncclSuccess;

  struct ncclProxyArgs* op;
@@ -131,6 +132,18 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
  }
+  if (pattern == ncclPatternCollTreeUp) {
+    // CollTree up
+    struct ncclTree* tree = &args->channel->collTreeUp;
+    NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
+    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+  }
+  if (pattern == ncclPatternCollTreeDown) {
+    // CollTree down
+    struct ncclTree* tree = &args->channel->collTreeDn;
+    NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
+    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+  }
  return ncclSuccess;
 }

@@ -0,0 +1,431 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "coll_net.h"
+#include "graph.h"
+#include <assert.h>
+
+struct collNetRecvConnectInfo {  // Connect info written by collNetRecvSetup into its connectInfo argument
+  collNetHandle_t collNetHandle;  // Filled by collNetListen; collNetRecvConnect collects one per rank for collNetConnect
+};
+
+struct collNetSendConnectInfo {  // Written by collNetRecvConnect at connectInfos[rank], read back by collNetSendConnect
+  void* collNetComm;  // The recv side's collNetRecvComm; the send side stores it as collNetSendComm
+  void* mhandle;  // Registration handle of the recv data buffer (send side keeps it as recvMhandle)
+  void* llMhandle;  // Registration handle of the recv llData buffer (send side keeps it as llRecvMhandle)
+  struct reqSlot* reqFifo;  // Array of NCCL_STEPS slots allocated by collNetRecvConnect
+};
+
+struct ncclLLDataLine {  // The two data words of an LL fifo line; collNetSendProxy copies them here without the flags
+  uint32_t data1;  // Copied from ncclLLFifoLine::data1
+  uint32_t data2;  // Copied from ncclLLFifoLine::data2
+};
+static_assert(sizeof(struct ncclLLDataLine) == sizeof(union ncclLLFifoLine)>>1, "ncclLLDataLine is not half size of ncclLLFifoLine");  // compile-time size check: exactly half of a fifo line
+
+struct reqSlot {  // One entry of the reqFifo array that the recv side allocates and passes to the send side
+  volatile void* recvBuff;  // collNetSendProxy posts a slot only while this is non-NULL and uses it as the Iallreduce receive buffer
+  volatile int size;  // Set by collNetSendProxy to the size reported by collNetTest once the request is done
+};
+
+struct collNetSendResources {  // Per-connector send state; allocated by collNetSendSetup, released by collNetSendFree
+  void* collNetSendComm;  // Copied from the recv side's connect info in collNetSendConnect
+  struct ncclSendMem* hostSendMem;  // First pointer returned by ncclCudaHostAlloc (presumably the host-side view; confirm against ncclCudaHostAlloc)
+  struct ncclRecvMem* hostRecvMem;  // First pointer returned by ncclCudaHostAlloc; the proxy reads sizesFifo, tail and llBuff through it
+  struct ncclSendMem* devHostSendMem;  // Second pointer from the same ncclCudaHostAlloc call as hostSendMem; published in send->conn
+  struct ncclRecvMem* devHostRecvMem;  // Second pointer from the same ncclCudaHostAlloc call as hostRecvMem; published in send->conn
+  struct ncclLLDataLine* llData;  // NCCL_LL_BUFF_LINES lines from ncclIbMalloc; registered as NCCL_PTR_HOST
+  int netDev;  // Device index returned by ncclTopoGetNetDev
+  int useGdr;  // Result of ncclTopoCheckGdr; selects devRecvMem over devHostRecvMem as the data buffer
+  int buffSize;  // Buffer size passed to collNetSendSetup; length used when registering the data buffer
+  void* sendMhandle;  // collNetRegMr handle for the data buffer
+  void* llSendMhandle;  // collNetRegMr handle for llData
+  void* recvMhandle;  // Recv side's data buffer handle, taken from the connect info
+  void* llRecvMhandle;  // Recv side's llData handle, taken from the connect info
+  struct ncclRecvMem* devRecvMem;  // Allocated with ncclCudaCalloc only when useGdr is set
+  uint64_t step;  // Proxy step counter, rounded up to a multiple of chunkSteps when an operation starts
+  uint64_t llLastCleaning;  // NOTE(review): not referenced in the visible part of this file
+  struct reqSlot* reqFifo;  // Slot array shared with the recv side, taken from the connect info
+  int collNetRank;  // Rank passed to collNetSendConnect
+};
+
+struct collNetRecvResources {  // Per-connector recv state; allocated by collNetRecvSetup, released by collNetRecvFree
+  void* netListenComm;  // Created by collNetListen in collNetRecvSetup; closed at the end of collNetRecvConnect
+  void* collNetRecvComm;  // Created by collNetConnect; closed with collNetCloseColl in collNetRecvFree
+  struct ncclSendMem* hostSendMem;  // First pointer returned by ncclCudaHostAlloc (presumably the host-side view; confirm against ncclCudaHostAlloc)
+  struct ncclRecvMem* hostRecvMem;  // First pointer returned by ncclCudaHostAlloc for the recv memory
+  struct ncclSendMem* devHostSendMem;  // Second pointer from the same ncclCudaHostAlloc call as hostSendMem; published in recv->conn
+  struct ncclRecvMem* devHostRecvMem;  // Second pointer from the same ncclCudaHostAlloc call as hostRecvMem; published in recv->conn
+  struct ncclLLDataLine* llData;  // NCCL_LL_BUFF_LINES lines from ncclIbMalloc; registered as NCCL_PTR_HOST
+  int netDev;  // Device index returned by ncclTopoGetNetDev
+  int useGdr;  // Result of ncclTopoCheckGdr; selects devRecvMem over devHostRecvMem as the data buffer
+  int buffSize;  // Buffer size passed to collNetRecvSetup; length used when registering the data buffer
+  void* mhandle;  // collNetRegMr handle for the data buffer; forwarded to the send side
+  void* llMhandle;  // collNetRegMr handle for llData; forwarded to the send side
+  struct ncclRecvMem* devRecvMem;  // Allocated with ncclCudaCalloc only when useGdr is set
+  uint64_t step;  // NOTE(review): presumably the recv proxy step counter; its use is outside the visible code
+  uint64_t llLastCleaning;  // NOTE(review): not referenced in the visible part of this file
+  struct reqSlot* reqFifo;  // NCCL_STEPS slots allocated in collNetRecvConnect and shared with the send side
+  int collNetRank;  // Rank passed to collNetRecvConnect
+};
+
+/* Determine if we can communicate with the peer.
+ * This implementation reports success unconditionally: *ret is set to 1 and
+ * topo, graph, info1 and info2 are not examined. Always returns ncclSuccess. */
+ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  *ret = 1;
+  return ncclSuccess;
+}
+
+/* Allocate the send-side CollNet resources for one channel and attach them to
+ * the connector.
+ *
+ * The network device and the GPU Direct RDMA decision come from the topology
+ * (ncclTopoGetNetDev / ncclTopoCheckGdr). Host send and receive memory is
+ * always allocated; a device receive buffer is added only when GDR is usable.
+ * A separate host buffer holds LL data with the flags stripped.
+ * peerInfo and connectInfo are not touched here.
+ *
+ * Returns ncclSuccess or the error of the first failing call. */
+ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+  struct collNetSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  // Attach immediately so the resources stay reachable from the connector
+  send->transportResources = resources;
+
+  // Topology queries: which NIC to use and whether it can reach GPU memory
+  NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+
+  const int sendMemBytes = sizeof(struct ncclSendMem);
+  const int recvMemBytes = offsetof(struct ncclRecvMem, buff)+buffSize;
+
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendMemBytes));
+  if (resources->useGdr) {
+    // Staging buffer in device memory for the GDR path
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvMemBytes, true));
+  }
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvMemBytes));
+  NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
+  resources->buffSize = buffSize;
+
+  const char* gdrSuffix = resources->useGdr ? "/GDRDMA" : "";
+  INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
+      gdrSuffix);
+
+  return ncclSuccess;
+}
+
+/* Allocate the recv-side CollNet resources for one channel, attach them to the
+ * connector and start listening on the collective network.
+ *
+ * Buffers are allocated as on the send side: host send/recv memory always, a
+ * device receive buffer only when GDR is usable, plus a host buffer for LL
+ * data. The listen handle produced by collNetListen is written into
+ * connectInfo (viewed as a collNetRecvConnectInfo). peerInfo is not used.
+ *
+ * Returns ncclSuccess or the error of the first failing call. */
+ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+  struct collNetRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  // Attach immediately so the resources stay reachable from the connector
+  recv->transportResources = resources;
+
+  // Topology queries: which NIC to use and whether it can reach GPU memory
+  NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+
+  const int sendMemBytes = sizeof(struct ncclSendMem);
+  const int recvMemBytes = offsetof(struct ncclRecvMem, buff)+buffSize;
+
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendMemBytes));
+  if (resources->useGdr) {
+    // Staging buffer in device memory for the GDR path
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvMemBytes, true));
+  }
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvMemBytes));
+  NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
+  resources->buffSize = buffSize;
+
+  const char* gdrSuffix = resources->useGdr ? "/GDRDMA" : "";
+  INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
+      gdrSuffix);
+
+  // Publish our listen handle through the connect info
+  struct collNetRecvConnectInfo* listenInfo = (struct collNetRecvConnectInfo*) connectInfo;
+  NCCLCHECK(collNetListen(resources->netDev, &listenInfo->collNetHandle, &resources->netListenComm));
+
+  return ncclSuccess;
+}
+
+/* Finish the send-side connection.
+ *
+ * Reads the collNetSendConnectInfo stored at connectInfos[rank] (communicator,
+ * recv-side registration handles and request fifo), registers the send data
+ * buffer and the LL data buffer with that communicator, then publishes the
+ * buffer and head/tail/opCount/fifo pointers in send->conn. nranks is unused.
+ *
+ * Returns ncclSuccess or the error of a failing collNetRegMr call. */
+ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+  struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
+  resources->collNetRank = rank;
+
+  // Pick up what the recv side left for us
+  struct collNetSendConnectInfo* fromRecv = (struct collNetSendConnectInfo*)(connectInfos+rank);
+  resources->reqFifo = fromRecv->reqFifo;
+  resources->collNetSendComm = fromRecv->collNetComm;
+  resources->recvMhandle = fromRecv->mhandle;
+  resources->llRecvMhandle = fromRecv->llMhandle;
+
+  // Data is staged in GPU memory when GPU Direct RDMA is used, otherwise in
+  // host memory; the LL buffer is always on the host.
+  struct ncclRecvMem* dataMem;
+  int dataPtrType;
+  if (resources->useGdr) {
+    dataMem = resources->devRecvMem;
+    dataPtrType = NCCL_PTR_CUDA;
+  } else {
+    dataMem = resources->devHostRecvMem;
+    dataPtrType = NCCL_PTR_HOST;
+  }
+
+  // Register both buffers with the collective network
+  NCCLCHECK(collNetRegMr(resources->collNetSendComm, dataMem->buff, resources->buffSize,
+        dataPtrType, &resources->sendMhandle));
+  NCCLCHECK(collNetRegMr(resources->collNetSendComm, resources->llData,
+        NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &resources->llSendMhandle));
+
+  struct ncclRecvMem* hostRecv = resources->devHostRecvMem;
+  struct ncclSendMem* hostSend = resources->devHostSendMem;
+
+  send->conn.buff = dataMem->buff;
+  send->conn.llBuff = hostRecv->llBuff;
+  if (resources->useGdr) {
+    send->conn.direct |= NCCL_DIRECT_NIC;
+  }
+
+  // Head, tail, opCounts and the sizes fifo always live in host memory
+  send->conn.tail = &hostRecv->tail;
+  send->conn.opCountRem = &hostRecv->opCount;
+  send->conn.fifo = hostRecv->sizesFifo;
+  send->conn.head = &hostSend->head;
+  send->conn.opCountLoc = &hostSend->opCount;
+  // Mark every fifo slot as empty
+  for (int slot=0; slot<NCCL_STEPS; slot++) {
+    send->conn.fifo[slot] = -1;
+  }
+
+  return ncclSuccess;
+}
+
+/* Finish the recv-side connection: publish the connector pointers, join the
+ * collective network communicator, register the receive buffers and hand the
+ * resulting handles to the send side.
+ *
+ * connectInfos : nranks entries. On entry each one is read as a
+ *                collNetRecvConnectInfo (listen handle). On success the entry
+ *                at index rank is overwritten with a collNetSendConnectInfo
+ *                for collNetSendConnect to consume.
+ * nranks, rank : size of, and this rank's position in, the group passed to
+ *                collNetConnect.
+ * recv         : connector whose transportResources were created by
+ *                collNetRecvSetup.
+ *
+ * Once the temporary handle array has been allocated, it is released and the
+ * listen comm is closed on every path, including failures of collNetConnect,
+ * collNetRegMr or the request fifo allocation.
+ *
+ * Returns ncclSuccess, or the error of the failing call. */
+ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+  // Setup device pointers
+  struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recv->transportResources;
+  struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
+  recvResources->collNetRank = rank;
+
+  // Intermediate buffering on GPU for GPU Direct RDMA
+  struct ncclRecvMem* rRecvMem = recvResources->useGdr ? recvResources->devRecvMem : recvResources->devHostRecvMem;
+  recv->conn.buff = rRecvMem->buff;
+  recv->conn.llBuff = recvResources->devHostRecvMem->llBuff;  // recv LL buff always on host
+  recv->conn.direct |= recvResources->useGdr ? NCCL_DIRECT_NIC : 0;
+
+  // Head/Tail/Opcount are always on host
+  recv->conn.tail = &recvResources->devHostRecvMem->tail;
+  recv->conn.opCountLoc = &recvResources->devHostRecvMem->opCount;
+  recv->conn.head = &recvResources->devHostSendMem->head;
+  recv->conn.opCountRem = &recvResources->devHostSendMem->opCount;
+
+  // Connect to coll comm: collNetConnect takes one handle pointer per rank
+  collNetHandle_t** handlePtrs = NULL;
+  NCCLCHECK(ncclCalloc(&handlePtrs, nranks));
+  for (int i = 0; i < nranks; i++) {
+    struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
+    handlePtrs[i] = &(info->collNetHandle);
+  }
+  // From here on every failure must go through cleanup so that handlePtrs is
+  // released and the listen comm is closed.
+  ncclResult_t res;
+  NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, recvResources->netListenComm, &recvResources->collNetRecvComm), res, cleanup);
+
+  // Register buffers
+  NCCLCHECKGOTO(collNetRegMr(recvResources->collNetRecvComm, rRecvMem->buff, recvResources->buffSize,
+        recvResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &recvResources->mhandle), res, cleanup);
+  NCCLCHECKGOTO(collNetRegMr(recvResources->collNetRecvComm, recvResources->llData,
+        NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &recvResources->llMhandle), res, cleanup);
+
+  // Create shared info between send and recv proxies
+  NCCLCHECKGOTO(ncclCalloc(&(recvResources->reqFifo), NCCL_STEPS), res, cleanup);
+
+  // Pass info to send side (only reached when everything above succeeded)
+  sInfo->reqFifo = recvResources->reqFifo;
+  sInfo->collNetComm = recvResources->collNetRecvComm;
+  sInfo->mhandle = recvResources->mhandle;
+  sInfo->llMhandle = recvResources->llMhandle;
+
+cleanup:
+  free(handlePtrs);
+  // Close listen comm
+  NCCLCHECK(collNetCloseListen(recvResources->netListenComm));
+
+  return res;
+}
+
+/* Release everything owned by a send connector's collNetSendResources.
+ *
+ * sendTransportResources : the pointer stored in send->transportResources by
+ *                          collNetSendSetup.
+ *
+ * Memory registrations are dropped before the buffers they cover are freed:
+ * without GDR the data registration made in collNetSendConnect covers the
+ * host receive buffer, so that buffer must outlive the deregistration. This
+ * is the same order collNetRecvFree uses. The communicator itself is not
+ * closed here; collNetRecvFree closes it.
+ *
+ * Returns ncclSuccess or the error of the first failing call. */
+ncclResult_t collNetSendFree(void* sendTransportResources) {
+  struct collNetSendResources* sendResources = (struct collNetSendResources*)sendTransportResources;
+  // collNetSendComm is only set once collNetSendConnect has run
+  if (sendResources->collNetSendComm) {
+    NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->sendMhandle));
+    NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->llSendMhandle));
+  }
+  NCCLCHECK(ncclCudaHostFree(sendResources->hostSendMem));
+  NCCLCHECK(ncclCudaHostFree(sendResources->hostRecvMem));
+  // The device buffer exists only on the GDR path
+  if (sendResources->useGdr)
+    CUDACHECK(hipFree(sendResources->devRecvMem));
+  free(sendResources->llData);
+  free(sendResources);
+  return ncclSuccess;
+}
+
+/* Release everything owned by a recv connector's collNetRecvResources and
+ * close the collective network communicator.
+ *
+ * recvTransportResources : the pointer stored in recv->transportResources by
+ *                          collNetRecvSetup.
+ *
+ * The send side borrows this communicator, so collNetSendFree has to run
+ * before this function.
+ *
+ * Returns ncclSuccess or the error of the first failing call. */
+ncclResult_t collNetRecvFree(void* recvTransportResources) {
+  struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
+  void* const collComm = resources->collNetRecvComm;
+
+  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  if (collComm) {
+    // Drop the registrations made in collNetRecvConnect
+    NCCLCHECK(collNetDeregMr(collComm, resources->mhandle));
+    NCCLCHECK(collNetDeregMr(collComm, resources->llMhandle));
+  }
+  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  if (resources->useGdr) {
+    CUDACHECK(hipFree(resources->devRecvMem));
+  }
+  free(resources->llData);
+  free(resources->reqFifo);
+
+  // Make sure SendFree is called before RecvFree
+  if (collComm) {
+    NCCLCHECK(collNetCloseColl(collComm));
+  }
+  free(resources);
+  return ncclSuccess;
+}
+
+ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
+  if (args->protocol == NCCL_PROTO_LL128) {
+    WARN("CollNet does not support LL128");
+    return ncclInternalError;
+  }
+  struct collNetSendResources* resources = (struct collNetSendResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    STORE(&resources->hostRecvMem->opCount, args->opCount);
+
+    // Round to next multiple of sliceSteps
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    args->idle = 1;
+    struct reqSlot* reqFifo = resources->reqFifo;
+    if (args->head < args->end) {
+      int buffSlot = args->tail%NCCL_STEPS;
+      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS
+          && reqFifo[buffSlot].recvBuff != NULL) {
+        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+        if (args->protocol == NCCL_PROTO_LL) {
+          int size = LOAD(sizesFifo+buffSlot);
+          if (size != -1) {
+            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
+            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+            union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+            int ready = 1;
+            for (int i=0; i<nFifoLines; i++) {
+              volatile uint32_t *f1 = &lines[i].flag1;
+              volatile uint32_t *f2 = &lines[i].flag2;
+              if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
+            }
+            if (ready) {
+              //separate data from flag
+              struct ncclLLDataLine* sendBuff = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
+              for (int i=0; i<nFifoLines; i++) {
+                volatile uint32_t *d1 = &lines[i].data1;
+                volatile uint32_t *d2 = &lines[i].data2;
+                sendBuff[i].data1 = LOAD(d1);
+                sendBuff[i].data2 = LOAD(d2);
+              }
+              int count = nFifoLines*sizeof(struct ncclLLDataLine) / ncclTypeSize(args->dtype);
+              NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->llSendMhandle, resources->llRecvMhandle, args->requests+buffSlot));
+              if (args->requests[buffSlot] != NULL) {
+                TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
+                STORE(sizesFifo+buffSlot, -1);
+                // Make sure size is reset to -1 before we update the head.
+                __sync_synchronize();
+                args->tail += args->sliceSteps;
+                args->idle = 0;
+              }
+            }
+          }
+        } else if (args->tail < LOAD(recvTail)) {
+          int stepSize = args->channel->buffSize/NCCL_STEPS;
+          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+          // Send through network
+          if (LOAD(sizesFifo+buffSlot) != -1) {
+            int count = LOAD(sizesFifo+buffSlot)/ncclTypeSize(args->dtype);
+            NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localMem->buff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->sendMhandle, resources->recvMhandle, args->requests+buffSlot));
+            if (args->requests[buffSlot] != NULL) {
+              TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
+              STORE(sizesFifo+buffSlot, -1);
+              // Make sure size is reset to -1 before we update the head.
+              __sync_synchronize();
+              args->tail += args->sliceSteps;
+              args->idle = 0;
+            }
+          }
+        }
+      }
+      if (args->head < args->tail) {
+        int done, size;
+        int buffSlot = args->head%NCCL_STEPS;
+        NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
+        if (done) {
+          TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size);
+          reqFifo[buffSlot].size = size;
+          // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
+          // (reordered store after store is possible on POWER, though not on x86)
+          __sync_synchronize();
+          reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
+          args->head += args->sliceSteps;
+          STORE(&resources->hostSendMem->head, args->head);
+          args->idle = 0;
+        }
+      }
+    }
+    if (args->head == args->end) {
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpNone;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Proxy progress function for the CollNet receive side.
+// Each call makes at most one "post" and one "complete" step:
+//  - post: publish the next receive buffer through resources->reqFifo[slot].recvBuff;
+//  - complete: once that recvBuff has been cleared (NULL), hand the data over, for LL
+//    by rebuilding the flagged fifo lines, for SIMPLE by (optionally) flushing and
+//    publishing the new tail.
+// Returns ncclInternalError for the LL128 protocol, ncclSuccess otherwise.
+ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
+  if (args->protocol == NCCL_PROTO_LL128) {
+    WARN("CollNet does not support LL128");
+    return ncclInternalError;
+  }
+  struct collNetRecvResources* resources = (struct collNetRecvResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    STORE(&resources->hostSendMem->opCount, args->opCount);
+
+    // Round to next multiple of chunkSteps
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    // Assume no progress; reset to 0 below whenever a step is posted or completed
+    args->idle = 1;
+    // Per-step offset into localBuff: LL steps index the data-only staging area (llData),
+    // other protocols index the channel buffer
+    int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine) : args->channel->buffSize ) / NCCL_STEPS;
+    struct reqSlot* reqFifo = resources->reqFifo;
+    if (args->head < args->end) {
+      struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+      char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)resources->llData : localMem->buff;
+      void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : resources->mhandle;
+      // Post a new buffer only while fewer than NCCL_STEPS steps are outstanding (both
+      // against our own head and against hostSendMem->head) and steps remain to be posted
+      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+        int buffSlot = args->tail%NCCL_STEPS;
+        reqFifo[buffSlot].recvBuff = localBuff+buffSlot*stepSize;
+        TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, localBuff+buffSlot*stepSize);
+        args->tail += args->sliceSteps;
+        args->idle = 0;
+      }
+      if (args->tail > args->head) {
+        int buffSlot = args->head%NCCL_STEPS;
+        if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
+          TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size);
+          args->head += args->sliceSteps;
+          if (args->protocol == NCCL_PROTO_LL) { // ll
+            // re-attach flag
+            // head has already been advanced above, so the flag is the post-increment step value
+            uint32_t flag = args->head;
+            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(resources->hostRecvMem->llBuff)+buffSlot*NCCL_LL_SLICE_LINES;
+            struct ncclLLDataLine* recvData = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
+            int nFifoLines = DIVUP(reqFifo[buffSlot].size, sizeof(struct ncclLLDataLine));
+            // Each 64-bit word carries the flag in its upper 32 bits and one data word in its lower 32 bits
+            for (int i=0; i<nFifoLines; i++) {
+              lines[i].v[0] = ((uint64_t)flag << 32) + recvData[i].data1;
+              lines[i].v[1] = ((uint64_t)flag << 32) + recvData[i].data2;
+            }
+          } else if (args->protocol == NCCL_PROTO_SIMPLE) {
+            // NOTE(review): the result of collNetFlush is not checked here, whereas other collNet
+            // calls in this file are wrapped in NCCLCHECK — confirm whether that is intentional.
+            if (resources->useGdr) collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle);
+            // Publish the new tail through host memory
+            STORE(&resources->hostRecvMem->tail, args->head);
+          }
+          args->idle = 0;
+        }
+      }
+    }
+    // All steps completed: remember where the next operation starts and retire this one
+    if (args->head == args->end) {
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpNone;
+    }
+  }
+  return ncclSuccess;
+}
+
+// CollNet transport descriptor: its name ("COL"), the connectivity check, and the
+// send-side then recv-side setup/connect/free/proxy functions.
+struct ncclTransport collNetTransport = {
+  "COL",
+  collNetCanConnect,
+  { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy },
+  { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy }
+};
@@ -55,52 +55,6 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
  return ncclSuccess;
 }

-NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
-NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
-
-static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
-  *useGdr = 0;
-
-  int cudaDev;
-  CUDACHECK(hipGetDevice(&cudaDev));
-
-  if (!hasFineGrainVramPcie()) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
-    return ncclSuccess;
-  }
-
-  if (read) { // For reads (sends) only enable under certain conditions
-    int gdrReadParam = ncclParamNetGdrRead();
-    if (gdrReadParam == 0) return ncclSuccess;
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    return ncclSuccess;
-#else
-    if (gdrReadParam < 0) {
-       int nvlink;
-       NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
-       if (!nvlink) return ncclSuccess;
-    }
-#endif
-  }
-
-  // Check if we are close enough that it makes sense to enable GDR
-  int netGdrLevel = ncclParamNetGdrLevel();
-  int distance;
-  NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
-  if (distance >= netGdrLevel) {
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
-    return ncclSuccess;
-  }
-
-  // Finally, check if the NIC supports it
-  int flags;
-  NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
-  if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
-  *useGdr = 1;
-  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
-  return ncclSuccess;
-}
-
 /* Determine if we will use this transport for this peer and return connect
 * information for this peer */
 ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
@@ -109,7 +63,7 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  send->transportResources = resources;

  NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
-  NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));

  int sendSize = sizeof(struct ncclSendMem);
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -132,7 +86,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  recv->transportResources = resources;

  NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
-  NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));

  int sendSize = sizeof(struct ncclSendMem);
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -151,7 +105,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  return ncclSuccess;
 }

-ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct netSendResources* resources = (struct netSendResources*)send->transportResources;

@@ -160,6 +114,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  send->conn.buff = recvMem->buff;
  send->conn.llBuff = resources->devHostRecvMem->llBuff;
  send->conn.ll128Buff = recvMem->ll128Buff;
+  send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;

  // Head/Tail/Opcount/Fifos are always on host
  send->conn.tail = &resources->devHostRecvMem->tail;
@@ -184,7 +139,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
 }

 /* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;

@@ -193,6 +148,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  recv->conn.buff = recvMem->buff;
  recv->conn.llBuff = recvMem->llBuff;
  recv->conn.ll128Buff = recvMem->ll128Buff;
+  recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;

  // Head/Tail/Opcount are always on host
  recv->conn.tail = &resources->devHostRecvMem->tail;
@@ -29,13 +29,19 @@
 #define MAXNAMESIZE 64
 static char ncclIbIfName[MAX_IF_NAME_SIZE];
 static union socketAddress ncclIbIfAddr;
+
 static int ncclNIbDevs = -1;
+// One entry per usable IB/RoCE port, filled in by ncclIbInit.
 struct ncclIbDev {
  int device;
+  uint64_t guid;    // devAttr.sys_image_guid of the device
  uint8_t port;
  uint8_t link;
+  int speed;        // ncclIbSpeed(active_speed) * ncclIbWidth(active_width) of the port
  ibv_context* context;
  char devName[MAXNAMESIZE];
+  char* pciPath;    // path produced by ncclIbGetPciPath; NULL when realpath() failed
+  int realPort;     // number of previously registered entries sharing the same pciPath
+  int maxQp;        // devAttr.max_qp, reported as maxComms by ncclIbGetProperties
 };

 #define MAX_IB_PORT 15
@@ -54,20 +60,7 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
 NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
 NCCL_PARAM(IbSl, "IB_SL", 0);
 NCCL_PARAM(IbTc, "IB_TC", 0);
-
-// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
-// allocated on separate pages as those pages will be marked DONTFORK
-// and if they are shared, that could cause a crash in a child process
-static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
-  size_t page_size = sysconf(_SC_PAGESIZE);
-  void* p;
-  int size_aligned = ROUNDUP(size, page_size);
-  int ret = posix_memalign(&p, page_size, size_aligned);
-  if (ret != 0) return ncclSystemError;
-  memset(p, 0, size);
-  *ptr = p;
-  return ncclSuccess;
-}
+// Size threshold used by ncclIbIsend: with USE_RDMA_WRITE, a send whose size exceeds this
+// value is posted as an RDMA_WRITE followed by a 0-byte RDMA_WRITE_WITH_IMM.
+NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);

 pthread_t ncclIbAsyncThread;
 static void* ncclIbAsyncThreadMain(void* args) {
@@ -86,6 +79,39 @@ static void* ncclIbAsyncThreadMain(void* args) {

 NCCL_PARAM(IbDisable, "IB_DISABLE", 0);

+// Resolve the sysfs PCI path of IB device `devName`.
+//   devName  : device name as found under /sys/class/infiniband
+//   path     : receives the string allocated by realpath() with its last character
+//              replaced by '0', or NULL (after a warning) when the path cannot be resolved
+//   realPort : receives the number of already registered devices that resolve to the
+//              same path (0 when the path cannot be resolved)
+// Always returns ncclSuccess; a failed lookup is reported only through *path == NULL.
+static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) {
+  char devicePath[PATH_MAX];
+  snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
+  // Give the caller a defined value even when the lookup below fails
+  *realPort = 0;
+  char* p = realpath(devicePath, NULL);
+  if (p == NULL) {
+    // Pass the string itself: dereferencing it would hand a single char to %s
+    WARN("Could not find real path of %s", devicePath);
+  } else {
+    // Merge multi-port NICs into the same PCI device
+    p[strlen(p)-1] = '0';
+    // And keep the real port aside (the ibv port is always 1 on recent cards)
+    for (int d=0; d<ncclNIbDevs; d++) {
+      // An earlier device whose path could not be resolved has a NULL pciPath
+      if (ncclIbDevs[d].pciPath != NULL && strcmp(p, ncclIbDevs[d].pciPath) == 0) (*realPort)++;
+    }
+  }
+  *path = p;
+  return ncclSuccess;
+}
+
+// Width multipliers, indexed by the position of the lowest set bit of the port's active_width.
+static int ibvWidths[] = { 1, 4, 8, 12 };
+// Speed values (presumably Mb/s per lane — TODO confirm), indexed by the position of the
+// lowest set bit of the port's active_speed.
+static int ibvSpeeds[] = { 2500, 5000, 10000, 10000, 14000, 25000, 50000 };
+// Return the index of the lowest set bit of val among bits [0, max); return max when
+// none of those bits is set.
+static int firstBitSet(int val, int max) {
+  for (int bit = 0; bit < max; bit++) {
+    if ((val & (1 << bit)) != 0) return bit;
+  }
+  return max;
+}
+// Translate an ibv active_width bitmask into a width multiplier (last table entry as fallback).
+static int ncclIbWidth(int width) {
+  int lastIdx = sizeof(ibvWidths)/sizeof(int)-1;
+  int idx = firstBitSet(width, lastIdx);
+  return ibvWidths[idx];
+}
+// Translate an ibv active_speed bitmask into a speed value (last table entry as fallback).
+static int ncclIbSpeed(int speed) {
+  int lastIdx = sizeof(ibvSpeeds)/sizeof(int)-1;
+  int idx = firstBitSet(speed, lastIdx);
+  return ibvSpeeds[idx];
+}
+
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
  if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
  if (ncclParamIbDisable()) return ncclInternalError;
@@ -146,10 +172,14 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
          TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
              portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
          ncclIbDevs[ncclNIbDevs].device = d;
+          ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
          ncclIbDevs[ncclNIbDevs].port = port;
          ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+          ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
          ncclIbDevs[ncclNIbDevs].context = context;
          strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+          NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
+          ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
          ncclNIbDevs++;
          nPorts++;
          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
@@ -181,17 +211,6 @@ ncclResult_t ncclIbDevices(int* ndev) {
  return ncclSuccess;
 }

-ncclResult_t ncclIbPciPath(int dev, char** path) {
-  char devicepath[PATH_MAX];
-  snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName);
-  *path = realpath(devicepath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", devicepath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
 // Detect whether GDR can work on a given NIC with the current CUDA device
 // Returns :
 // ncclSuccess : GDR works
@@ -209,19 +228,24 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
  return ncclSuccess;
 }

-ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
-  *supportedTypes = NCCL_PTR_HOST;
-
-  if (ncclIbGdrSupport(dev) != ncclSuccess) {
-    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
-    return ncclSuccess;
-  }
-  *supportedTypes |= NCCL_PTR_CUDA;
+// Copy the file-level interface address ncclIbIfAddr into *addr.
+static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+  memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
  return ncclSuccess;
 }

-static ncclResult_t GetSocketAddr(union socketAddress* addr) {
-  memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
+// Fill *props with the description of IB device `dev` as recorded in ncclIbDevs.
+// ptrSupport always contains NCCL_PTR_HOST; NCCL_PTR_CUDA is added only when
+// ncclIbGdrSupport(dev) succeeds. Always returns ncclSuccess.
+ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
+  struct ncclIbDev* ibDev = ncclIbDevs + dev;
+  props->name = ibDev->devName;
+  props->pciPath = ibDev->pciPath;
+  props->guid = ibDev->guid;
+  props->ptrSupport = NCCL_PTR_HOST;
+  if (ncclIbGdrSupport(dev) == ncclSuccess) {
+    props->ptrSupport |= NCCL_PTR_CUDA;
+  } else {
+    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ibDev->devName);
+  }
+  props->speed = ibDev->speed;
+  // The ibv port number is offset by the count of earlier devices sharing the same PCI path
+  props->port = ibDev->port + ibDev->realPort;
+  props->maxComms = ibDev->maxQp;
  return ncclSuccess;
 }

@@ -330,7 +354,8 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
  qpInitAttr.send_cq = verbs->cq;
  qpInitAttr.recv_cq = verbs->cq;
  qpInitAttr.qp_type = IBV_QPT_RC;
-  qpInitAttr.cap.max_send_wr = MAX_REQUESTS;
+  // We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM)
+  qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
  qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
  qpInitAttr.cap.max_send_sge = 1;
  qpInitAttr.cap.max_recv_sge = 1;
@@ -632,6 +657,10 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
  wr.opcode = IBV_WR_SEND;
  wr.send_flags = IBV_SEND_SIGNALED;

+  int useAr = 0;
+  if (size > ncclParamIbArThreshold()) {
+    useAr = 1;
+  }
 #if USE_RDMA_WRITE
  __sync_synchronize(); // order the readyPtr load against rkey load below
  // Sanity checks to catch user collective call count/size mismatches
@@ -641,7 +670,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
        size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
    return ncclInternalError;
  }
-  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
  wr.wr.rdma.remote_addr = LOAD(&slot->addr);
  wr.wr.rdma.rkey = LOAD(&slot->rkey);
  wr.imm_data = size; // Send the message size via imm_data
@@ -656,6 +685,19 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo

  struct ibv_send_wr* bad_wr;
  NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+
+#if USE_RDMA_WRITE
+  // When using adaptive routing, send the bulk of the data first as an
+  // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+  // completion.
+  if (useAr) {
+    wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+    wr.sg_list = NULL;
+    wr.num_sge = 0;
+    wr.send_flags &= ~IBV_SEND_SIGNALED;
+    NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+  }
+#endif
  *request = req;
  return ncclSuccess;
 }
@@ -840,8 +882,7 @@ ncclNet_t ncclNetIb = {
  "IB",
  ncclIbInit,
  ncclIbDevices,
-  ncclIbPciPath,
-  ncclIbPtrSupport,
+  ncclIbGetProperties,
  ncclIbListen,
  ncclIbConnect,
  ncclIbAccept,
@@ -20,16 +20,31 @@
 #include <fcntl.h>

 /* Init functions */
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
 static int ncclNetIfs = -1;
+// Per-interface state recorded by ncclSocketInit for each interface it found.
+struct ncclSocketDev {
+  union socketAddress addr;        // address copied from the findInterfaces() result
+  char devName[MAX_IF_NAME_SIZE];  // interface name copied from the findInterfaces() result
+  char* pciPath;                   // realpath() of /sys/class/net/<devName>/device, may be NULL
+};
+// Table of the interfaces in use; ncclNetIfs holds the number of valid entries.
+static struct ncclSocketDev ncclSocketDevs[MAX_IFS];
+
 pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;

+// Resolve /sys/class/net/<devName>/device and store the result in *pciPath.
+// The string comes from realpath(); *pciPath is NULL when the link cannot be
+// resolved. Always returns ncclSuccess.
+static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
+  char sysfsLink[PATH_MAX];
+  snprintf(sysfsLink, PATH_MAX, "/sys/class/net/%s/device", devName);
+  // NULL is a valid outcome here (e.g. the file doesn't exist), not an error
+  char* resolved = realpath(sysfsLink, NULL);
+  *pciPath = resolved;
+  return ncclSuccess;
+}
+
 ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
  if (ncclNetIfs == -1) {
    pthread_mutex_lock(&ncclSocketLock);
    if (ncclNetIfs == -1) {
-      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      char names[MAX_IF_NAME_SIZE*MAX_IFS];
+      union socketAddress addrs[MAX_IFS];
+      ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
      if (ncclNetIfs <= 0) {
        WARN("NET/Socket : no interface found");
        return ncclInternalError;
@@ -38,8 +53,11 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
        char addrline[1024];
        line[0] = '\0';
        for (int i=0; i<ncclNetIfs; i++) {
-          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
-              socketToString(&ncclNetIfAddrs[i].sa, addrline));
+          strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
+          memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
+          NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
+          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
+              socketToString(&addrs[i].sa, addrline));
        }
        line[1023] = '\0';
        INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
@@ -50,30 +68,44 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
  return ncclSuccess;
 }

-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
-  *supportedTypes = NCCL_PTR_HOST;
-  return ncclSuccess;
-}
-
 ncclResult_t ncclSocketDevices(int* ndev) {
  *ndev = ncclNetIfs;
  return ncclSuccess;
 }

-ncclResult_t ncclSocketPciPath(int dev, char** path) {
-  char devicepath[PATH_MAX];
-  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
-  *path = realpath(devicepath, NULL);
-  if (*path == NULL) {
-    INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
-    return ncclSystemError;
+// Read the link speed of interface `devName` from /sys/class/net/<devName>/speed.
+//   speed : receives the parsed value, or 10000 (logged as 10 Gbps) when the file
+//           cannot be opened or read, or when the value found is <= 0
+// Always returns ncclSuccess.
+static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
+  *speed = 0;
+  char speedPath[PATH_MAX];
+  // Bounded formatting, consistent with the other sysfs path builders in this file
+  snprintf(speedPath, PATH_MAX, "/sys/class/net/%s/speed", devName);
+  int fd = open(speedPath, O_RDONLY);
+  if (fd != -1) {
+    // Blank-filled and read with room left for the terminator, so the buffer stays NUL-terminated
+    char speedStr[] = "        ";
+    if (read(fd, speedStr, sizeof(speedStr)-1) > 0) {
+      *speed = strtol(speedStr, NULL, 0);
+    }
+    close(fd);
  }
+  if (*speed <= 0) {
+    INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath);
+    *speed = 10000;
+  }
+  return ncclSuccess;
+}
+
+// Fill *props with the description of socket interface `dev` from ncclSocketDevs.
+// Propagates any error returned by ncclSocketGetSpeed, otherwise returns ncclSuccess.
+ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
+  struct ncclSocketDev* sockDev = &ncclSocketDevs[dev];
+  props->name = sockDev->devName;
+  props->pciPath = sockDev->pciPath;
+  // The device index doubles as the GUID
+  props->guid = dev;
+  // Only host memory pointers are advertised
+  props->ptrSupport = NCCL_PTR_HOST;
+  NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
+  props->port = 0;
+  props->maxComms = 65536;
  return ncclSuccess;
 }

 ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
  if (dev >= ncclNetIfs) return ncclInternalError;
-  memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+  memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
  return ncclSuccess;
 }

@@ -197,7 +229,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
    // Auto-detection
    int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
    char vendorPath[PATH_MAX];
-    snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+    snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclSocketDevs[dev].devName);
    char* rPath = realpath(vendorPath, NULL);
    int fd = open(rPath, O_RDONLY);
    free(rPath);
@@ -487,8 +519,7 @@ ncclNet_t ncclNetSocket = {
  "Socket",
  ncclSocketInit,
  ncclSocketDevices,
-  ncclSocketPciPath,
-  ncclSocketPtrSupport,
+  ncclSocketGetProperties,
  ncclSocketListen,
  ncclSocketConnect,
  ncclSocketAccept,
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -47,9 +47,6 @@ struct p2pRecvResources {

 #include <sys/types.h>

-NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
-NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
-
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 int busIdToCudaDev(int64_t busId) {
  int ndev;
@@ -69,96 +66,51 @@ int busIdToCudaDev(int64_t busId) {

 /* Determine if two peers can communicate through p2p */
 ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
-  int cpuCount;
-  NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  int p2pLevel = PATH_ARRAY_SIZE;
-#else
-  // Do not use P2P across sockets by default (provided CUDA permits it).
-  // When we are on a single socket, don't even use P2P through the CPU as
-  // it should be able to sustain two flows to sysmem faster than PCI P2P.
-  int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
+  if (!hasFineGrainVramPcie())  {
+    *ret = 0;
+    return ncclSuccess;
+  }
 #endif
-  if (ncclParamP2pDisable() == 1) p2pLevel = 0;
-  if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
-
-  // Disable P2P
-  *ret = 0;
-
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  if (!hasFineGrainVramPcie()) return ncclSuccess;
-#endif
-
-  if (p2pLevel == 0) return ncclSuccess;

  // Rule out different nodes
-  if (info1->hostHash != info2->hostHash) return ncclSuccess;
+  if (info1->hostHash != info2->hostHash) {
+    *ret = 0;
+    return ncclSuccess;
+  }
+
+  // Check topology / p2p level.
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret));
+  if (*ret == 0) return ncclSuccess;

  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
  int cudaDev1 = busIdToCudaDev(info1->busId);
  int cudaDev2 = busIdToCudaDev(info2->busId);
  if (cudaDev1 == -1 || cudaDev2 == -1) {
-    // Peer's CUDA device is not visible in this process
 #if CUDART_VERSION >= 10010
-    // But in CUDA 10.1 we can still communicate with 'invisible' devices
-    TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
-    // Check for NVLink/NVswitch including P2P access
-    int nvlink;
-    NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
-    if (nvlink > 0) {
-      *ret = 1;
-      return ncclSuccess;
-    }
+    // CUDA 10.1 and later can use P2P with invisible devices.
+    return ncclSuccess;
+#else
+    // Peer's CUDA device is not visible in this process : we can't communicate with it.
+    *ret = 0;
+    return ncclSuccess;
 #endif
-    return ncclSuccess;
  }

-  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);
-
-  // Do not detect topology if we're on the same GPU. Note this is not really supported.
-  if (cudaDev1 == cudaDev2) {
-    *ret = 1;
-    return ncclSuccess;
-  }
-
-  // See if CUDA can do P2P
+  // Check that CUDA can do P2P
  int p2p;
  if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
         cudaDev1, info1->busId, cudaDev2, info2->busId);
+    *ret = 0;
    return ncclSuccess;
  }
-  if (p2p == 0) return ncclSuccess;
-
-  int nvlink = 0;
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  uint32_t link_type, hops;
-  if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) != hipSuccess) {
-    p2p = 0;
+  if (p2p == 0) {
+    INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)",
+         cudaDev1, info1->busId, cudaDev2, info2->busId);
+    *ret = 0;
    return ncclSuccess;
  }
-  static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
-  static unsigned long long link_status_print_once_mask = 0;
-  if (!(link_status_print_once_mask & (1 << (cudaDev1*8 + cudaDev2)))) {
-    INFO(NCCL_INIT, "%d -> %d: link type %s hops %d", cudaDev1, cudaDev2,
-      link_type_name[link_type], hops);
-    link_status_print_once_mask |= (1 << (cudaDev1*8 + cudaDev2));
-  }
-  if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI)
-    nvlink = (hops == 1);
-#else  // Check for NVLink/NVswitch
-  NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
-#endif
-  if (nvlink > 0) {
-    *ret = 1;
-    return ncclSuccess;
-  }
-  // Finally compute the PCI distance and compare with the p2pLevel.
-  int distance;
-  NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
-  if (distance < p2pLevel) {
-    *ret = 1;
-  }
  return ncclSuccess;
 }

@@ -301,13 +253,13 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
 }

 /* Connect/Send to this peer */
-static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
  struct ncclRecvMem* remDevMem;
  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
  if (info->direct) {
    remDevMem = (struct ncclRecvMem*)(info->directPtr);
-    send->conn.direct = 1;
+    send->conn.direct |= NCCL_DIRECT_GPU;
  } else {
    //TRACE_DUMP_IPC(&info->devIpc);
    hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
@@ -339,13 +291,13 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
 }

 /* Connect/Recv from this peer */
-ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
  struct ncclSendMem* remDevMem;
  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
  if (info->direct) {
    remDevMem = (struct ncclSendMem*)(info->directPtr);
-    recv->conn.direct = 1;
+    recv->conn.direct |= NCCL_DIRECT_GPU;
    recv->conn.ptrExchange = &remDevMem->ptrExchange;
  } else {
    //TRACE_DUMP_IPC(&info->devIpc);
@@ -104,7 +104,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
 }

 /* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -129,7 +129,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  return ncclSuccess;
 }

-ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
@@ -8,7 +8,8 @@ HIPCC = $(HIP_PATH)/bin/hipcc
 EXE = topo_expl
 CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE

-files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/graph/search.cc ../../src/graph/connect.cc
+files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \
+	../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc

 all: $(EXE)

@@ -23,397 +23,114 @@ THE SOFTWARE.
 #ifndef MODEL_H_
 #define MODEL_H_

-class CpuDevices {
-private:
-  char *cpuName;
-  int interCpuWidth;
-  int cpuPciWidth;
-  int p2pPciWidth;
-
-public:
-  CpuDevices(const char *cpuname, const int intercpuwidth, const int cpupciwidth, const int p2ppciwidth) :
-    cpuName((char *)cpuname), interCpuWidth(intercpuwidth), cpuPciWidth(cpupciwidth), p2pPciWidth(p2ppciwidth) {}
-
-  CpuDevices() : cpuName(0), interCpuWidth(0), cpuPciWidth(0), p2pPciWidth(0) {}
-
-  ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
-    strcpy(name, cpuName);
-    *interCpu = interCpuWidth;
-    *cpuPci = cpuPciWidth;
-    *p2pPci = p2pPciWidth;
-    return ncclSuccess;
-  }
-};
-
-class GpuDevices {
-private:
-  int nGpus;
-  uint64_t *busIds;
-  char **gpuPciPaths;
-  int *gpuNumaIds;
-  int *connMatrix;
-
-public:
-  GpuDevices(const int ngpus, const uint64_t *busids, const char **gpupcipaths, const int *gpunumaids, const int *connmatrix) :
-    nGpus(ngpus), busIds((uint64_t *)busids), gpuPciPaths((char **)gpupcipaths), gpuNumaIds((int *)gpunumaids), connMatrix((int *)connmatrix) {}
-
-  GpuDevices () : nGpus(0), busIds(0), gpuPciPaths(0), gpuNumaIds(0), connMatrix(0) {}
-
-  int getnDevs() { return nGpus; }
-
-  uint64_t getBusId(int dev) { return busIds[dev]; }
-
-  ncclResult_t getPciPath(char* busId, char** path) {
-    char tempBusId[] = "0000:00:00.0";
-    *path = (char *)malloc(PATH_MAX);
-    int i;
-    for (i = 0; i < nGpus; i++) {
-      NCCLCHECK(int64ToBusId(busIds[i], tempBusId));
-      if (strcmp(busId, tempBusId) == 0)
-        break;
-    }
-    if (i < nGpus)
-      strcpy(*path, gpuPciPaths[i]);
-    else {
-      WARN("Could not find real path of %s", busId);
-      return ncclSystemError;
-    }
-    return ncclSuccess;
-  }
-
-  int p2pCanConnect(int device1, int device2) {
-    // connection matrix are 8 GPUs
-    int dist = connMatrix[device1*8+device2];
-    if (dist == 255)
-      return 0;
-    //if (dist%15 == 0 && dist/15 != 1) {
-    //  return 0;
-    //}
-    return 1;
-  };
-
-  hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
-    // connection matrix are 8 GPUs
-    int dist = connMatrix[device1*8+device2];
-
-    if (dist%15 == 0) {
-      *linktype = 4;
-      *hopcount = dist/15;
-    }
-    else if (dist%20 == 0) {
-      *linktype = 2;
-      *hopcount = dist/20;
-    }
-    else if (dist%36 == 0) {
-      *linktype = 1;
-      *hopcount = dist/36;
-    }
-    return hipSuccess;
-  }
-
-  virtual int getNumaId(char *path) {
-    int n;
-    // search for all GPUs
-    for (n = 0; n < nGpus; n++)
-      if (strcmp(path, gpuPciPaths[n]) == 0)
-        break;
-    if (n < nGpus)
-      return gpuNumaIds[n];
-    return -1;
-  }
-};
-
-class NetDevices {
-private:
-  int nNetDevs;
-  char **netPciPaths;
-  uint64_t *netGuids;    // IB ports on same card share the same GUID
-  int *netNumaIds;
-
-public:
-  NetDevices(const int nnetdevs, const char **netpcipaths, const uint64_t *netguids, const int *netnumaids) :
-    nNetDevs(nnetdevs), netPciPaths((char **)netpcipaths), netGuids((uint64_t *)netguids), netNumaIds((int *)netnumaids) {}
-
-  NetDevices() : nNetDevs(0), netPciPaths(0), netGuids(0), netNumaIds(0) {}
-
-  int getnDevs() { return nNetDevs; }
-
-  ncclResult_t getPciPath(int dev, char** path) {
-    *path = (char *)malloc(PATH_MAX);
-    if (dev < nNetDevs)
-      strcpy(*path, netPciPaths[dev]);
-    else {
-      WARN("Could not find real path of %d", dev);
-      return ncclSystemError;
-    }
-    return ncclSuccess;
-  }
-
-  virtual int getNumaId(char *path) {
-    int n;
-    // search for all NICs
-    for (n = 0; n < nNetDevs; n++)
-      if (strcmp(path, netPciPaths[n]) == 0)
-        break;
-    if (n < nNetDevs)
-      return netNumaIds[n];
-    return -1;
-  }
-
-  uint64_t getIbGuid(char* path) {
-    int n;
-    for (n = 0; n < nNetDevs; n++)
-      if (strcmp(path, netPciPaths[n]) == 0)
-        break;
-    if (n < nNetDevs)
-      return netGuids[n];
-    WARN("Invalid IB path %s", path);
-    return 0;
-  }
-};
+#include <vector>
+#include "topo.h"
+#include "xml.h"
+#include "utils.h"

 class NodeModel {
 private:
-  CpuDevices cpus;
-  GpuDevices gpus;
-  NetDevices netdevs;

 public:
-  int nodeId;
-  int currRank;
-  int firstRank;
+  std::vector<struct ncclTopoSystem*> systems;
  uint64_t hostHash;  // auto-generated
  uint64_t pidHash;   // auto-generated
-  char description[256];
+  int nodeId;
+  int firstRank;
+  int currRank;

-  int rankToCudaDev(int rank) { return rank - firstRank; }
-
-  int getnGpus() { return gpus.getnDevs(); }
-
-  int getnNetDevs() { return netdevs.getnDevs(); }
-
-  ncclResult_t getGpuPciPath(char* busId, char** path) {
-    return gpus.getPciPath(busId, path);
+  /* Build a node model from "<directory of this executable>/models/<xml_file>".
+   * The topology file is parsed once per GPU it describes, so that getSystem()
+   * can hand a distinct ncclTopoSystem to each rank of the node. */
+  NodeModel(const char *xml_file) {
+    char filename[PATH_MAX];
+    // readlink() does not NUL-terminate and returns -1 on failure: keep one
+    // spare byte and terminate explicitly before any string function runs.
+    ssize_t count = readlink("/proc/self/exe", filename, PATH_MAX-1);
+    filename[count > 0 ? count : 0] = 0;
+    // Drop the executable name but keep the trailing '/'. Without any '/',
+    // fall back to a path relative to the current directory.
+    char* slash = strrchr(filename, '/');
+    if (slash) slash[1] = 0;
+    else filename[0] = 0;
+    // Bounded appends: an over-long path is truncated instead of overflowing
+    // filename[], and then simply fails to load below.
+    strncat(filename, "models/", PATH_MAX-1-strlen(filename));
+    strncat(filename, xml_file, PATH_MAX-1-strlen(filename));
+    do {
+      struct ncclTopoSystem* system = NULL;
+      // getNumGpus() dereferences systems[0]; never store a system that failed to load.
+      if (ncclTopoGetSystem(filename, &system) != ncclSuccess || system == NULL) abort();
+      systems.push_back(system);
+    } while ((int)systems.size() < getNumGpus());
+    hostHash = ((uint64_t)rand() << 32) | rand();
+    pidHash = ((uint64_t)rand() << 32) | rand();
  }

-  ncclResult_t getNetPciPath(int dev, char** path) {
+  // Topology copy belonging to the given rank; ranks are taken relative to firstRank, with no bounds check.
+  struct ncclTopoSystem* getSystem(int rank) {
+    const int localIndex = rank - firstRank;
+    return systems[localIndex];
+  }

-    return netdevs.getPciPath(dev, path);
+  // GPU count of the node, read from the first topology copy (systems[0] must exist).
+  int getNumGpus() {
+    const struct ncclTopoSystem* first = systems[0];
+    return first->nodes[GPU].count;
  }

-  uint64_t getGpuBusId(int dev) {
-    return gpus.getBusId(dev);
+  // Returns gpu.dev of the GPU in systems[0] whose rank matches, or -1 when no GPU carries that rank.
+  int rankToCudaDev(int rank) {
+    const auto& gpus = systems[0]->nodes[GPU];
+    for (int g = 0; g < gpus.count; g++) {
+      if (gpus.nodes[g].gpu.rank != rank) continue;
+      return gpus.nodes[g].gpu.dev;
+    }
+    return -1;
  }

-  int p2pCanConnect(int device1, int device2) { return gpus.p2pCanConnect(device1, device2); }
-
-  hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
-    return gpus.getLinkTypeAndHopCount(device1, device2, linktype, hopcount);
+  // Returns the node id (bus id) of the GPU in systems[0] whose rank matches, or -1 when none does.
+  int64_t getGpuBusId(int rank) {
+    const auto& gpus = systems[0]->nodes[GPU];
+    for (int g = 0; g < gpus.count; g++) {
+      if (gpus.nodes[g].gpu.rank != rank) continue;
+      return gpus.nodes[g].id;
+    }
+    return -1;
  }

-  uint64_t getIbGuid(char* path) {
-    return netdevs.getIbGuid(path);
+  // Returns gpu.dev of the GPU in systems[0] whose id equals busId, or -1 when no GPU has that id.
+  int busIdToCudaDev(int64_t busId) {
+    const auto& gpus = systems[0]->nodes[GPU];
+    for (int g = 0; g < gpus.count; g++) {
+      if (gpus.nodes[g].id != busId) continue;
+      return gpus.nodes[g].gpu.dev;
+    }
+    return -1;
  }

+  // Adds firstRank to the rank of every GPU in each of the first getNumGpus() topology copies.
+  void setRanks() {
+    const int gpuCount = getNumGpus();
+    for (int copy = 0; copy < gpuCount; copy++) {
+      auto& gpus = systems[copy]->nodes[GPU];
+      for (int g = 0; g < gpuCount; g++) gpus.nodes[g].gpu.rank += firstRank;
+    }
+  }
+
+  // Connectivity queries of the model: every device pair is reported as connectable (1) for P2P, SHM and NET alike.
+  int p2pCanConnect(int device1, int device2) { return 1; }
  int shmCanConnect(int device1, int device2) { return 1; }
  int netCanConnect(int device1, int device2) { return 1; }

-  virtual int getNumaId(char *path) {
-    int numa = gpus.getNumaId(path);
-    if (numa != -1) return numa;
-    numa = netdevs.getNumaId(path);
-    if (numa != -1) return numa;
-    WARN("Invalid path %s for getNumaId", path);
-    return 0;
-  }
-
-  virtual ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
-    return cpus.getCpuWidths(name, interCpu, cpuPci, p2pPci);
-  }
-
-  NodeModel(CpuDevices cpu, GpuDevices gpu, NetDevices net, const char *desc) :
-    cpus(cpu), gpus(gpu), netdevs(net) {
-      strncpy(description, desc, 256);
-  }
-
-  NodeModel() {}
-
  ~NodeModel() {}
 };

 class NetworkModel {
 private:
-  int nNodes;
  int nRanks;
-  NodeModel nodes[NCCL_TOPO_MAX_NODES];
+  std::vector<NodeModel*> nodes;

 public:
-  void AddNode(NodeModel node) {
-    nodes[nNodes] = node;
-    nodes[nNodes].nodeId = nNodes;
-    nodes[nNodes].firstRank = nRanks;
-    nodes[nNodes].hostHash = ((uint64_t)rand() << 32) | rand();
-    nodes[nNodes].pidHash = ((uint64_t)rand() << 32) | rand();
-    nNodes++;
-    nRanks += node.getnGpus();
+  // Appends node to the network: its id is its insertion index and its ranks start at the current rank total.
+  void AddNode(NodeModel* node) {
+    const int gpusInNode = node->getNumGpus();
+    node->firstRank = nRanks;
+    node->nodeId = nodes.size();
+    node->setRanks();   // needs firstRank, set just above
+    nodes.push_back(node);
+    nRanks += gpusInNode;
  }

-  int GetNNodes() { return nNodes; }
-
-  int GetNRanks() { return nRanks; }
-
+  // Finds the node whose rank range [firstRank, firstRank+getNumGpus()) contains rank, records rank in
+  // that node's currRank, and returns it; returns NULL when no node covers the rank.
  NodeModel* GetNode(int rank) {
-    int node_id;
-
-    if(rank < 0 || rank >= nRanks)
-      return 0;
-
-    for(node_id = nNodes-1; node_id >= 0; node_id--)
-      if(rank >= nodes[node_id].firstRank) break;
-
-    if (node_id >= 0) {
-      nodes[node_id].currRank = rank;
-      return nodes+node_id;
+    for (auto & node : nodes) {
+      if (rank >= node->firstRank && rank < node->firstRank+node->getNumGpus()) {
+        node->currRank = rank;
+        return node;
+      }
    }
-    else
-      return 0;
+    return NULL;
  }

-  NetworkModel() : nNodes(0), nRanks(0) {}
-};
+  int GetNNodes() { return nodes.size(); }
+  int GetNRanks() { return nRanks; }

-
-const static uint64_t busIds_8[] = { 0x1d000, 0x20000, 0x23000, 0x26000, 0x3f000, 0x43000, 0x46000, 0x49000 };
-
-const static char* gpuPciPaths_8[] = {
-  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:08.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0",
-  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:0c.0/0000:1e:00.0/0000:1f:00.0/0000:20:00.0",
-  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:10.0/0000:21:00.0/0000:22:00.0/0000:23:00.0",
-  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:14.0/0000:24:00.0/0000:25:00.0/0000:26:00.0",
-  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:04.0/0000:3d:00.0/0000:3e:00.0/0000:3f:00.0",
-  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:0c.0/0000:41:00.0/0000:42:00.0/0000:43:00.0",
-  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:10.0/0000:44:00.0/0000:45:00.0/0000:46:00.0",
-  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:14.0/0000:47:00.0/0000:48:00.0/0000:49:00.0",
-};
-
-const static int gpuPciNumaIds_8[] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-const static char* netPciPaths_1[] = {
-  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
-};
-
-const static char* netPciPaths_1_1[] = {
-  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
-};
-
-const static uint64_t netGuids_1[] = {
-  0xb8599f030007053aL,
-};
-
-const static int netPciNumaIds_1[] = { 0 };
-
-const static char* netPciPaths_2[] = {
-  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
-  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
-};
-
-const static uint64_t netGuids_2[] = {
-  0xb8599f030007053aL,
-  0x506b4b030027bbf2L,
-};
-
-const static int netPciNumaIds_2[] = { 0, 0 };
-
-const static uint64_t rome_busIds_8[] = { 0x63000, 0x23000, 0x26000, 0x03000, 0xe3000, 0xc3000, 0xc6000, 0xa3000 };
-
-const static char* rome_gpuPciPaths_8[] = {
-  "/sys/devices/pci0000:60/0000:60:03.1/0000:61:00.0/0000:62:00.0/0000:63:00.0",
-  "/sys/devices/pci0000:20/0000:20:01.1/0000:21:00.0/0000:22:00.0/0000:23:00.0",
-  "/sys/devices/pci0000:20/0000:20:03.1/0000:24:00.0/0000:25:00.0/0000:26:00.0",
-  "/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0",
-  "/sys/devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/0000:e2:00.0/0000:e3:00.0",
-  "/sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0/0000:c2:00.0/0000:c3:00.0",
-  "/sys/devices/pci0000:c0/0000:c0:03.1/0000:c4:00.0/0000:c5:00.0/0000:c6:00.0",
-  "/sys/devices/pci0000:a0/0000:a0:03.1/0000:a1:00.0/0000:a2:00.0/0000:a3:00.0",
-};
-
-const static int rome_gpuPciNumaIds_8[] = { 0, 0, 0, 0, 4, 4, 4, 4 };
-
-const static char* rome_netPciPaths_1[] = {
-  "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
-};
-
-const static uint64_t rome_netGuids_1[] = {
-  0xb8599f030007053aL,
-};
-
-const static int rom_netPciNumaIds_1[] = { 0 };
-
-const static char* rome_netPciPaths_2[] = {
-  "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
-  "/sys/devices/pci0000:80/0000:80:01.1/0000:81:00.0",
-};
-
-const static uint64_t rome_netGuids_2[] = {
-  0xb8599f030007053aL,
-  0x506b4b030027bbf2L,
-};
-
-const static int rom_netPciNumaIds_2[] = { 0, 4 };
-
-const int conn_mat_pcie[64] = {
-  0 , 40, 40, 40, 40, 40, 40, 40,
-  40, 0 , 40, 40, 40, 40, 40, 40,
-  40, 40, 0 , 40, 40, 40, 40, 40,
-  40, 40, 40, 0 , 40, 40, 40, 40,
-  40, 40, 40, 40, 0 , 40, 40, 40,
-  40, 40, 40, 40, 40, 0 , 40, 40,
-  40, 40, 40, 40, 40, 40, 0 , 40,
-  40, 40, 40, 40, 40, 40, 40, 0 ,
-};
-
-const int conn_mat_4p2h[64] = {
-  0 , 15, 15, 30, 40, 40, 40, 40,
-  15, 0 , 30, 15, 40, 40, 40, 40,
-  15, 30, 0 , 15, 40, 40, 40, 40,
-  30, 15, 15, 0 , 40, 40, 40, 40,
-  40, 40, 40, 40, 0 , 15, 15, 30,
-  40, 40, 40, 40, 15, 0 , 30, 15,
-  40, 40, 40, 40, 15, 30, 0 , 15,
-  40, 40, 40, 40, 30, 15, 15, 0 ,
-};
-
-const int conn_mat_8p6l[64] = {
-  0 , 15, 15, 15, 15, 30, 15, 15,
-  15, 0 , 15, 15, 30, 15, 15, 15,
-  15, 15, 0 , 15, 15, 15, 15, 30,
-  15, 15, 15, 0 , 15, 15, 30, 15,
-  15, 30, 15, 15, 0 , 15, 15, 15,
-  30, 15, 15, 15, 15, 0 , 15, 15,
-  15, 15, 15, 30, 15, 15, 0 , 15,
-  15, 15, 30, 15, 15, 15, 15, 0 ,
-};
-
-const int conn_mat_8p6l_1[64] = {
- 0 , 15, 15, 30, 15, 15, 15, 15,
- 15, 0 , 30, 15, 15, 15, 15, 15,
- 15, 30, 0 , 15, 15, 15, 15, 15,
- 30, 15, 15, 0 , 15, 15, 15, 15,
- 15, 15, 15, 15, 0 , 15, 15, 30,
- 15, 15, 15, 15, 15, 0 , 30, 15,
- 15, 15, 15, 15, 15, 30, 0 , 15,
- 15, 15, 15, 15, 30, 15, 15, 0 ,
-};
-
-const int conn_mat_rome[64] = {
-  0 , 40, 40, 40, 72, 72, 72, 72,
-  40, 0 , 40, 40, 72, 72, 72, 72,
-  40, 40, 0 , 40, 72, 72, 72, 72,
-  40, 40, 40, 0 , 72, 72, 72, 72,
-  72, 72, 72, 72, 0 , 40, 40, 40,
-  72, 72, 72, 72, 40, 0 , 40, 40,
-  72, 72, 72, 72, 40, 40, 0 , 40,
-  72, 72, 72, 72, 40, 40, 40, 0 ,
+  NetworkModel() : nRanks(0) {}
 };

 #endif
@@ -12,11 +12,11 @@
 #include <hip/hip_fp16.h>

 #define NCCL_MAJOR 2
-#define NCCL_MINOR 5
-#define NCCL_PATCH 6
+#define NCCL_MINOR 6
+#define NCCL_PATCH 2
 #define NCCL_SUFFIX ""

-#define NCCL_VERSION_CODE 2506
+#define NCCL_VERSION_CODE 2602
 #define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))

 #define RCCL_BFLOAT16 1
@@ -13,32 +13,34 @@ struct allGather1Data_t {
  struct ncclComm* comm;
 };

-struct allGather3Data_t {
+// AllGather3 - begin
+// Per-graph summary embedded in allGather3Data_t, one instance each for the tree, ring and collNet graphs.
+struct ncclGraphInfo {
+  int sameChannels;
+  float speedIntra;
+  float speedInter;
+  int typeIntra;
+};
+
+struct allGather3Data_t{
  int cudaCompCap;
  int fullCudaCompCap;
-  int nvlink;
  int nChannels;
-  struct {
-    int sameChannels;
-    int speedIntra;
-    int speedInter;
-    int nvlink;
-  } tree;
-  struct {
-    int sameChannels;
-    int speedIntra;
-    int speedInter;
-    int nvlink;
-  } ring;
+  struct ncclGraphInfo tree;
+  struct ncclGraphInfo ring;
+  struct ncclGraphInfo collNet;
  struct ncclTopoRanks topoRanks;
 };

+ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system);
+
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
+
 ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data);

-ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
-  struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);

 ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
-  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);

 #endif
@@ -41,24 +41,14 @@ THE SOFTWARE.
 #include <sys/stat.h>
 #include <unistd.h>
 #include "model.h"
+#include "topo.h"

 extern NodeModel *node_model;

-static ncclResult_t dummyNetDevices(int* ndev) {
-  *ndev = node_model->getnNetDevs();
-  return ncclSuccess;
-}
-
-static ncclResult_t dummyNetPciPath(int dev, char** path) {
-  node_model->getNetPciPath(dev, path);
-  return ncclSuccess;
-}
-
 ncclNet_t ncclNetDummy = {
  "IB",
  0,
-  dummyNetDevices,
-  dummyNetPciPath,
+  0,
  0,
  0,
  0,
@@ -76,24 +66,9 @@ ncclNet_t ncclNetDummy = {

 ncclNet_t* ncclNet = &ncclNetDummy;

-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
-  return ncclSuccess;
-}
-
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 int busIdToCudaDev(int64_t busId) {
-  int cudaDev;
-
-  for (cudaDev = 0; cudaDev < node_model->getnGpus(); cudaDev++) {
-    if (node_model->getGpuBusId(cudaDev) == busId)
-      break;
-  }
-
-  if (cudaDev < node_model->getnGpus())
-    return cudaDev;
-  else
-    WARN("Invalid busId %lx", busId);
-  return 0;
+  return node_model->busIdToCudaDev(busId);
 }

 /* Determine if two peers can communicate with P2P */
@@ -177,6 +152,8 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  int netDev, useGdr = 0;

  NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &netDev));
+  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, netDev, 1, &useGdr));
+
  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev,
      useGdr ? "/GDRDMA" : "");
  return ncclSuccess;
@@ -188,15 +165,8 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  int netDev, useGdr = 0;

  NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &netDev));
-  // Check if we are close enough that it makes sense to enable GDR
-  int netGdrLevel = ncclParamNetGdrLevel();
-  int distance;
-  NCCLCHECK(ncclTopoNetDistance(topo, myInfo->busId, netDev, &distance));
-  if (distance >= netGdrLevel) {
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), myInfo->busId, netDev, distance, netGdrLevel);
-  }
-  else
-    useGdr = 1;
+  NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, netDev, 0, &useGdr));
+
  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev,
      useGdr ? "/GDRDMA" : "");
  return ncclSuccess;
@@ -0,0 +1,43 @@
+<system version="1">
+  <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1a:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,46 @@
+<system version="1">
+  <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="0" sm="30" rank="0" gdr="0">
+          <xgmi target="0000:3d:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:b1:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="1" sm="30" rank="1" gdr="0">
+          <xgmi target="0000:1a:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:88:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:01:00.0" class="0x020000" link_speed="8 GT/s" link_width="4">
+      <nic>
+        <net name="eno1" dev="0" speed="10000" port="0" guid="0x0" maxconn="65536" gdr="0"/>
+      </nic>
+    </pci>
+    <nic>
+      <net name="virbr0" dev="1" speed="10000" port="0" guid="0x1" maxconn="65536" gdr="0"/>
+    </nic>
+  </cpu>
+  <cpu numaid="1" affinity="fffc00,0fffc000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:86:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:88:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="2" sm="30" rank="2" gdr="0">
+          <xgmi target="0000:3d:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:b1:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:af:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:b1:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="3" sm="30" rank="3" gdr="0">
+          <xgmi target="0000:1a:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:88:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,77 @@
+<system version="1">
+  <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1a:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="4" sm="30" rank="4" gdr="1">
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="5" sm="30" rank="5" gdr="1">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="6" sm="30" rank="6" gdr="1">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="7" sm="30" rank="7" gdr="1">
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,77 @@
+<system version="1">
+  <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+    </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="4" sm="30" rank="4" gdr="1">
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="5" sm="30" rank="5" gdr="1">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="6" sm="30" rank="6" gdr="1">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="7" sm="30" rank="7" gdr="1">
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:4c:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,82 @@
+<system version="1">
+  <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="1">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="1">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1a:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="4" sm="30" rank="4" gdr="1">
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="5" sm="30" rank="5" gdr="1">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="6" sm="30" rank="6" gdr="1">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="7" sm="30" rank="7" gdr="1">
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:4c:00.0" class="0x020700" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_1" dev="1" speed="100000" port="1" guid="0xb8599f030007053a" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,52 @@
+<system version="1">
+  <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="0">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="0">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="0">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="0">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+    </pci>
+    <pci busid="0000:5e:00.0" class="0x060400" link_speed="8 GT/s" link_width="8">
+      <pci busid="0000:60:00.0" class="0x020000" link_speed="2.5 GT/s" link_width="1">
+        <nic>
+          <net name="enp96s0f0" dev="0" speed="10000" port="0" guid="0x0" maxconn="65536" gdr="0"/>
+        </nic>
+      </pci>
+    </pci>
+    <nic>
+      <net name="veth608058c" dev="1" speed="10000" port="0" guid="0x1" maxconn="65536" gdr="0"/>
+    </nic>
+  </cpu>
+</system>
@@ -0,0 +1,107 @@
+<system version="1">
+  <cpu numaid="0" affinity="00000000,00000000,00000000,00000000,00000000,000000ff,ffff0000,00ffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:61:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:63:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="0" sm="30" rank="0" gdr="0">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:24:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="1" sm="30" rank="1" gdr="0">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:24:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:22:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:24:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="2" sm="30" rank="2" gdr="0">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="3" sm="30" rank="3" gdr="0">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:24:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="00000000,00000000,00000000,00000000,00000000,ffffff00,0000ffff,ff000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="4" sm="30" rank="4" gdr="0">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:24:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="5" sm="30" rank="5" gdr="0">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:24:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="6" sm="30" rank="6" gdr="0">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:24:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="7" sm="30" rank="7" gdr="0">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c4:00.0" class="0x020000" link_speed="2.5 GT/s" link_width="1">
+      <nic>
+        <net name="enp196s0" dev="0" speed="10000" port="0" guid="0x0" maxconn="65536" gdr="0"/>
+      </nic>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,53 @@
+<system version="1">
+  <cpu numaid="0" affinity="00ff00ff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:1a:00.0" class="0x020700" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x3a050700039f59b8" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="4" sm="30" rank="4" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="5" sm="30" rank="5" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="6" sm="30" rank="6" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="7" sm="30" rank="7" gdr="1"/>
+        </pci>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,53 @@
+<system version="1">
+  <cpu numaid="0" affinity="00ff00ff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="1"/>
+        </pci>
+      </pci>
+    </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="4" sm="30" rank="4" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="5" sm="30" rank="5" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="6" sm="30" rank="6" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="7" sm="30" rank="7" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:4c:00.0" class="0x020700" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xb8599f030007053a" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,58 @@
+<system version="1">
+  <cpu numaid="0" affinity="00ff00ff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="30" rank="0" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="30" rank="1" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="30" rank="2" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="30" rank="3" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:1a:00.0" class="0x020700" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x3a050700039f59b8" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="4" sm="30" rank="4" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="5" sm="30" rank="5" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="6" sm="30" rank="6" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="7" sm="30" rank="7" gdr="1"/>
+        </pci>
+      </pci>
+      <pci busid="0000:4c:00.0" class="0x020700" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_1" dev="1" speed="100000" port="1" guid="0xb8599f030007053a" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -29,7 +29,6 @@ THE SOFTWARE.
 #include "net.h"
 #include "graph.h"
 #include "argcheck.h"
-#include "cpuset.h"
 #include <sched.h>
 #include <fcntl.h>
 #include <unistd.h>
@@ -65,18 +64,18 @@ bool cmdOptionExists(char** begin, char** end, const std::string& option) {

 const char *model_descriptions[] = {
   "single node VEGA20 4P1H",
+  "single node VEGA20 4P1H Alt. Model",  // NOTE(review): position 1; presumably pairs with case 1 (topo_4p1h_1.xml) of the model_id switch -- confirm the table is indexed by model_id
   "single node VEGA20 4P2H",
   "single node gfx908 4P3L",
   "single node gfx908 8P6L",
-  "single node gfx908 8P6L Alt. Connection",
-  "single node 8 VEGA20 PCIe on Rome",
-  "single node gfx908 8P6L on Rome",
+  "single node 8 VEGA20 PCIe",  // NOTE(review): position 5; presumably pairs with case 5 (topo_8p_pcie.xml) -- confirm
   "4 nodes with 8 GPUs PCIe 1 NIC",
+  "4 nodes with 8 GPUs PCIe 1 NIC 2nd PLX Bridge",  // NOTE(review): position 7; presumably pairs with case 7 (4 x topo_8p_pcie_1.xml) -- confirm
   "4 nodes with 8 GPUs PCIe 2 NIC",
   "2 nodes VEGA20 4P1H",
   "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC",
-  "4 nodes 8 GPUs PCIe 2 NICs on Rome",
-  "3 nodes 8 GPUs PCIe + 1 Rome 8 GPUs PCIe + 2 nodes gfx908 4P3L",
+  "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC 2nd Hive",  // NOTE(review): position 11; presumably pairs with case 11 (4 x topo_4p2h_1.xml) -- confirm
+  "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 2 NIC",  // NOTE(review): position 12; presumably pairs with case 12 (4 x topo_4p2h_2nic.xml) -- confirm
   NULL,
 };

@@ -97,97 +96,75 @@ int main(int argc,char* argv[])
  if (mi)
    model_id = atol(mi);

-  // CPU, GPU and NIC devices on Skylake
-  CpuDevices skylake("Skylake", SKL_QPI_WIDTH, SKL_CPUPCI_WIDTH, SKL_PCI_WIDTH);
-  GpuDevices vg20_pcie(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_pcie);
-  GpuDevices vg20_4p1h(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
-  GpuDevices vg20_4p2h(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
-  GpuDevices gfx908_4p3l(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
-  GpuDevices gfx908_8p6l(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
-  GpuDevices gfx908_8p6l_1(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l_1);
-  NetDevices nic_1(1, netPciPaths_1, netGuids_1, netPciNumaIds_1);
-  NetDevices nic_1_1(1, netPciPaths_1_1, netGuids_1, netPciNumaIds_1);
-  NetDevices nic_2(2, netPciPaths_2, netGuids_2, netPciNumaIds_2);
-
-  // CPU, GPU and NIC devices on Rome
-  CpuDevices rome("Rome", ROME_QPI_WIDTH, ROME_CPUPCI_WIDTH, ROME_PCI_WIDTH);
-  GpuDevices vg20_pcie_rome(8, rome_busIds_8, rome_gpuPciPaths_8, rome_gpuPciNumaIds_8, conn_mat_rome);
-  NetDevices nic_1_rome(1, rome_netPciPaths_1, rome_netGuids_1, rom_netPciNumaIds_1);
-  NetDevices nic_2_rome(2, rome_netPciPaths_2, rome_netGuids_2, rom_netPciNumaIds_2);
-
-  // 8 GPUs PCIe 1 NIC
-  NodeModel model_8pcie_1nic(skylake, vg20_pcie, nic_1, "Skylake 8 GPUs PCIe");
-
-  // 8 GPUs PCIe 2 NIC
-  NodeModel model_8pcie_2nic(skylake, vg20_pcie, nic_2, "Skylake 8 GPUs PCIe 2 NIC");
-
-  // VEGA20 4P1H, use VEGA20 4P2H model
-  NodeModel model_vg20_4p1h_1nic(skylake, vg20_4p1h, nic_1, "Skylake VEGA20 4P1H");
-
-  // VEGA20 GPUs XGMI 4P2H 1 NIC
-  NodeModel model_vg20_4p2h_1nic(skylake, vg20_4p2h, nic_1_1, "Skylake VEGA20 4P2H");
-
-  // gfx908 4P3L
-  NodeModel model_gfx908_4p_1nic(skylake, gfx908_4p3l, nic_1, "Skylake gfx908 4P3L");
-
-  // gfx908 8P6L
-  NodeModel model_gfx908_8p_1nic(skylake, gfx908_8p6l, nic_1, "Skylake gfx908 8P6L");
-
-  // gfx908 8P6L alternative connection
-  NodeModel model_gfx908_8p_1nic_1(skylake, gfx908_8p6l_1, nic_1, "Skylake gfx908 8P6L Alt. Connection");
-
-  // 8 GPUs PCIe on Rome
-  NodeModel model_8pcie_1nic_rome(rome, vg20_pcie_rome, nic_1_rome, "Rome 8 GPUs PCIe");
-
-  // 8 GPUs PCIe 2 NICs on Rome
-  NodeModel model_8pcie_2nic_rome(rome, vg20_pcie_rome, nic_2_rome, "Rome 8 GPUs PCIe 2 NICs");
-
-  // gfx908 8P6L on Rome
-  NodeModel model_gfx908_8p_1nic_rome(rome, gfx908_8p6l, nic_1, "Rome gfx908 8P6L");
-
  NetworkModel network;
+  NodeModel* node;

  switch(model_id) {
    case 0:
-      network.AddNode(model_vg20_4p1h_1nic);
+      node = new NodeModel("topo_4p1h.xml");
+      network.AddNode(node);
      break;
    case 1:
-      network.AddNode(model_vg20_4p2h_1nic);
+      node = new NodeModel("topo_4p1h_1.xml");
+      network.AddNode(node);
      break;
    case 2:
-      network.AddNode(model_gfx908_4p_1nic);
+      node = new NodeModel("topo_4p2h.xml");
+      network.AddNode(node);
      break;
    case 3:
-      network.AddNode(model_gfx908_8p_1nic);
+      node = new NodeModel("topo_4p3l.xml");
+      network.AddNode(node);
      break;
    case 4:
-      network.AddNode(model_gfx908_8p_1nic_1);
+      node = new NodeModel("topo_8p6l.xml");
+      network.AddNode(node);
      break;
    case 5:
-      network.AddNode(model_8pcie_1nic_rome);
+      node = new NodeModel("topo_8p_pcie.xml");
+      network.AddNode(node);
      break;
    case 6:
-      network.AddNode(model_gfx908_8p_1nic_rome);
+      for (int i=0; i<4; i++) {
+        node = new NodeModel("topo_8p_pcie.xml");
+        network.AddNode(node);
+      }
      break;
    case 7:
-      for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_1nic);
+      for (int i=0; i<4; i++) {
+        node = new NodeModel("topo_8p_pcie_1.xml");
+        network.AddNode(node);
+      }
      break;
    case 8:
-      for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic);
+      for (int i=0; i<4; i++) {
+        node = new NodeModel("topo_8p_pcie_2nic.xml");
+        network.AddNode(node);
+      }
      break;
    case 9:
-      for (int i = 0; i < 2; i ++) network.AddNode(model_vg20_4p1h_1nic);
+      for (int i=0; i<2; i++) {
+        node = new NodeModel("topo_4p1h.xml");
+        network.AddNode(node);
+      }
      break;
    case 10:
-      for (int i = 0; i < 4; i ++) network.AddNode(model_vg20_4p2h_1nic);
+      for (int i=0; i<4; i++) {
+        node = new NodeModel("topo_4p2h.xml");
+        network.AddNode(node);
+      }
      break;
    case 11:
-      for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic_rome);
+      for (int i=0; i<4; i++) {
+        node = new NodeModel("topo_4p2h_1.xml");
+        network.AddNode(node);
+      }
      break;
    case 12:
-      for (int i = 0; i < 3; i ++) network.AddNode(model_8pcie_1nic);
-      network.AddNode(model_8pcie_1nic_rome);
-      for (int i = 0; i < 2; i ++) network.AddNode(model_gfx908_4p_1nic);
+      for (int i=0; i<4; i++) {
+        node = new NodeModel("topo_4p2h_2nic.xml");
+        network.AddNode(node);
+      }
      break;
    default:
      printf("Invalid model_id %d\n", model_id);
@@ -203,8 +180,8 @@ int main(int argc,char* argv[])
  for (int i = 0; i < nranks; i++) {
    node_model = network.GetNode(i);
    assert(node_model!=0);
-    printf("Rank %d: node %d (%s) GPU busId %lx\n", i, node_model->nodeId,
-      node_model->description, node_model->getGpuBusId(node_model->rankToCudaDev(i)));
+    printf("Rank %d: node %d cudaDev %d GPU busId %lx\n", i, node_model->nodeId,
+      node_model->rankToCudaDev(i), node_model->getGpuBusId(i));
  }

  NCCLCHECK(ncclCalloc(&comm, nranks));
@@ -220,21 +197,22 @@ int main(int argc,char* argv[])
    comm[i].nRanks = nranks;
    node_model = network.GetNode(i);
    assert(node_model!=0);
+    comm[i].topo = node_model->getSystem(i);
    bootstrapAllGather(&comm[i], allGather1Data);
  }

-  struct ncclTopoGraph treeGraph, ringGraph;
+  struct ncclTopoGraph treeGraph, ringGraph, collNetGraph;

  for (int i = 0; i < nranks; i++) {
    node_model = network.GetNode(i);
    assert(node_model!=0);
-    initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph);
+    initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph, collNetGraph);
  }

  for (int i = 0; i < nranks; i++) {
    node_model = network.GetNode(i);
    assert(node_model!=0);
-    initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph);
+    initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph, collNetGraph);
  }

  free(allGather3Data);
@@ -25,12 +25,19 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include "xml.h"
+#include "coll_net.h"
 #include "model.h"
 #include "utils.h"

 extern NodeModel *node_model;

 NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);  // NOTE(review): presumably defines ncclParamCollNetEnable() (cf. ncclParamCrossNic() for CrossNic) with 0 as the default -- confirm against the NCCL_PARAM macro
+NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);  // NOTE(review): presumably defines ncclParamGraphDumpFileRank() with 0 as the default -- confirm
+
+thread_local int ncclDebugNoWarn = 0;  // one copy per thread, starts at 0; NOTE(review): presumably read by logging code elsewhere -- confirm
+ncclCollNet_t* ncclCollNet = NULL;  // starts out NULL; NOTE(review): presumably NULL means no collNet implementation is loaded -- confirm

 // Get current Compute Capability
 int ncclCudaCompCap() {
@@ -43,7 +50,7 @@ ncclResult_t int64ToBusId(int64_t id, char* busId) {
  return ncclSuccess;
 }

-ncclResult_t busIdToInt64(char* busId, int64_t* id) {
+ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
  const int size = strlen(busId);
  char* hexStr;
  NCCLCHECK(ncclCalloc(&hexStr, size));
@@ -87,9 +94,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  if (ncclDebugLevel == -1) ncclDebugInit();
  if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return;
  char buffer[1024];
-  size_t len;
-  len = snprintf(buffer, sizeof(buffer),
-                   "[%d:%d] ", node_model->nodeId, node_model->currRank);
+  size_t len = 0;
+  if (node_model) len = snprintf(buffer, sizeof(buffer),
+    "[%d:%d] ", node_model->nodeId, node_model->currRank);
  va_list args;
  va_start(args, fmt);
  vsprintf(buffer+len, fmt, args);
@@ -102,6 +109,16 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  }
 }

+ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system) {
+  struct ncclXml* xml;
+  NCCLCHECK(ncclCalloc(&xml, 1));
+  NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
+  NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
+  free(xml);
+  return ncclSuccess;
+}
+
+
 ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) {
  // AllGather1 - begin
  allGather1Data[comm->rank].peerInfo.rank = comm->rank;
@@ -110,12 +127,12 @@ ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t *
  allGather1Data[comm->rank].peerInfo.hostHash = node_model->hostHash;
  allGather1Data[comm->rank].peerInfo.pidHash = node_model->pidHash;
  allGather1Data[comm->rank].peerInfo.shmDev = 0x19;
-  allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(node_model->rankToCudaDev(comm->rank));
+  allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(comm->rank);
  return ncclSuccess;
 }

-ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
-  struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph) {
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
  // We use 3 AllGathers
  // 1. { peerInfo, comm }
  // 2. ConnectTransport[nranks], ConnectValue[nranks]
@@ -147,45 +164,70 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
  // AllGather1 - end

  // Topo detection / System graph creation
-  NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
+  //NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
  // Compute paths between GPUs and NICs
  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
  // Remove inaccessible GPUs and unused NICs
  NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
  // Recompute paths after trimming
  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
-  // Compute max speed to accelerate search
-  NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
+  // Init search
+  NCCLCHECK(ncclTopoSearchInit(comm->topo));
  // Print final topology
  NCCLCHECK(ncclTopoPrint(comm->topo));

  // Get rings and trees
-  //struct ncclTopoGraph treeGraph;
-  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
-  treeGraph.crossNic = ncclParamCrossNic();
-  // We communicate only half the data between node with trees on 2 nodes.
-  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
-  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
  //struct ncclTopoGraph ringGraph;
+  ringGraph.id = 0;
  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
  ringGraph.crossNic = ncclParamCrossNic();
+  ringGraph.collNet = 0;
+  ringGraph.minChannels = 1;
+  ringGraph.maxChannels = MAXCHANNELS/2;
  NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));

+  //struct ncclTopoGraph treeGraph;
+  treeGraph.id = 1;
+  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+  treeGraph.crossNic = ncclParamCrossNic();
+  treeGraph.collNet = 0;
+  treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
+  treeGraph.maxChannels = ringGraph.nChannels;
+  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+
+  //struct ncclTopoGraph collNetGraph;
+  collNetGraph.id = 2;
+  collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+  collNetGraph.collNet = 1;
+  collNetGraph.crossNic = ncclParamCrossNic();
+  collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
+  NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
+
+  if (comm->rank == ncclParamGraphDumpFileRank()) {
+    struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
+    NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
+  }
+
  // AllGather3 - begin
  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
-  allGather3Data[rank].nvlink = treeGraph.nvlink;
  allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
  allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
  allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
-  allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
+  allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
  allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
  allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
-  allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
+  allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
+  allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
+  allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
+  allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
+  allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;

-  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
+  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
  //INFO(NCCL_GRAPH, "%d: nvlink %d nChannels %d tree.sameChannels %d tree.speedIntra %d tree.speedInter %d tree.nvlink %d ring.sameChannels %d ring.speedIntra %d ring.speedInter %d ring.nvlink %d",
  //  rank, allGather3Data[rank].nvlink, allGather3Data[rank].nChannels, allGather3Data[rank].tree.sameChannels, allGather3Data[rank].tree.speedIntra, allGather3Data[rank].tree.speedInter, allGather3Data[rank].tree.nvlink,
  //  allGather3Data[rank].ring.sameChannels, allGather3Data[rank].ring.speedIntra, allGather3Data[rank].ring.speedInter, allGather3Data[rank].ring.nvlink);
@@ -203,14 +245,8 @@ static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopo
    int ret = 0;
    NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
    if (ret) {
-      //cpu_set_t affinitySave;
-      //sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-      //int cudaDev;
-      //CUDACHECK(hipGetDevice(&cudaDev));
-      //setCpuAffinity(cudaDev);
      connector->transportComm = transportComm;
      NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
-      //sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
      return ncclSuccess;
    }
  }
@@ -265,21 +301,26 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
  return ncclSuccess;
 }

+
 ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* channel = comm->channels+channelid;
  channel->id = channelid;

  // Setup intermediate buffering
-  //channel->buffSize = ncclParamBuffsize();
+  //int buffSize = ncclParamBuffsize();
+  int cpuArch, cpuVendor, cpuModel;
+  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+  //channel->buffSize = buffSize != -2 ? buffSize :
+  //  cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;

  // Ring index to user rank table.
  //NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));

  // Communication structures with peers.
-  //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
-  for (size_t i=0; i<comm->nRanks; ++i) {
+  //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
+  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
+  for (size_t i=0; i<comm->nRanks+1; ++i) {
    channel->peers[i].send.comm = comm;
    channel->peers[i].recv.comm = comm;
  }
@@ -307,8 +348,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
  return ncclSuccess;
 }

-ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph,
-  struct ncclTopoGraph& ringGraph) {
+ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
  int rank = comm->rank;
  int nranks = comm->nRanks;
  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
@@ -329,6 +370,15 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    if (i == comm->rank) comm->node = node;
  }

+  char line[1024];
+  sprintf(line, "nodesFirstRank: ");
+  int offset = strlen(line);
+  for (int i=0; i<comm->nNodes; i++) {
+    sprintf(line+offset, "%d ", nodesFirstRank[i]);
+    offset = strlen(line);
+  }
+  INFO(NCCL_INIT, "%s", line);
+
  // Determine the minimum CUDA Compute capability of all GPUs
  int myCompCap = allGather3Data[rank].cudaCompCap;
  int minCompCap = myCompCap, maxCompCap = myCompCap;
@@ -337,9 +387,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
  }

-  comm->nvlink = 1;
-  for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;
-
  int nChannelsOrig = comm->nChannels;
  struct ncclTopoRanks** allTopoRanks;
  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
@@ -350,11 +397,15 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
    treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
    treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
-    treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
+    treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
    ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
    ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
-    ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
+    ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+    collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
+    collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
+    collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
+    collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
  }

  if (comm->nChannels < nChannelsOrig) {
@@ -366,24 +417,23 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

-  char line[1024];
-  sprintf(line, "nodesFirstRank: ");
-  int offset = strlen(line);
-  for (int i=0; i<comm->nNodes; i++) {
-    sprintf(line+offset, "%d ", nodesFirstRank[i]);
-    offset = strlen(line);
-  }
-  INFO(NCCL_INIT, "%s", line);
-
  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+  if (comm->nNodes > 1 &&
+      ncclParamCollNetEnable() == 1 &&
+      collNetSupport()) {
+    NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
+  }

  free(allTopoRanks);
  free(nodesFirstRank);
+  //free(allGather3Data);

  // AllGather3 - end

  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);

+  NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+
  line[0]='\0';
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclTree* treeUp = &comm->channels[c].treeUp;
@@ -395,26 +445,56 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  line[1023] = '\0';
  INFO(NCCL_INIT, "Trees%s", line);

-  free(rings);
-
-  // Done with AllGather1 data
-  //free(allGather1Data);
-
-  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+  // Set affinity to a CPU local to our GPU, so that all memory we allocate
+  // on the host is local.
+  cpu_set_t affinitySave;
+  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank));
+  ncclResult_t ret;

  // Connect with prev/next for each ring
  struct ncclConnect *connect;
-  NCCLCHECK(ncclCalloc(&connect, 2));
+  NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
-    NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
+    NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
    if (comm->nRanks == 1) continue;
-    NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
-    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
-    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
+    NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
+    NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
+    NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
  }
+
+  // Check if we can setup CollNet
+#if 0
+  if (comm->nNodes > 1 &&
+      ncclParamCollNetEnable() == 1 &&
+      collNetSupport()) {
+    int logicChannels = comm->nChannels/2;
+    int collNetSetupFail = 0;
+    const int recvIndex = 0;  // recv GPU index is always 0
+    const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;  // send GPU index depends on topo pattern
+    for (int c=0; c<logicChannels; c++) {
+      struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
+      struct ncclChannel* channelSend = comm->channels+c;
+      NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
+      NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
+      const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
+      const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
+      if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
+        collNetSetupFail = 1;
+      if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
+        collNetSetupFail = 1;
+    }
+    // Verify CollNet setup across ranks
+    NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
+  }
+#endif
  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
  free(connect);
+  free(rings);
+
+affinity_restore:
+  if (ret != ncclSuccess) return ret;

  return ncclSuccess;
 }