2.5.6-1 (#255)

Add LL128 Protocol. Rewrite the topology detection and tree/ring creation (#179). Improve tree performance by sending/receiving from different GPUs. Add model-based tuning to switch between the different algorithms and protocols. Rework P2P/SHM detection in containers (#155, #248). Detect duplicated devices and return an error (#231). Add tuning for GCP [ROCm/rccl commit: 299c554dcc]
2019-11-19 14:57:39 -08:00
commit 71560fd67b
@@ -25,8 +25,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)

 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
-                -gencode=arch=compute_35,code=sm_35 \
+CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
                -gencode=arch=compute_50,code=sm_50 \
                -gencode=arch=compute_60,code=sm_60 \
                -gencode=arch=compute_61,code=sm_61
@@ -46,7 +45,10 @@ endif
 CXXFLAGS   := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
 CXXFLAGS   += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
 CXXFLAGS   += -I $(CUDA_INC)
-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
+# maxrregcount must be set according to NCCL_MAX_NTHREADS (otherwise kernel launches will fail).
+# NCCL_MAX_NTHREADS : maxrregcount -> 512 : 120, 640 : 96, 768 : 80, 1024 : 60
+# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
 NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt

@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 4
-NCCL_PATCH   := 8
+NCCL_MINOR   := 5
+NCCL_PATCH   := 6
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -17,7 +17,7 @@ DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))

 PKG_TIMESTAMP  := $(shell date -R)
 ARCH           := $(shell uname -m)
-PKG_ARCH       ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
+PKG_ARCH       ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g"| sed -e "s/aarch64/arm64/g")
 PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
 ifeq ($(PKG_MULTIARCH),)
 # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
@@ -9,10 +9,11 @@ include ../makefiles/version.mk

 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
-                misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
+                misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
 		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
-                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
+                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
+                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc

 ##### lib files
 LIBNAME     := libnccl.so
@@ -94,17 +95,17 @@ $(PKGDIR)/nccl.pc : nccl.pc.in
 $(INCDIR)/%.h : %.h
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
-	cp -f $< $@
+	install -m 644 $< $@

 $(INCDIR)/nccl_%.h : include/nccl_%.h
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
-	cp -f $< $@
+	install -m 644 $< $@

 $(PKGDIR)/%.pc : %.pc
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(PKGDIR)
-	cp -f $< $@
+	install -m 644 $< $@

 $(OBJDIR)/%.o : %.cc
 	@printf "Compiling  %-35s > %s\n" $< $@
@@ -117,8 +118,8 @@ $(OBJDIR)/%.o : %.cc
 	@rm -f $(@:%.o=%.d.tmp)

 clean :
-	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
 	$(MAKE) -C collectives/device clean
+	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}

 install : lib
 	mkdir -p $(PREFIX)/lib
@@ -13,11 +13,6 @@
 #include <unistd.h>
 #include <sys/types.h>

-// Always use sockets for bootstrap
-struct bootstrapNetHandle {
-  union socketAddress connectAddr;
-};
-
 struct bootstrapNetComm {
  int fd;
 };
@@ -68,36 +63,36 @@ static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr
 /* Socket Interface Selection type */
 enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };

-static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) {
-  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
-  static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large");
+static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
+  union socketAddress* connectAddr = (union socketAddress*) netHandle;
+  static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
  // if dev >= 0, listen based on dev
  if (dev >= 0) {
-    NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr)));
+    NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
  } else if (dev == findSubnetIf) {
    // handle stores a remote address
    // need to find a local addr that is in the same network as the remote addr
    union socketAddress localAddr;
    char ifName[MAX_IF_NAME_SIZE];
-    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+    if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
      WARN("NET/Socket : No usable listening interface found");
      return ncclSystemError;
    }
    // pass the local address back
-    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
+    memcpy(connectAddr, &localAddr, sizeof(localAddr));
  } // Otherwise, handle stores a local address
  struct bootstrapNetComm* comm;
  NCCLCHECK(bootstrapNetNewComm(&comm));
-  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+  NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
  *listenComm = comm;
  return ncclSuccess;
 }

-static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) {
+static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
+  union socketAddress* connectAddr = (union socketAddress*) netHandle;
  struct bootstrapNetComm* comm;
  NCCLCHECK(bootstrapNetNewComm(&comm));
-  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
-  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
+  NCCLCHECK(connectAddress(&comm->fd, connectAddr));
  *sendComm = comm;
  return ncclSuccess;
 }
@@ -145,21 +140,12 @@ static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
  return ncclSuccess;
 }

-ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) {
-  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
-  NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str));
+ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
+  union socketAddress* connectAddr = (union socketAddress*) netHandle;
+  NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
  return ncclSuccess;
 }

-struct extId {
-  ncclNetHandle_t extHandleRoot;
-  void* extListenComm;
-  uint64_t hostHash;
-  pid_t pid;
-  int fd;
-  pthread_t boostrapThread;
-};
-
 struct extInfo {
  int rank;
  int nranks;
@@ -177,9 +163,8 @@ static ncclResult_t setFilesLimit() {
  return ncclSuccess;
 }

-static void *bootstrapRoot(void* commId) {
+static void *bootstrapRoot(void* listenComm) {
  struct extInfo info;
-  struct extId* id = (struct extId*)commId;
  ncclNetHandle_t *rankHandles = NULL;
  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
  ncclNetHandle_t zero = { 0 }; // for sanity checking
@@ -191,7 +176,7 @@ static void *bootstrapRoot(void* commId) {
  /* Receive addresses from all ranks */
  int nranks = 0, c = 0;
  do {
-    NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);

@@ -216,22 +201,22 @@ static void *bootstrapRoot(void* commId) {
    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));

    ++c;
+    TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d",  info.rank, c, nranks);
  } while (c < nranks);
-  TRACE(NCCL_INIT, "COLLECTED HANDLES");
+  TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);

  // Send the connect handle for the next rank in the AllGather ring
  for (int r=0; r<nranks; ++r) {
    int next = (r+1) % nranks;
    void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
  }
-  TRACE(NCCL_INIT, "SENT OUT HANDLES");
+  TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);

 out:
-  bootstrapNetCloseListen(id->extListenComm);
-  free(commId);
+  bootstrapNetCloseListen(listenComm);
  if (rankHandles) free(rankHandles);
  if (rankHandlesRoot) free(rankHandlesRoot);

@@ -239,31 +224,28 @@ out:
  return NULL;
 }

-ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
-  struct extId* id = (struct extId*)commId;
-  id->hostHash = getHostHash();
-  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
-  ncclUniqueId* threadIdCopy;
-  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
-  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
-  pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
+  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+  void* listenComm;
+  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
+  pthread_t thread;
+  pthread_create(&thread, NULL, bootstrapRoot, listenComm);
  return ncclSuccess;
 }

-ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
-  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
-  extId* id = (extId*)out;
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
+  static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  memset(id, 0, sizeof(ncclUniqueId));
+  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;

  char* env = getenv("NCCL_COMM_ID");
  if (env) {
-    if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) {
+    if (bootstrapNetCreateHandle(netHandle, env) != 0) {
      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
      return ncclInvalidArgument;
    }
-    id->pid = -1;
  } else {
-    id->pid = getpid();
-    NCCLCHECK(bootstrapCreateRoot(out, false));
+    NCCLCHECK(bootstrapCreateRoot(id, false));
  }

  return ncclSuccess;
@@ -286,9 +268,9 @@ struct extState {
  int dev;
 };

-ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
-  struct extId* id = (struct extId*)commId;
-  bool idFromEnv = id->pid < 0;
+ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
+  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+  bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
  struct extState* state;
  NCCLCHECK(ncclCalloc(&state, 1));
  state->rank = rank;
@@ -303,8 +285,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
  void *tmpSendComm, *tmpRecvComm;
  // Pass the remote address to listen via info
  if (idFromEnv) {
-    memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
  }
  // listen will return the local address via info (specify interface type 'findSubnetIf')
  state->dev = idFromEnv ? findSubnetIf : 0;
@@ -323,7 +305,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
  }

  // send info on my listening socket to root
-  NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
  NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));

@@ -334,7 +316,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));

-  NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
  // Accept the connect request from the previous rank in the AllGather ring
  NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));

@@ -377,7 +359,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
  struct extState* state = (struct extState*)commState;
  void* tmpSendComm;
-  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
  NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
  NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
@@ -465,3 +447,13 @@ ncclResult_t bootstrapClose(void* commState) {

  return ncclSuccess;
 }
+
+ncclResult_t bootstrapAbort(void* commState) { // Tears down the bootstrap state: closes its three comms, frees it, and always returns ncclSuccess.
+  struct extState* state = (struct extState*)commState; // NOTE(review): dereferenced unconditionally below; presumably callers never pass NULL - confirm.
+  bootstrapNetCloseListen(state->extBstrapListenComm); // The return values of the three close calls are not checked.
+  bootstrapNetCloseSend(state->extBstrapRingSendComm);
+  bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
+  free(state->peerBstrapHandles);
+  free(state); // commState is dangling for the caller after this point.
+  return ncclSuccess;
+}
@@ -5,7 +5,6 @@
 ************************************************************************/

 #include "enqueue.h"
-#include "collectives.h"

 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
@@ -68,4 +68,4 @@ $(DEVOBJ) : $(LIBOBJ)
 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@

 clean:
-	rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
+	rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
@@ -11,7 +11,7 @@
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -19,15 +19,15 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
  const ssize_t size = args->N;
  const int nranks = comm->nRanks;
  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

-  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -129,3 +129,67 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T> // Ring AllGather built on ncclLL128Primitives; UNUSED is not referenced in the body.
+__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+  const ssize_t size = args->N;
+  //const int rank = comm->rank;
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  // minChunkSize is the granularity the per-iteration chunk is rounded up to in the loop below. We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+  const ssize_t loopSize = args->nChannels*chunkSize;
+
+  // Typed views of args->ThisInput and args->ThisOutput
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); // Remaining elements split across channels, rounded up to a multiple of minChunkSize, never above the current chunkSize.
+
+    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+    /////////////// begin AllGather steps ///////////////
+    ssize_t offset;
+    int nelem = min(chunkSize, size-chunkOffset);
+    int rankDest;
+
+    // step 0: push data to next GPU. The source is the input chunk; its slot in the output is devUserRanks[0]*size + chunkOffset.
+    rankDest = ring->devUserRanks[0];
+    offset = chunkOffset + rankDest * size;
+
+    if (thisInput + chunkOffset == thisOutput + offset) { // In place
+      LLprims.send(thisInput+chunkOffset, nelem);
+    } else {
+      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+    }
+
+    // k-2 steps (k = nranks): copy to next GPU, walking devUserRanks backwards from index nranks-1 down to 2
+    for (int j=1; j<nranks-1; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + rankDest * size;
+
+      LLprims.recvCopySend(thisOutput+offset, nelem);
+    }
+
+    // step k-1: final store into the slot of devUserRanks[1]; this step only receives
+    rankDest = ring->devUserRanks[1];
+    offset = chunkOffset + rankDest * size;
+
+    LLprims.recv(thisOutput+offset, nelem);
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { } // Empty body: this Tree/LL128 AllGather variant does nothing when called.
@@ -11,7 +11,7 @@
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -27,7 +27,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
@@ -85,23 +85,28 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclTree* tree = &channel->tree;
  const ssize_t size = args->N;
  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = args->lastChunkSize;
+  int chunkSize = args->lastChunkSize;
+  const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
  const ssize_t loopSize = args->nChannels*chunkSize;

+  if (loopSize > size) {
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+  }
+
  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  do {
+    struct ncclTree* tree = &channel->treeUp;
    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      // Up
      ssize_t offset = gridOffset + bid*chunkSize;
@@ -117,8 +122,9 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
  } while(0);

  do {
+    struct ncclTree* tree = &channel->treeDn;
    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      // Down
      ssize_t offset = gridOffset + bid*chunkSize;
@@ -149,6 +155,8 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
  //const int rank = comm->rank;
  const int nranks = comm->nRanks;
  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
+
  const ssize_t loopSize = args->nChannels*nranks*chunkSize;

  // Compute pointers
@@ -156,10 +164,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->lastChunkSize;
-    }
-    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);

    /////////////// begin AllReduce steps ///////////////
    ssize_t offset;
@@ -168,7 +173,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {

    // step 0: push data to next GPU
    slice = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + slice * chunkSize;
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
    nelem = min(chunkSize, size-offset);

    LLprims.send(thisInput+offset, nelem);
@@ -176,7 +181,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
    // k-2 steps: reduce and copy to next GPU
    for (int j=2; j<nranks; ++j) {
      slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
      nelem = min(chunkSize, size-offset);

      LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -185,7 +190,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
    // step k-1: reduce this buffer and data, which will produce the final
    // result that we store in this data and push to the next GPU
    slice = ring->devUserRanks[0];
-    offset = chunkOffset + slice * chunkSize;
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
    nelem = min(chunkSize, size-offset);

    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -193,7 +198,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
    // k-2 steps: copy to next GPU
    for (int j=1; j<nranks-1; ++j) {
      slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
      nelem = min(chunkSize, size-offset);

      LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -201,7 +206,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {

    // Make final copy from buffer to dest.
    slice = ring->devUserRanks[1];
-    offset = chunkOffset + slice * chunkSize;
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
    nelem = min(chunkSize, size-offset);

    // Here we need to copy from buffer to this output.
@@ -216,16 +221,21 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclTree* tree = &channel->tree;
  const ssize_t size = args->N;
  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
  const ssize_t loopSize = args->nChannels*chunkSize;

+  if (loopSize > size) {
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+  }
+
  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  do {
+    struct ncclTree* tree = &channel->treeUp;
    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -243,6 +253,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
  } while(0);

  do {
+    struct ncclTree* tree = &channel->treeDn;
    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -259,3 +270,141 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
    }
  } while(0);
 }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T> // Ring AllReduce built on ncclLL128Primitives; UNUSED is not referenced in the body.
+__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+  const ssize_t size = args->N;
+  //const int rank = comm->rank;
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  // minChunkSize is the granularity the per-iteration chunk is rounded up to in the loop below. We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+  const ssize_t loopSize = args->nChannels*nranks*chunkSize;
+
+  // Typed views of args->ThisInput and args->ThisOutput
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize); // Remaining elements split across channels and ranks, rounded up to a multiple of minChunkSize, never above the current chunkSize.
+
+    /////////////// begin AllReduce steps ///////////////
+    ssize_t offset;
+    int nelem;
+    int slice;
+
+    // step 0: push data to next GPU. Every offset below addresses chunk `bid` of slice `slice`, with nChannels chunks per slice.
+    slice = ring->devUserRanks[nranks-1];
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+    nelem = min(chunkSize, size-offset);
+
+    LLprims.send(thisInput+offset, nelem);
+
+    // k-2 steps (k = nranks): reduce and copy to next GPU
+    for (int j=2; j<nranks; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.recvReduceSend(thisInput+offset, nelem);
+    }
+
+    // step k-1: reduce this buffer and data, which will produce the final
+    // result that we store in this data and push to the next GPU
+    slice = ring->devUserRanks[0];
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+    nelem = min(chunkSize, size-offset);
+
+    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+
+    // k-2 steps: copy to next GPU (only thisOutput is passed from here on)
+    for (int j=1; j<nranks-1; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.recvCopySend(thisOutput+offset, nelem);
+    }
+
+    // Make final copy from buffer to dest.
+    slice = ring->devUserRanks[1];
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+    nelem = min(chunkSize, size-offset);
+
+    // Here we need to copy from buffer to this output; this step only receives.
+    LLprims.recv(thisOutput+offset, nelem);
+  }
+}
+
+template<int UNUSED, class FUNC, typename T> // Tree AllReduce built on ncclLL128Primitives, using the channel's treeUp and treeDn; UNUSED is not referenced in the body.
+__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = args->nThreads;
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclTree* treeUp = &channel->treeUp;
+  struct ncclTree* treeDn = &channel->treeDn;
+  const ssize_t size = args->N;
+  ssize_t chunkSize = args->lastChunkSize;
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
+  const ssize_t loopSize = args->nChannels*chunkSize;
+  int nthreadsSplit = NCCL_LL128_SPLIT(nthreads); // In the else branch below, threads [0, nthreadsSplit) run the reduce (up) loop and the remaining threads run the broadcast (down) loop.
+
+  if (loopSize > size) { // loopSize keeps its original value, so in this case each loop below runs exactly one iteration with the smaller chunkSize.
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+  }
+
+  // Typed views of args->ThisInput and args->ThisOutput
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  if (treeUp->up == -1) { // NOTE(review): presumably -1 means "no parent", i.e. this rank is the root of the up tree - confirm.
+    // ReduceAndBroadcast : max number of recv is 3, max number of send is 3. All nthreads threads take part; receives come from treeUp->down, sends go to treeDn->down.
+    ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+    }
+  } else {
+    if (tid < nthreadsSplit) {
+      // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+      ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Up
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (treeUp->down[0] == -1) { // NOTE(review): presumably "no children in the up tree", so there is nothing to receive - confirm.
+          LLprims.send(thisInput+offset, nelem);
+        } else {
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
+    } else {
+      // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local). Thread ids are rebased to start at 0 for this group.
+      ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Down
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (treeDn->down[0] == -1) { // NOTE(review): presumably "no children in the down tree", so there is nothing to forward - confirm.
+          LLprims.recv(thisOutput+offset, nelem);
+        } else {
+          LLprims.recvCopySend(thisOutput+offset, nelem);
+        }
+      }
+    }
+  }
+}
@@ -11,7 +11,7 @@
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -29,7 +29,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -100,3 +100,51 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T> // Ring broadcast built on the LL128 primitives; UNUSED is not referenced in the body.
+__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) { // args supplies bid, nThreads, comm, N, root, opCount and the input/output pointers read below.
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x; // Channel is picked by blockIdx.x; bid (from args) is only used for data offsets below.
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); // One receive peer (ring->prev), one send peer (ring->next).
+
+  const ssize_t size = args->N;
+  const int rank = ring->devUserRanks[0]; // Treated as this rank's id (per the variable name).
+  const int nextRank = ring->devUserRanks[1]; // Treated as the next rank in the ring (per the variable name).
+  const int root = args->root;
+
+  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); // Chunk size in elements of T; presumably the LL128 data capacity for nthreads threads - confirm against prims_ll128.h.
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); // Granularity (in elements of T) used to round the tail chunk in the loop.
+
+  const ssize_t loopSize = args->nChannels*chunkSize; // Elements advanced per outer iteration across all channels.
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); // Tail handling: split the remainder across channels, round up to a multiple of minChunkSize, never exceed the current chunkSize.
+    ssize_t offset = gridOffset + bid*chunkSize;
+
+    int nelem = min(chunkSize, size-offset); // Can be <= 0 when offset >= size; the value is passed to the primitive unchanged.
+    if (rank == root) { // Root: injects the data into the ring.
+      if (thisInput == thisOutput) { // In-place broadcast: no local copy is needed.
+        LLprims.send(thisInput+offset, nelem);
+      } else {
+        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+      }
+    } else if (nextRank == root) { // Rank whose next peer is the root: receive only, do not forward.
+      LLprims.recv(thisOutput + offset, nelem);
+    } else { // Any other rank: receive, store locally and forward.
+      LLprims.recvCopySend(thisOutput + offset, nelem);
+    }
+  }
+}
+
+template<int UNUSED, class FUNC, typename T> // Tree variant of the LL128 broadcast kernel.
+__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { } // Empty body: this kernel does nothing when called.
@@ -7,9 +7,8 @@
 #ifndef NCCL_DEVICE_COMMON_H_
 #define NCCL_DEVICE_COMMON_H_

-#include "../collectives.h"
+#include "collectives.h"
 #include "devcomm.h"
-#include "nccl.h"

 // Exit If Abort Barrier across CTA: make sure all threads exit consistently
 // Each thread sets a predicate to true if abort == 1
@@ -31,17 +30,19 @@ extern __device__ ncclKern_t ncclFuncs[];
 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
  int* d = (int*)dst;
  int* s = (int*)src;
-  // When aggregation is effective, if some threads have aborted inside the LL kernel,
-  // make sure the rest of the threads abort as well
-  exitIfAbortBarrier(0);
  for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
-  __syncthreads();
 }
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) { // Copies the next collective descriptor from hostColl into localColl after an abort check, then marks the source slot inactive.
+  // Check whether the last operation was aborted and make sure all threads exit
+  int abort = tid == 0 ? *(comm->abortFlag) : 0; // Only thread 0 reads the abort flag; every other thread contributes 0.
+  exitIfAbortBarrier(abort); // Called by every thread with its own abort value.
  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
+  __syncthreads(); // Block-wide sync after the cooperative copy and before thread 0 clears hostColl->active.
  if (tid == 0) hostColl->active = 0;
 }

+extern __device__ volatile uint64_t* ncclShmem;
+
 /* Functions for aggregation case */
 #define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
 __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
@@ -51,10 +52,11 @@ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
 #if NCCL_OP == 0
 /* Kernels with the first operation inlined */
 #define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
-__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
 __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
  int tid = threadIdx.x; \
  int bid = blockIdx.x; \
+  __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
+  ncclShmem = shmem; \
  __shared__ struct ncclColl localColl; \
 \
  struct ncclDevComm* comm = firstColl.args.comm; \
@@ -65,7 +67,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
    c = &firstColl; \
  } else { \
    c = &localColl; \
-    load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
+    load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \
  } \
  while (1) { \
    if (tid < c->args.nThreads) { \
@@ -84,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
 \
    /* Load next collective operation*/ \
    c = &localColl; /* for bid 0 */ \
-    load_coll(c, channel->devCollectives+nextIndex, tid); \
+    load_coll(c, channel->devCollectives+nextIndex, tid, comm); \
  } \
 }
 #else
@@ -93,13 +95,14 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \

 // Only generate inline kernels for LL
 #define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
-  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
  IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
+  IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \

 #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
-  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
-  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
+  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
+  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)

 #if NCCL_TYPE == 0
 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
@@ -263,8 +263,6 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread
  }
 }

-#define WARP_SIZE 32
-
 template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
 __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
@@ -8,13 +8,16 @@
 #include "collectives.h"
 #include "common.h"

+__device__ volatile uint64_t* ncclShmem;
+
 #define NCCL_FUNC5(coll, op, dtype) \
-  NCCL_COLL_NAME(coll, op, dtype), \
-  NCCL_COLL_NAME(coll##LL, op, dtype)
+  NCCL_COLL_NAME(coll##LL, op, dtype), \
+  NCCL_COLL_NAME(coll##LL128, op, dtype), \
+  NCCL_COLL_NAME(coll, op, dtype)

 #define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_FUNC5(coll##Ring, op, dtype), \
-  NCCL_FUNC5(coll##Tree, op, dtype)
+  NCCL_FUNC5(coll##Tree, op, dtype), \
+  NCCL_FUNC5(coll##Ring, op, dtype)

 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -50,7 +53,7 @@
  NCCL_FUNCS3B(coll, copy), \
  NCCL_FUNCS3B(coll, copy)

-// Must be consistent with ncclColl_t
+// Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
  NCCL_FUNCS2B(ncclBroadcast), \
  NCCL_FUNCS2A(ncclReduce), \
@@ -59,7 +62,7 @@
  NCCL_FUNCS2A(ncclAllReduce) }

 // Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef OP128_H_
+#define OP128_H_
+
+inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { // Reads two 64-bit values from ptr into v0 and v1 with a single volatile global vector load.
+  asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
+      : "=l"(v0), "=l"(v1) : "l"(ptr)); // NOTE(review): a v2.u64 access presumably requires ptr to be 16-byte aligned - confirm in the PTX ISA.
+}
+
+inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { // Writes v0 and v1 to ptr with a single volatile global vector store.
+  asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
+      :: "l"(v0), "l"(v1), "l"(ptr)); // NOTE(review): a v2.u64 access presumably requires ptr to be 16-byte aligned - confirm in the PTX ISA.
+}
+
+inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { // Converts a generic-address pointer into a shared-state-space address via cvta.to.shared.
+  uint64_t* shmemAsmPtr;
+  asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
+  return shmemAsmPtr; // The result is the kind of address the ld/st.volatile.shared helpers in this header take as shmemAsmPtr.
+}
+
+inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { // Reads two 64-bit values from a shared-space address into v0 and v1 with one volatile vector load.
+  asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
+      : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); // The instruction is ld.*.shared, so shmemAsmPtr must be a shared-space address (see shmemCvtPtr), not a generic pointer.
+}
+
+inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { // Writes v0 and v1 to a shared-space address with one volatile vector store.
+  asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
+      :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); // The instruction is st.*.shared, so shmemAsmPtr must be a shared-space address (see shmemCvtPtr), not a generic pointer.
+}
+
+#endif
@@ -37,15 +37,27 @@ class ncclPrimitives {
 private:
  const int tid;
  const int nthreads;
+  const int wid;
+  const int stepSize;
  int nrecv = 0;
  int nsend = 0;
-  const int stepSize;
-  struct ncclConnInfo* recvConn[NRECV];
-  struct ncclConnInfo* sendConn[NSEND];
-  volatile uint64_t* waitPtr;
+  struct ncclConnInfo* recvConn = NULL;
+  volatile uint64_t* recvConnHeadPtr = NULL;
+  uint64_t recvConnHead;
+  volatile uint64_t* recvConnTailPtr = NULL;
+  uint64_t recvConnTail;
+  uint64_t recvConnTailCache; // Cache last seen value
+
+  struct ncclConnInfo* sendConn = NULL;
+  volatile int* sendConnFifoPtr = NULL;
+  volatile uint64_t* sendConnTailPtr = NULL;
+  uint64_t sendConnTail;
+  volatile uint64_t* sendConnHeadPtr = NULL;
+  uint64_t sendConnHead;
+  uint64_t sendConnHeadCache; // Cache last seen value
+
  uint64_t recvStep[NRECV];
  uint64_t sendStep[NSEND];
-  uint64_t sendConnHead[NSEND];
  const T* recvDirectBuff[NRECV];
  T* sendDirectBuff[NSEND];
  const T* recvBuff[NRECV];
@@ -60,15 +72,18 @@ class ncclPrimitives {
  inline __device__ void barrier() {
    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
  }
+  inline __device__ void subBarrier() { // Barrier for nthreads-WARP_SIZE threads; uses barrier id 2, whereas barrier() uses id 1 with all nthreads.
+    asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE));
+  }

  uint32_t mismatch = 0;
  const uint64_t opCount;

-  inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+  inline __device__ void checkMismatch(struct ncclConnInfo* conn) { // Flags a mismatch on first sight of the peer's opCount being ahead; a second call while still flagged reports ncclDevAssertedMismatch.
    if (mismatch) {
      // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
      *(comm->fatalDevError) = ncclDevAssertedMismatch;
-    } else if (remoteOpCount && *remoteOpCount > opCount) {
+    } else if (conn && *conn->opCountRem > opCount) { // A NULL conn is tolerated and skips the check.
      mismatch += 1;
    }
  }
@@ -76,49 +91,55 @@ class ncclPrimitives {
  uint32_t spins = 0;
  uint32_t abort = 0;

-  inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+  inline __device__ int checkAbort(int i, int send) { // Called from spin loops; every SPINS_BEFORE_CHECK_ABORT calls it re-reads the abort flag and returns the current abort value. send selects sendConn vs recvConn for the mismatch check.
    spins++;
-    if (spins == SPINS_BEFORE_CHECK_ABORT) {
+    if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { // Once abort is set it is never re-read or cleared here.
      abort = *(comm->abortFlag);
-      checkMismatch(remoteOpCount);
+      if (wid == i) checkMismatch(send ? sendConn : recvConn); // Only the thread whose wid equals i runs the mismatch check.
      spins = 0;
    }
    return abort;
  }

-  inline __device__ void waitRecv(int i) {
+  inline __device__ void waitSend(int nbytes) { // Spins until the NCCL_STEPS-deep window has room for SLICESTEPS more steps, records nbytes in the size FIFO if present, then advances the local head.
    spins = 0;
    mismatch = 0;
+    if (sendConnHeadPtr) { // Only threads that were given a head pointer in loadSendSync() wait; for all others this is a no-op.
+      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) { // Uses the cached head; the volatile pointer is only re-read while the cache says the window is full.
+        sendConnHeadCache = *sendConnHeadPtr;
+        if (checkAbort(wid, 1)) break; // Stop waiting once an abort has been observed.
+      }
+      if (sendConnFifoPtr) {
+        sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes; // Slot index is the current head step modulo NCCL_STEPS.
+      }
+      sendConnHead += SLICESTEPS;
+    }
+  }
+
+  inline __device__ void waitRecv() { // Spins until the observed tail shows SLICESTEPS more steps beyond the local tail, then advances the local tail.
+    spins = 0;
+    mismatch = 0;
+    if (recvConnTailPtr) { // Only threads that were given a tail pointer in loadRecvSync() wait; for all others this is a no-op.
+      while (recvConnTailCache < recvConnTail + SLICESTEPS) { // Uses the cached tail; the volatile pointer is only re-read while the cache is behind.
+        recvConnTailCache = *recvConnTailPtr;
+        if (checkAbort(wid, 0)) break; // Stop waiting once an abort has been observed.
+      }
+      recvConnTail += SLICESTEPS;
+    }
+  }
+
+  inline __device__ void incRecv(int i) { // Advances the step counter of receive peer i by SLICESTEPS; waiting is done separately in waitRecv().
    recvStep[i] += SLICESTEPS;
-    if (tid == i) {
-      while (*(waitPtr) < recvStep[i]) {
-        if (checkAbort(recvConn[i]->opCountRem)) break;
-      }
-    }
+  }
+  inline __device__ void postRecv() { // Advances the local receive head by SLICESTEPS and writes it through recvConnHeadPtr; a no-op when that pointer is NULL.
+    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += SLICESTEPS;
  }

-  inline __device__ void waitSend(int i) {
-    spins = 0;
-    mismatch = 0;
+  inline __device__ void incSend(int i) { // Advances the step counter of send peer i by SLICESTEPS; waiting is done separately in waitSend().
    sendStep[i] += SLICESTEPS;
-    if (tid == WARP_SIZE+i) {
-      while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
-        sendConnHead[i] = *waitPtr;
-        if (checkAbort(sendConn[i]->opCountRem)) break;
-      }
-    }
  }
-
-  inline __device__ void postRecv(int i) {
-    *(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
-  }
-
-  inline __device__ void postSend(int i) {
-    *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
-  }
-
-  inline __device__ void postSendSize(int i, int size) {
-    if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
+  inline __device__ void postSend() { // Advances the local send tail by SLICESTEPS and writes it through sendConnTailPtr; a no-op when that pointer is NULL.
+    if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS;
  }

  template <int DIRECTRECV>
@@ -131,11 +152,22 @@ class ncclPrimitives {
    return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
  }

+  template <int DIRECTRECV> // Compile-time switch: non-zero enables the direct-receive case.
+  inline __device__ int directRecvInc(int i, int directInc, int sliceInc) { // Picks the pointer increment for receive source i.
+    return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc; // directInc only when DIRECTRECV is set and peer i has a direct buffer; otherwise sliceInc.
+  }
+
+  template <int DIRECTSEND> // Compile-time switch: non-zero enables the direct-send case.
+  inline __device__ int directSendInc(int i, int directInc, int sliceInc) { // Picks the pointer increment for send destination i.
+    return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc; // directInc only when DIRECTSEND is set and peer i has a direct buffer; otherwise sliceInc.
+  }
+
  template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
  inline __device__ void
  GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
    int offset = 0;
-    int sliceSize = stepSize * SLICESTEPS;
+    int sliceSize = stepSize*SLICESTEPS;
+    int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);

    const T* srcs[RECV*NRECV+SRC];
    srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
@@ -151,101 +183,126 @@ class ncclPrimitives {
      for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
    }

-    #pragma unroll 1
+    bool syncThread = tid >= nthreads-WARP_SIZE;
+
+    #pragma unroll
    for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
-      int realSize = max(0, min(sliceSize, nelem-offset));
-      if (tid < nthreads) {
-        FOR_SEND(waitSend);
-        FOR_RECV(waitRecv);
+      int realSize = max(0, min(dataSize, nelem-offset));
+      if (!syncThread) {
+        if (SEND) waitSend(realSize*sizeof(T));
+        if (RECV) waitRecv();
        if (realSize > 0) {
-          barrier();
+          subBarrier();
          if (DIRECTRECV && recvDirectBuff[0]) {
            // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
            if (SEND) {
-              ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+              ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize);
            }
          } else {
-            ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+            ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
          }
        }
-        exitIfAbortBarrier(abort);
-      } else {
-        exitIfAbortBarrier(abort);
-        FOR_SEND(postSendSize, realSize*sizeof(T));
-        if (SEND) __threadfence_system();
-        FOR_SEND(postSend);
-        FOR_RECV(postRecv);
      }
-      for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
-      for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
-      offset += sliceSize;
+      barrier();
+      FOR_SEND(incSend);
+      FOR_RECV(incRecv);
+      if (syncThread) {
+        if (SEND) {
+          if (realSize > 0 && wid == 0) __threadfence_system();
+          __syncwarp();
+          postSend();
+        }
+        if (RECV) postRecv();
+      }
+      srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
+      for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
+      dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
+      for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
+      offset += realSize;
    }
  }

  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    recvConn[i] = conn;
-    recvBuff[i] = (const T*)recvConn[i]->buff;
-    recvStep[i] = recvConn[i]->step;
+    recvBuff[i] = (const T*)conn->buff; // Every thread records the buffer and step of receive peer i.
+    recvStep[i] = conn->step;
    recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
-    // Return credits in case we rounded up.
-    if (tid == nthreads) *recvConn[i]->head = recvStep[i];
-    if (tid == i) {
-      waitPtr = recvConn[i]->tail;
-      *(recvConn[i]->opCountLoc) = opCount;
-    }
    recvDirectBuff[i] = NULL;
-    if (directBuff && recvConn[i]->direct) {
+    if (directBuff && conn->direct) { // Direct receive is used only when the caller supplied a buffer and the connection is marked direct.
      recvDirectBuff[i] = directBuff;
-      if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
+      if (tid == 0) *conn->ptrExchange = directBuff; // Thread 0 publishes the buffer address through ptrExchange.
    }
+    if (wid == i) recvConn = conn; // Threads whose wid equals i keep this connection as their single recvConn.
+    if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
    nrecv++;
  }
+  __device__ __forceinline__ void loadRecvSync() { // Gives selected threads the volatile head/tail pointers they use later in waitRecv() and postRecv().
+    if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) { // Threads with tid in [WARP_SIZE, 2*WARP_SIZE) and wid < nrecv become the tail waiters.
+      recvConnTailPtr = recvConn->tail;
+      recvConnTailCache = *recvConnTailPtr;
+    }
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) { // Threads among the last WARP_SIZE with wid < nrecv become the head posters.
+      recvConnHeadPtr = recvConn->head;
+      // Return credits in case we rounded up.
+      *recvConnHeadPtr = recvConnHead;
+      // Update opCount in case we skipped some operations
+      *(recvConn->opCountLoc) = opCount;
+    }
+  }

  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    sendConn[i] = conn;
-    sendBuff[i] = (T*)sendConn[i]->buff;
-    sendStep[i] = sendConn[i]->step;
+    sendBuff[i] = (T*)conn->buff; // Every thread records the buffer and step of send peer i.
+    sendStep[i] = conn->step;
    sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
-    if (tid == WARP_SIZE+i) {
-      waitPtr = sendConn[i]->head;
-      sendConnHead[i] = *waitPtr;
-      *(sendConn[i]->opCountLoc) = opCount;
-    }
    sendDirectBuff[i] = NULL;
-    if (directBuff && sendConn[i]->direct) {
-      void* volatile* ptr = sendConn[i]->ptrExchange;
+    if (directBuff && conn->direct) { // Direct send is used only when the caller supplied a buffer and the connection is marked direct.
+      void* volatile* ptr = conn->ptrExchange; // The loop below spins until a non-NULL address appears in ptrExchange.
      while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
-      __syncthreads();
+      barrier(); // All nthreads have read the address before thread 0 resets the slot to NULL.
      if (tid == 0) *ptr = NULL;
    }
+    if (wid == i) sendConn = conn; // Threads whose wid equals i keep this connection as their single sendConn.
+    if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
    nsend++;
  }
-
-  __device__ __forceinline__ void saveRecvConn(int i) {
-    if (tid == i) {
-      recvConn[i]->step = recvStep[i];
-      __threadfence_system();
-      *(recvConn[i]->opCountLoc) += 1;
+  __device__ __forceinline__ void loadSendSync() { // Gives selected threads the volatile head/FIFO/tail pointers they use later in waitSend() and postSend().
+    if (tid < nsend) { // Threads 0..nsend-1 become the head waiters and size-FIFO writers.
+      sendConnHeadPtr = sendConn->head;
+      sendConnHeadCache = *sendConnHeadPtr;
+      sendConnFifoPtr = sendConn->fifo;
+      *(sendConn->opCountLoc) = opCount;
+    }
+    if (tid >= nthreads-WARP_SIZE && wid<nsend) { // Threads among the last WARP_SIZE with wid < nsend become the tail posters.
+      sendConnTailPtr = sendConn->tail;
    }
  }

-  __device__ __forceinline__ void saveSendConn(int i) {
-    if (tid == WARP_SIZE+i) {
-      sendConn[i]->step = sendStep[i];
+  __device__ __forceinline__ void saveRecvSync() { // Writes the final receive head back to the connection and sets opCountLoc to opCount+1.
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) { // Same thread selection as the head posters in loadRecvSync().
+      recvConn->step = recvConnHead;
+      *(recvConn->opCountLoc) = opCount+1;
+      __threadfence_system(); // System-wide fence issued after both stores.
+    }
+  }
+
+  __device__ __forceinline__ void saveSendSync() { // Writes the final send head back to the connection and sets opCountLoc to opCount+1.
+    if (tid < nsend) { // Same thread selection as the head waiters in loadSendSync().
+      sendConn->step = sendConnHead;
+      *(sendConn->opCountLoc) = opCount+1; // Both stores precede the __threadfence_system() that follows.
      __threadfence_system();
-      *(sendConn[i]->opCountLoc) += 1;
    }
  }

 public:
  __device__ __forceinline__
  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
-    : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
-    // Make sure step is updated before we read it
-    __syncthreads();
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) { // wid is tid%WARP_SIZE, i.e. the thread's index within its warp.
+    // Make sure step is updated before we read it.
+    barrier(); // Barrier id 1 over the nthreads passed in, as defined by barrier() in this class.

    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+    loadRecvSync(); // Must follow the loops above: it reads recvConn and nrecv, which loadRecvConn() sets.
+    loadSendSync(); // Must follow the loops above: it reads sendConn and nsend, which loadSendConn() sets.
  }

  __device__ __forceinline__ void
@@ -305,267 +362,13 @@ class ncclPrimitives {
  }

  __device__ __forceinline__ ~ncclPrimitives() {
-    // Save steps for next collective. Have thread 0 do it to be compatible
-    // with the way LL works.
-    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
-    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
-  }
-};
-
-template <typename T, class FUNC, int NRECV, int NSEND>
-class ncclLLPrimitives {
- private:
-  const int tid;
-  const int nthreads;
-  int nrecv = 0;
-  int nsend = 0;
-  struct ncclConnInfo* recvConn[NRECV];
-  struct ncclConnInfo* sendConn[NSEND];
-  volatile uint64_t* waitPtr;
-  volatile uint64_t* postPtr;
-  volatile int* fifoPtr;
-  uint64_t recvStep[NRECV];
-  uint64_t sendStep[NSEND];
-  uint64_t sendConnHead;
-  union ncclLLFifoLine* recvBuff[NRECV];
-  union ncclLLFifoLine* sendBuff[NSEND];
-  struct ncclDevComm* comm;
-
-  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
-  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
-  inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
-  inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
-  inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
-  inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
-
-  // Exit If Abort Barrier : make sure all threads exit consistently
-  // Each thread sets a predicate to true if val == 1
-  // all CTA's threads enter the barrier and do a popc on their predicates being True
-  // If any of the thread's predicate was True, all the threads call exit()
-  inline __device__ void exitIfAbortLocalBarrier() {
-    uint32_t popc;
-    asm ("{");
-    asm volatile ("   .reg .pred barr_pred;");
-    asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
-    asm volatile ("   bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
-    asm ("}");
-    if (popc) {
-      // Make sure threads not participating in the operation get the abort and all threads exit
-      exitIfAbortBarrier(1);
-    }
-  }
-
-  inline __device__ void barrier() {
-    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
-  }
-
-  uint32_t mismatch = 0;
-  const uint64_t opCount;
-
-  inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
-    if (mismatch > 20) {
-      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
-      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
-      *(comm->fatalDevError) = ncclDevSuspectedMismatch;
-    } else if (remoteOpCount && *remoteOpCount > opCount) {
-      mismatch += 1;
-    }
-  }
-
-  uint32_t spins = 0;
-  uint32_t abort = 0;
-
-  inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
-    spins++;
-    if (spins == SPINS_BEFORE_CHECK_ABORT) {
-      abort = *(comm->abortFlag);
-      checkMismatch(remoteOpCount);
-      spins = 0;
-    }
-    return abort;
-  }
-
-  inline __device__ void waitSend(int i, int nbytes) {
-    spins = 0;
-    mismatch = 0;
-    if (tid == WARP_SIZE+i) {
-      while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
-        sendConnHead = *waitPtr;
-        if (checkAbort(sendConn[i]->opCountRem)) break;
-      }
-      if (fifoPtr) {
-        int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
-        fifoPtr[sendStep[i]%NCCL_STEPS] = size;
-      }
-    }
-  }
-
-  inline __device__ void postRecv(int i) {
-    recvStep[i]++;
-    if (tid == i) *postPtr = recvStep[i];
-  }
-
-  inline __device__ void postSend(int i, int offset) {
-    // LL Cleanup : write all flags in the slice to make sure we don't have
-    // data corruption when flag loops over.
-    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
-      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
-    }
-    sendStep[i]++;
-  }
-
-  __device__ uint64_t readLL(int i, int offset) {
-    union ncclLLFifoLine* src = recvPtr(i) + offset;
-    uint32_t flag = recvFlag(i);
-    uint32_t data1, flag1, data2, flag2;
-    spins = 0;
-    mismatch = 0;
-    do {
-      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
-      if (checkAbort(recvConn[i]->opCountRem)) break;
-    } while ((flag1 != flag) || (flag2 != flag));
-    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
-    return val64;
-  }
-
-  __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
-    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
-  }
-
-  // Using memcpy handles misaligned pointers.
-  __device__ uint64_t readAL(uint64_t* src) {
-    uint64_t val;
-    memcpy((char*)&val, (char*)src, sizeof(uint64_t));
-    return val;
-  }
-
-  __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
-    memcpy((char*)dst, (char*)&val, nbytes);
-  }
-
-  template <int RECV, int SEND, int SRC, int DST>
-  __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
-    uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
-    FOR_SEND(waitSend, nbytes*2);
-    barrier();
-    uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
-    uint64_t* srcPack = (uint64_t*)srcPtr;
-    uint64_t* dstPack = (uint64_t*)dstPtr;
-    int offset = tid;
-    // Do multiples of 64 bits
-    #pragma unroll 2
-    for (; offset<npack; offset+=nthreads) {
-      // Recv : local, then intra-node, then inter-node
-      uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
-      if (RECV) {
-        if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
-        for (int i=1; i<NRECV && i<nrecv; i++) {
-          val = MULTI<FUNC, T>()(readLL(i, offset), val);
-        }
-      }
-
-      // Send : inter-node, then intra-node, then local
-      if (SEND) {
-        for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
-        storeLL(sendPtr(0)+offset, val, sendFlag(0));
-      }
-      if (DST) {
-        if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
-          // Last incomplete word
-          storeAL(dstPack+offset, val, nbytes & 0x7);
-        } else {
-          storeAL(dstPack+offset, val, sizeof(uint64_t));
-        }
-      }
-    }
-    exitIfAbortLocalBarrier();
-    FOR_RECV(postRecv);
-    FOR_SEND(postSend, offset);
-  }
-
-  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
-    recvConn[i] = conn;
-    recvBuff[i] = recvConn[i]->llBuff;
-    recvStep[i] = recvConn[i]->step;
-    if (tid == i) {
-      postPtr = recvConn[i]->head;
-      *(recvConn[i]->opCountLoc) = opCount;
-    }
-    nrecv++;
-  }
-
-  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
-    sendConn[i] = conn;
-    sendBuff[i] = sendConn[i]->llBuff;
-    sendStep[i] = sendConn[i]->step;
-    if (tid == WARP_SIZE+i) {
-      waitPtr = sendConn[i]->head;
-      fifoPtr = sendConn[i]->fifo;
-      sendConnHead = *waitPtr;
-      *(sendConn[i]->opCountLoc) = opCount;
-    }
-    nsend++;
-  }
-
-  __device__ __forceinline__ void saveRecvConn(int i) {
-    if (tid == i) {
-      recvConn[i]->step = recvStep[i];
-      *(recvConn[i]->opCountLoc) += 1;
-      __threadfence_block();
-    }
-  }
-
-  __device__ __forceinline__ void saveSendConn(int i) {
-    if (tid == WARP_SIZE+i) {
-      sendConn[i]->step = sendStep[i];
-      *(sendConn[i]->opCountLoc) += 1;
-      __threadfence_block();
-    }
-  }
-
- public:
-  __device__ __forceinline__
-  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
-    : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
-    // Make sure step is updated before we read it.
-    barrier();
-
-    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
-    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
-  }
-
-  __device__ void send(const T* src, int nelem) {
-    return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
-  }
-
-  __device__ void recv(T* dst, int nelem) {
-    return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
-  }
-
-  __device__ void recvReduceSend(const T* src, int nelem) {
-    return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
-  }
-
-  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
-    return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
-  }
-
-  __device__ void copySend(const T* src, T* dst, int nelem) {
-    return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
-  }
-
-  __device__ void recvCopySend(T* dst, int nelem) {
-    return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
-  }
-
-  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
-    return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
-  }
-
-  __device__ __forceinline__ ~ncclLLPrimitives() {
    // Save steps for the next operation
-    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
-    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+    saveRecvSync();
+    saveSendSync();
  }
 };
+
+#include "prims_ll.h"
+//#include "prims_ll128.h"
+
 #endif
@@ -0,0 +1,259 @@
+template <typename T, class FUNC, int NRECV, int NSEND> // T: element type; FUNC: reduction functor handed to MULTI<FUNC, T>; NRECV/NSEND: capacity of the per-peer recv/send arrays
+class ncclLLPrimitives { // Device-side primitives over ncclLLFifoLine buffers: each line is written as four 32-bit words {data low, flag, data high, flag} (see storeLL/readLL)
+ private:
+  const int tid; // Thread index supplied by the caller
+  const int nthreads; // Thread count used by barrier() and as the stride of the data loops
+  const int wid; // tid % WARP_SIZE (set in the constructor)
+  int nrecv = 0; // Number of recv connections loaded by the constructor
+  int nsend = 0; // Number of send connections loaded by the constructor
+  struct ncclConnInfo* recvConn = NULL; // Recv connection kept by this thread: connection i is stored only where wid == i
+  volatile uint64_t* recvConnHeadPtr = NULL; // Set by loadRecvSync() in the posting threads only; NULL elsewhere
+  uint64_t recvConnHead; // Local head counter, published through recvConnHeadPtr in postRecv()
+
+  struct ncclConnInfo* sendConn = NULL; // Send connection kept by this thread: connection i is stored only where wid == i
+  volatile int* sendConnFifoPtr = NULL; // Copy of sendConn->fifo; waitSend() skips the size write when it is NULL
+  volatile uint64_t* sendConnHeadPtr = NULL; // Set by loadSendSync() for threads with tid < nsend; NULL elsewhere
+  uint64_t sendConnHead; // Local count of slices claimed in waitSend()
+  uint64_t sendConnHeadCache; // Cache last seen value of *sendConnHeadPtr
+
+  uint64_t recvStep[NRECV]; // Current step of each recv connection
+  uint64_t sendStep[NSEND]; // Current step of each send connection
+  union ncclLLFifoLine* recvBuff[NRECV]; // conn->llBuff of each recv connection
+  union ncclLLFifoLine* sendBuff[NSEND]; // conn->llBuff of each send connection
+  struct ncclDevComm* comm; // Source of abortFlag and fatalDevError
+
+  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } // Line offset of the slice used by the current recv step
+  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } // Line offset of the slice used by the current send step
+  inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } // First line of the current recv slice
+  inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } // First line of the current send slice
+  inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } // Flag value expected in lines of the current recv step
+  inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } // Flag value written into lines of the current send step
+
+  inline __device__ void barrier() { // Named barrier 1 across nthreads threads
+    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+  }
+
+  uint32_t mismatch = 0; // Times the peer's opCount was seen ahead of ours during the current wait
+  const uint64_t opCount; // Operation counter passed to the constructor; written to opCountLoc and compared with opCountRem
+
+  inline __device__ void checkMismatch(struct ncclConnInfo* conn) { // Reports a suspected mismatch once the peer has been seen ahead more than 20 times
+    if (mismatch > 20) {
+      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
+      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
+      *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+    } else if (conn && *conn->opCountRem > opCount) {
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0; // Spin counter used to rate-limit the abort check
+  uint32_t abort = 0; // Last value read from comm->abortFlag
+
+  inline __device__ int checkAbort(int i, int send) { // Called once per spin iteration; returns non-zero once comm->abortFlag has been read as set
+    spins++;
+    if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { // Poll the abort flag only every SPINS_BEFORE_CHECK_ABORT spins
+      abort = *(comm->abortFlag);
+      if (wid == i) checkMismatch(send ? sendConn : recvConn); // Only the thread holding connection i runs the mismatch check
+      spins = 0;
+    }
+    return abort;
+  }
+
+  inline __device__ void waitSend(int nbytes) { // Waits for a free send slot, records the slice size in the fifo, then synchronizes all threads
+    spins = 0;
+    mismatch = 0;
+    if (sendConnHeadPtr) { // Only threads set up by loadSendSync() poll the peer
+      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { // Spin while the peer's head is NCCL_STEPS or more behind our local head
+        sendConnHeadCache = *sendConnHeadPtr;
+        if (checkAbort(wid, 1)) break;
+      }
+      if (sendConnFifoPtr) {
+        int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; // On cleanup steps incSend() rewrites the rest of the slice, so report the full slice size
+        sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
+      }
+      sendConnHead += 1;
+    }
+    barrier();
+  }
+
+  inline __device__ void incRecv(int i) { // Advances the step of recv connection i
+    recvStep[i] += 1;
+  }
+  inline __device__ void postRecv() { // Synchronizes all threads, then the posting threads publish the advanced head
+    barrier();
+    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
+  }
+
+  inline __device__ void incSend(int i, int offset) { // Advances the step of send connection i; offset is the first line this thread did not write
+    // LL Cleanup : write all flags in the slice to make sure we don't have
+    // data corruption when flag loops over.
+    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i)); // Zero data, current flag
+    }
+    sendStep[i]++;
+  }
+
+  __device__ uint64_t readLL(int i, int offset) { // Spins until line `offset` of recv connection i carries the expected flag in both flag words, then returns its 64 data bits
+    union ncclLLFifoLine* src = recvPtr(i) + offset;
+    uint32_t flag = recvFlag(i);
+    uint32_t data1, flag1, data2, flag2;
+    spins = 0;
+    mismatch = 0;
+    do {
+      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+      if (checkAbort(i, 0)) break; // On abort the loop exits without a flag match, so the returned data is not validated
+    } while ((flag1 != flag) || (flag2 != flag));
+    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+    return val64;
+  }
+
+  __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { // Writes one line as {data low, flag, data high, flag} with a single volatile v4.u32 store
+    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+  }
+
+  // Using memcpy handles misaligned pointers.
+  __device__ uint64_t readAL(uint64_t* src) { // Reads 8 bytes from src
+    uint64_t val;
+    memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+    return val;
+  }
+
+  __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { // Writes the first nbytes bytes of val to dst
+    memcpy((char*)dst, (char*)&val, nbytes);
+  }
+
+  template <int RECV, int SEND, int SRC, int DST> // RECV/SEND: use the recv/send connections; SRC/DST: read from srcPtr / write to dstPtr
+  __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+    uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T); // Negative counts are treated as empty
+    uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); // Number of 64-bit words, tail rounded up
+    uint64_t* srcPack = (uint64_t*)srcPtr;
+    uint64_t* dstPack = (uint64_t*)dstPtr;
+    int offset = tid; // Each thread handles words tid, tid+nthreads, ...
+
+    // Always waitSend in case of cleanup
+    if (SEND) waitSend(npack*sizeof(union ncclLLFifoLine));
+
+    // Do multiples of 64 bits
+    #pragma unroll 2
+    for (; offset<npack; offset+=nthreads) {
+      // Recv : local, then intra-node, then inter-node
+      uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
+      if (RECV) {
+        if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val); // Without SRC, connection 0 was already read into val above
+        for (int i=1; i<NRECV && i<nrecv; i++) {
+          val = MULTI<FUNC, T>()(readLL(i, offset), val);
+        }
+      }
+
+      // Send : inter-node, then intra-node, then local
+      if (SEND) {
+        for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
+        storeLL(sendPtr(0)+offset, val, sendFlag(0));
+      }
+      if (DST) {
+        if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) { // True when nbytes lies inside this 8-byte word
+          // Last incomplete word
+          storeAL(dstPack+offset, val, nbytes & 0x7); // Copy only the nbytes % 8 trailing bytes
+        } else {
+          storeAL(dstPack+offset, val, sizeof(uint64_t));
+        }
+      }
+    }
+    FOR_RECV(incRecv); if (RECV) postRecv(); // FOR_RECV/FOR_SEND are macros defined outside this view - presumably they apply the call to each active connection; confirm
+    FOR_SEND(incSend, offset);
+  }
+
+  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { // Caches buffer and step of recv connection i in every thread
+    recvBuff[i] = conn->llBuff;
+    recvStep[i] = conn->step;
+    if (wid == i) recvConn = conn; // Only threads with wid == i keep the connection pointer
+    nrecv++;
+  }
+  __device__ __forceinline__ void loadRecvSync() { // Sets up head posting in the selected threads
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) { // Among the last WARP_SIZE threads, one per recv connection
+      recvConnHeadPtr = recvConn->head;
+      recvConnHead = recvConn->step;
+      // Update opCount in case we skipped some operations
+      *(recvConn->opCountLoc) = opCount;
+    }
+  }
+
+  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { // Caches buffer and step of send connection i in every thread
+    sendBuff[i] = conn->llBuff;
+    sendStep[i] = conn->step;
+    if (wid == i) sendConn = conn; // Only threads with wid == i keep the connection pointer
+    nsend++;
+  }
+  __device__ __forceinline__ void loadSendSync() { // Sets up head polling and fifo access in threads tid < nsend
+    if (tid < nsend) {
+      sendConnHeadPtr = sendConn->head;
+      sendConnHeadCache = *sendConnHeadPtr;
+      sendConnHead = sendConn->step;
+      sendConnFifoPtr = sendConn->fifo;
+      *(sendConn->opCountLoc) = opCount;
+    }
+  }
+
+  __device__ __forceinline__ void saveRecvSync() { // Writes the recv step and opCount+1 back to the connection (same threads as loadRecvSync)
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+      recvConn->step = recvConnHead;
+      *(recvConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+  __device__ __forceinline__ void saveSendSync() { // Writes the send step and opCount+1 back to the connection (same threads as loadSendSync)
+    if (tid < nsend) {
+      sendConn->step = sendConnHead;
+      *(sendConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+ public:
+  __device__ __forceinline__
+  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) // recvPeers/sendPeers: peer indices into channel->devPeers; a negative entry ends the list
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
+    // Make sure step is updated before we read it.
+    barrier();
+
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+    loadRecvSync();
+    loadSendSync();
+  }
+
+  __device__ void send(const T* src, int nelem) { // src -> send peers
+    return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recv(T* dst, int nelem) { // recv peers -> dst
+    return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceSend(const T* src, int nelem) { // FUNC(recv peers, src) -> send peers
+    return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { // FUNC(recv peers, src) -> dst
+    return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void copySend(const T* src, T* dst, int nelem) { // src -> dst and send peers
+    return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void recvCopySend(T* dst, int nelem) { // recv peers -> dst and send peers
+    return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { // FUNC(recv peers, src) -> dst and send peers
+    return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ __forceinline__ ~ncclLLPrimitives() {
+    // Save steps for the next operation
+    saveRecvSync();
+    saveSendSync();
+  }
+};
@@ -0,0 +1,410 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "op128.h"
+
+#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
+
+template <typename T, class FUNC, int NRECV, int NSEND> // T: element type; FUNC: reduction functor handed to MULTI<FUNC, T>; NRECV/NSEND: capacity of the per-peer recv/send arrays
+class ncclLL128Primitives { // Device-side primitives over uint64_t ll128Buff buffers: data moves in 128-bit accesses and flag threads replace their second 64-bit word with a step flag
+ private:
+  const int tid; // Thread index supplied by the caller
+  const int nthreads; // Thread count used by barrier()
+  const int wid; // tid % WARP_SIZE
+  const int warp; // tid / WARP_SIZE
+  const bool flagThread; // (tid%8)==7: this thread's second word of each 128-bit access is the flag slot
+  int nrecv = 0; // Number of recv connections loaded by the constructor
+  int nsend = 0; // Number of send connections loaded by the constructor
+  struct ncclConnInfo* recvConn = NULL; // Recv connection kept by this thread: connection i is stored only where wid == i
+  volatile uint64_t* recvConnHeadPtr = NULL; // Set by loadRecvSync() in the posting threads only; NULL elsewhere
+  uint64_t recvConnHead; // Local head counter, published in postRecv()
+
+  struct ncclConnInfo* sendConn = NULL; // Send connection kept by this thread: connection i is stored only where wid == i
+  volatile int* sendConnFifoPtr = NULL; // Copy of sendConn->fifo; waitSend() skips the size write when it is NULL
+  volatile uint64_t* sendConnTailPtr = NULL; // Set by loadSendSync() only when the connection has a fifo
+  uint64_t sendConnTail; // Local tail counter, published in postSend()
+  volatile uint64_t* sendConnHeadPtr = NULL; // Set by loadSendSync() for threads with tid < nsend; NULL elsewhere
+  uint64_t sendConnHead; // Local count of slices claimed in waitSend()
+  uint64_t sendConnHeadCache; // Cache last seen value of *sendConnHeadPtr
+
+  uint64_t recvStep[NRECV]; // Current step of each recv connection
+  uint64_t sendStep[NSEND]; // Current step of each send connection
+  uint64_t* recvBuff[NRECV]; // conn->ll128Buff of each recv connection
+  uint64_t* sendBuff[NSEND]; // conn->ll128Buff of each send connection
+  struct ncclDevComm* comm; // Source of abortFlag and fatalDevError
+
+  volatile uint64_t* shmem; // ncclShmem + per-warp region (indexed by threadIdx.x/WARP_SIZE) + 2*wid, see constructor
+
+  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } // Word offset of the slice used by the current recv step
+  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } // Word offset of the slice used by the current send step
+  inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } // First word of the current recv slice
+  inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } // First word of the current send slice
+  inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; } // 64-bit flag expected for the current recv step
+  inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } // 64-bit flag written for the current send step
+
+  inline __device__ void barrier() { // Named barrier across nthreads threads; id 2 when NSEND>NRECV, id 3 otherwise (presumably so differently-shaped instances do not share a barrier - confirm)
+    if (NSEND>NRECV) {
+      asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
+    } else {
+      asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
+    }
+  }
+
+  uint32_t mismatch = 0; // Times the peer's opCount was seen ahead of ours
+  const uint64_t opCount; // Operation counter passed to the constructor; written to opCountLoc and compared with opCountRem
+
+  inline __device__ void checkMismatch(struct ncclConnInfo* conn) { // Reports a suspected mismatch once the peer has been seen ahead more than 20 times
+    if (mismatch > 20) {
+      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
+      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
+      *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+    } else if (conn && *conn->opCountRem > opCount) {
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0; // Spin counter used to rate-limit the abort check
+  uint32_t abort = 0; // Last value read from comm->abortFlag
+
+  inline __device__ int checkAbort(int i, int send) { // Called once per spin iteration; returns non-zero once comm->abortFlag has been read as set
+    spins++;
+    if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { // Poll the abort flag only every SPINS_BEFORE_CHECK_ABORT spins
+      abort = *(comm->abortFlag);
+      if (wid == i) checkMismatch(send ? sendConn : recvConn); // Only the thread holding connection i runs the mismatch check
+      spins = 0;
+    }
+    return abort;
+  }
+
+  inline __device__ void waitSend(int nbytes) { // Waits for a free send slot and records nbytes in the fifo; unlike postRecv/postSend callers, no barrier is issued here
+    spins = 0;
+    mismatch = 0;
+    if (sendConnHeadPtr) { // Only threads set up by loadSendSync() poll the peer
+      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { // Spin while the peer's head is NCCL_STEPS or more behind our local head
+        sendConnHeadCache = *sendConnHeadPtr;
+        if (checkAbort(wid, 1)) break;
+      }
+      if (sendConnFifoPtr) {
+        sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes; // wid == tid < nsend here, so sendStep[wid] is this thread's connection
+      }
+      sendConnHead += 1;
+    }
+  }
+
+  inline __device__ void incRecv(int i) { // Advances the step of recv connection i
+    recvStep[i] += 1;
+  }
+  inline __device__ void postRecv() { // Posting threads publish the advanced head
+    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
+  }
+
+  inline __device__ void incSend(int i) { // Advances the step of send connection i
+    sendStep[i] += 1;
+  }
+  inline __device__ void postSend() { // Threads with a tail pointer fence their writes, then publish the advanced tail
+    if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; }
+  }
+
+  template <int ELEMS_PER_THREAD>
+  inline __device__ void loadSrcToShmem128(int maxOffset, const uint64_t* src64Ptr) { // Copies this thread's 128-bit pairs from src64Ptr to shmem; pairs whose offset u*WARP_SIZE is not below maxOffset are skipped
+#if 0
+    uint64_t v[ELEMS_PER_THREAD];
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      if (u*WARP_SIZE < maxOffset) load128(src64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+    uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      storeShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+#else
+    uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      if (u*WARP_SIZE < maxOffset) {
+        uint64_t v0, v1;
+        load128(src64Ptr+u*WARP_SIZE, v0, v1);
+        storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
+      }
+    }
+#endif
+  }
+
+  inline __device__ void loadSrcToShmem(int start, int end, const T* srcPtr) { // Element-wise copy of [start, end) from srcPtr to the warp's shmem region, lanes striding by WARP_SIZE
+    T* shmemPtr = (T*)(shmem-2*wid); // Undo the per-thread 2*wid offset to get the warp's base
+    for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
+      shmemPtr[offset] = srcPtr[offset];
+    }
+  }
+
+  template <int ELEMS_PER_THREAD>
+  inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) { // Reads all of this thread's pairs from shmem, then stores to dst64Ptr those whose offset is below maxOffset
+    uint64_t v[ELEMS_PER_THREAD];
+    uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+  }
+
+  inline __device__ void storeShmemToDst(int start, int end, T* dstPtr) { // Element-wise copy of [start, end) from the warp's shmem region to dstPtr, lanes striding by WARP_SIZE
+    T* shmemPtr = (T*)(shmem-2*wid); // Undo the per-thread 2*wid offset to get the warp's base
+    for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
+      dstPtr[offset] = shmemPtr[offset];
+    }
+  }
+
+  #define WARP_MASK 0xffffffff
+
+  template <int ELEMS_PER_THREAD, int RECV, int SEND, int SRC, int DST> // RECV/SEND: use the recv/send connections; SRC/DST: take input from / leave output in shmem
+  __device__ __forceinline__ void recvReduceSendCopy(int ll128Offset) { // One warp-wide pass: shmem -> registers, wait for and reduce incoming lines, store outgoing lines with flags, registers -> shmem
+    uint64_t v[ELEMS_PER_THREAD];
+
+    /************* Data Loading : SHMEM -> REG **************/
+    if (SRC) {
+      volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS; // NOTE(review): this shift and the (WARP_SIZE-2) stride presumably skip the flag slots so shmem holds packed data - confirm against NCCL_LL128_LINEELEMS
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        v[u] = shmem64Ptr[u*(WARP_SIZE-2)];
+        if (!flagThread) v[u+1] = shmem64Ptr[u*(WARP_SIZE-2)+1]; // Flag threads carry no data in their second word
+      }
+    }
+    /*********** End Data Loading : SHMEM -> REG ************/
+
+    /************************ Recv **************************/
+    if (RECV) {
+      uint64_t flag = recvFlag(0);
+      uint64_t* ptr = recvPtr(0)+ll128Offset;
+      bool needReload;
+      uint64_t v0, v1;
+      do {
+        needReload = false;
+        #pragma unroll
+        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+          load128(ptr+u*WARP_SIZE, v0, v1);
+          needReload |= flagThread && (v1 != flag); // Only flag threads check; their second word must equal the expected flag
+        }
+      } while (__any_sync(WARP_MASK, needReload) && checkAbort(0, 0) == 0); // Warp-wide vote: repeat until every flag matches or an abort is seen
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        load128(ptr+u*WARP_SIZE, v0, v1); // Read the data again after the wait loop
+        v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
+        v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
+      }
+
+      for (int i=1; i<NRECV && i<nrecv; i++) { // Remaining recv connections are always reduced into v
+        uint64_t flag = recvFlag(i);
+        uint64_t* ptr = recvPtr(i)+ll128Offset;
+        uint64_t v0, v1;
+        do {
+          needReload = false;
+          #pragma unroll
+          for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+            load128(ptr+u*WARP_SIZE, v0, v1);
+            needReload |= flagThread && (v1 != flag);
+          }
+        } while (__any_sync(WARP_MASK, needReload) && checkAbort(i, 0) == 0);
+        #pragma unroll
+        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+          load128(ptr+u*WARP_SIZE, v0, v1);
+          v[u] = MULTI<FUNC, T>()(v0, v[u]);
+          v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
+        }
+      }
+    }
+    /********************** End Recv ************************/
+
+    /************************ Send **************************/
+    if (SEND) {
+      for (int i=1; i<NSEND && i<nsend; i++) {
+        uint64_t flag = sendFlag(i); // Keep all 64 bits: the receiver compares against a 64-bit recvFlag(), so narrowing to int would stop matching once the step exceeds INT_MAX
+        uint64_t* ptr = sendPtr(i)+ll128Offset;
+        #pragma unroll
+        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+          store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]); // Flag threads write the flag in place of their second word
+        }
+      }
+      uint64_t flag = sendFlag(0); // Same 64-bit flag handling for connection 0, which is written last
+      uint64_t* ptr = sendPtr(0)+ll128Offset;
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+      }
+    }
+    /********************** End Send ************************/
+
+    /************* Data Storing : REG -> SHMEM **************/
+    if (DST) {
+      volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS; // Same addressing as the SRC load above
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        shmem64Ptr[u*(WARP_SIZE-2)] = v[u];
+        if (!flagThread) shmem64Ptr[u*(WARP_SIZE-2)+1] = v[u+1];
+      }
+    }
+    /*********** End data Storing : REG -> SHMEM ************/
+  }
+
+  #define LL128INC (WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD)
+  #define ELEMINC (LL128INC-(LL128INC/NCCL_LL128_LINEELEMS))
+
+  template <int RECV, int SEND, int SRC, int DST> // RECV/SEND: use the recv/send connections; SRC/DST: read from srcPtr / write to dstPtr
+  __device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem) { // Moves nelem elements: each warp stages ELEMINC data words per pass through shmem and LL128INC words through the LL128 buffers
+    if (nelem <= 0) {
+      // Don't move any data but still increase steps and sync with prev/next
+      if (SEND) waitSend(0);
+      FOR_SEND(incSend); if (SEND) postSend(); // FOR_SEND/FOR_RECV are macros defined outside this view - presumably they apply the call to each active connection; confirm
+      FOR_RECV(incRecv); if (RECV) postRecv();
+      return;
+    }
+    const int nelem64 = ((nelem*sizeof(T))/(2*sizeof(uint64_t)))*2; // 64-bit words that whole 128-bit accesses can cover (rounded down to a multiple of 2)
+    const uint64_t* src64Ptr = ((uint64_t*)srcPtr);
+    uint64_t* dst64Ptr = ((uint64_t*)dstPtr);
+
+    int ll128Offset = LL128INC*warp+2*wid; // This thread's first word in the LL128 slice
+    int elemOffset = ELEMINC*warp; // This warp's first 64-bit word in user data
+    const int nwarps = nthreads/WARP_SIZE;
+
+    if (SEND) waitSend(DIVUP(nelem*sizeof(T), ELEMINC*sizeof(uint64_t))*LL128INC*sizeof(uint64_t)); // Size reported: number of ELEMINC-word passes times LL128INC words, in bytes
+    barrier();
+
+    while (elemOffset*(sizeof(uint64_t)/sizeof(T)) < nelem) {
+      const int maxOffset128 = min(nelem64-elemOffset, (int)ELEMINC); // Words of this pass reachable with 128-bit accesses
+      const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T)))); // Elements of type T in this pass
+      if (SRC) {
+        int done = 0;
+        if ((((uint64_t)srcPtr)&0xf) == 0) { // 128-bit path only when srcPtr is 16-byte aligned
+          loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
+          done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
+        }
+        loadSrcToShmem(done, maxOffset, (T*)(src64Ptr+elemOffset)); // Element-wise copy of whatever the 128-bit path did not cover
+      }
+      __syncwarp();
+      recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SRC, DST>(ll128Offset);
+      __syncwarp();
+      if (DST) {
+        int done = 0;
+        if ((((uint64_t)dstPtr)&0xf) == 0) { // 128-bit path only when dstPtr is 16-byte aligned
+          storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
+          done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
+        }
+        storeShmemToDst(done, maxOffset, (T*)(dst64Ptr+elemOffset)); // Element-wise copy of whatever the 128-bit path did not cover
+      }
+      __syncwarp();
+      ll128Offset += LL128INC*nwarps;
+      elemOffset += ELEMINC*nwarps;
+    }
+
+    barrier();
+    FOR_SEND(incSend); if (SEND) postSend();
+    FOR_RECV(incRecv); if (RECV) postRecv();
+  }
+
+  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { // Caches buffer and step of recv connection i in every thread
+    recvBuff[i] = conn->ll128Buff;
+    recvStep[i] = conn->step;
+    if (wid == i) recvConn = conn; // Only threads with wid == i keep the connection pointer
+    nrecv++;
+  }
+  __device__ __forceinline__ void loadRecvSync() { // Sets up head posting in the selected threads
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) { // Among the last WARP_SIZE threads, one per recv connection
+      recvConnHeadPtr = recvConn->head;
+      recvConnHead = recvConn->step;
+      // Update opCount in case we skipped some operations
+      *(recvConn->opCountLoc) = opCount;
+    }
+  }
+
+  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { // Caches buffer and step of send connection i in every thread
+    sendBuff[i] = conn->ll128Buff;
+    sendStep[i] = conn->step;
+    if (wid == i) sendConn = conn; // Only threads with wid == i keep the connection pointer
+    nsend++;
+  }
+  __device__ __forceinline__ void loadSendSync() { // Sets up head polling (threads tid < nsend) and tail posting (among the last WARP_SIZE threads)
+    if (tid < nsend) {
+      sendConnHeadPtr = sendConn->head;
+      sendConnHeadCache = *sendConnHeadPtr;
+      sendConnHead = sendConn->step;
+      sendConnFifoPtr = sendConn->fifo;
+      *(sendConn->opCountLoc) = opCount;
+    }
+    if (tid >= nthreads-WARP_SIZE && wid<nsend) {
+      if (sendConn->fifo) { // The tail is only tracked when the connection has a fifo
+        sendConnTailPtr = sendConn->tail;
+        sendConnTail = sendConn->step;
+      }
+    }
+  }
+
+  __device__ __forceinline__ void saveRecvSync() { // Writes the recv step and opCount+1 back to the connection (same threads as loadRecvSync)
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+      recvConn->step = recvConnHead;
+      *(recvConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+  __device__ __forceinline__ void saveSendSync() { // Writes the send step and opCount+1 back to the connection (threads tid < nsend)
+    if (tid < nsend) {
+      sendConn->step = sendConnHead;
+      *(sendConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+ public:
+  __device__ __forceinline__
+  ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) // recvPeers/sendPeers: peer indices into channel->devPeers; a negative entry ends the list
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
+    // Make sure step is updated before we read it.
+    barrier();
+
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+    loadRecvSync();
+    loadSendSync();
+  }
+
+  __device__ void send(const T* src, int nelem) { // src -> send peers
+    return GenericOp<0, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recv(T* dst, int nelem) { // recv peers -> dst
+    return GenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceSend(const T* src, int nelem) { // FUNC(recv peers, src) -> send peers
+    return GenericOp<1, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { // FUNC(recv peers, src) -> dst
+    return GenericOp<1, 0, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void copySend(const T* src, T* dst, int nelem) { // src -> dst and send peers
+    return GenericOp<0, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void recvCopySend(T* dst, int nelem) { // recv peers -> dst and send peers
+    return GenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { // FUNC(recv peers, src) -> dst and send peers
+    return GenericOp<1, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ __forceinline__ ~ncclLL128Primitives() {
+    // Save steps for the next operation
+    saveRecvSync();
+    saveSendSync();
+  }
+};
@@ -11,7 +11,7 @@
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -30,7 +30,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -93,3 +93,48 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
+  // Ring Reduce using the LL128 primitives (one recv peer, one send peer per channel).
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+  const int nthreads = args->nThreads;
+  const int blockId = args->bid;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(threadIdx.x, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+  const ssize_t count = args->N;
+  const int root = args->root;
+  const int nranks = comm->nRanks;
+  const bool isRoot = (comm->rank == root);
+  const bool prevIsRoot = (ring->devUserRanks[nranks-1] == root);
+  // Full chunk size in elements, and the granularity that tail chunks are rounded up to.
+  ssize_t chunkElems = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const ssize_t chunkGranularity = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const ssize_t gridStride = args->nChannels*chunkElems;
+
+  const T * __restrict__ input = (const T*)args->ThisInput;
+  T * __restrict__ output = (T*)args->ThisOutput;
+
+  ssize_t gridOffset = 0;
+  while (gridOffset < count) {
+    // Per-channel share of what is left, rounded up to the granularity and capped at the current chunk size.
+    chunkElems = min(DIVUP(count-gridOffset, args->nChannels*chunkGranularity)*chunkGranularity, chunkElems);
+    const ssize_t offset = gridOffset + blockId*chunkElems;
+    const int nelem = min(chunkElems, count-offset);
+    if (!prevIsRoot && !isRoot) {
+      LLprims.recvReduceSend(input+offset, nelem);
+    } else if (prevIsRoot) {
+      LLprims.send(input+offset, nelem);
+    } else {
+      LLprims.recvReduceCopy(input+offset, output+offset, nelem);
+    }
+    gridOffset += gridStride;
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { } // Empty body: this Reduce variant performs no work; presumably kept so the tree/LL128 combination still has a definition - confirm
@@ -11,7 +11,7 @@
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -19,7 +19,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
  const ssize_t size = args->N;
  const int nranks = comm->nRanks;
  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

  // Compute pointers
@@ -27,7 +27,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -121,3 +121,64 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
+  // Ring ReduceScatter using the LL128 primitives (one recv peer, one send peer per channel).
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+  const int nthreads = args->nThreads;
+  const int blockId = args->bid;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(threadIdx.x, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+  const ssize_t count = args->N;
+  const int nranks = comm->nRanks;
+
+  // Full chunk size in elements.
+  ssize_t chunkElems = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+  const ssize_t chunkGranularity = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+  const ssize_t gridStride = args->nChannels*chunkElems;
+
+  const T * __restrict__ input = (const T*)args->ThisInput;
+  T * __restrict__ output = (T*)args->ThisOutput;
+
+  ssize_t gridOffset = 0;
+  while (gridOffset < count) {
+    // Per-channel share of what is left, rounded up to the granularity and capped at the current chunk size.
+    chunkElems = min(DIVUP(count-gridOffset, args->nChannels*chunkGranularity)*chunkGranularity, chunkElems);
+    const ssize_t chunkOffset = gridOffset + blockId*chunkElems;
+    const int nelem = min(chunkElems, count-chunkOffset);
+
+    // Step 0: send our input for the block indexed by devUserRanks[nranks-1].
+    const int firstDest = ring->devUserRanks[nranks-1];
+    LLprims.send(input + (chunkOffset + firstDest * count), nelem);
+
+    // Steps 1..nranks-2: walk devUserRanks from index nranks-2 down to 1,
+    // adding our input to what arrives and passing the sum on.
+    for (int idx = nranks-2; idx >= 1; --idx) {
+      const int dest = ring->devUserRanks[idx];
+      LLprims.recvReduceSend(input + (chunkOffset + dest * count), nelem);
+    }
+
+    // Final step: combine the incoming data with our input for the block
+    // indexed by devUserRanks[0] and store the result in the output buffer.
+    const int lastDest = ring->devUserRanks[0];
+    LLprims.recvReduceCopy(input + (chunkOffset + lastDest * count), output+chunkOffset, nelem);
+
+    gridOffset += gridStride;
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { } // Empty body: this ReduceScatter variant performs no work; presumably kept so the tree/LL128 combination still has a definition - confirm
@@ -0,0 +1,169 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "nccl_net.h"
+#include <stdlib.h>
+#include <stdarg.h>
+
+int ncclDebugLevel = -1; // -1 until ncclDebugInit() has selected a level
+thread_local int ncclDebugNoWarn = 0; // Per-thread; when 1, ncclDebugLog() reports WARN as INFO
+uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
+FILE *ncclDebugFile = stdout; // Output stream; ncclDebugInit() may switch it to the NCCL_DEBUG_FILE file
+pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; // Held during ncclDebugInit() and while ncclDebugLog() formats and writes
+
+void ncclDebugInit() { // One-time setup of the debug level, sub-system mask and output file from the NCCL_DEBUG* env vars.
+  pthread_mutex_lock(&ncclDebugLock);
+  if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } // Already initialized: never leave holding the lock
+  const char* nccl_debug = getenv("NCCL_DEBUG");
+  if (nccl_debug == NULL) {
+    ncclDebugLevel = NCCL_LOG_NONE;
+  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
+    ncclDebugLevel = NCCL_LOG_VERSION;
+  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
+    ncclDebugLevel = NCCL_LOG_WARN;
+  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
+    ncclDebugLevel = NCCL_LOG_INFO;
+  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
+    ncclDebugLevel = NCCL_LOG_ABORT;
+  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
+    ncclDebugLevel = NCCL_LOG_TRACE;
+  }
+
+  /* Parse the NCCL_DEBUG_SUBSYS env var
+   * This can be a comma separated list such as INIT,COLL
+   * or ^INIT,COLL etc
+   */
+  char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
+  if (ncclDebugSubsysEnv != NULL) {
+    int invert = 0;
+    if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
+    ncclDebugMask = invert ? ~0ULL : 0ULL;
+    char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
+    char *subsys = ncclDebugSubsys ? strtok(ncclDebugSubsys, ",") : NULL; // Skip parsing if strdup failed
+    while (subsys != NULL) {
+      uint64_t mask = 0;
+      if (strcasecmp(subsys, "INIT") == 0) {
+        mask = NCCL_INIT;
+      } else if (strcasecmp(subsys, "COLL") == 0) {
+        mask = NCCL_COLL;
+      } else if (strcasecmp(subsys, "P2P") == 0) {
+        mask = NCCL_P2P;
+      } else if (strcasecmp(subsys, "SHM") == 0) {
+        mask = NCCL_SHM;
+      } else if (strcasecmp(subsys, "NET") == 0) {
+        mask = NCCL_NET;
+      } else if (strcasecmp(subsys, "GRAPH") == 0) {
+        mask = NCCL_GRAPH;
+      } else if (strcasecmp(subsys, "TUNING") == 0) {
+        mask = NCCL_TUNING;
+      } else if (strcasecmp(subsys, "ALL") == 0) {
+        mask = NCCL_ALL;
+      }
+      if (mask) {
+        if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
+      }
+      subsys = strtok(NULL, ",");
+    }
+    free(ncclDebugSubsys);
+  }
+
+  /* Expand NCCL_DEBUG_FILE (%h = hostname, %p = pid, %% = literal %) and create the debug file.
+   * But don't bother unless the NCCL_DEBUG level is > VERSION. */
+  const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
+  char debugFn[PATH_MAX+1] = "";
+  int announceFile = 0;
+  if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
+    int c = 0;
+    char *dfn = debugFn, *dfnEnd = debugFn+PATH_MAX; // dfnEnd is the last slot, reserved for the NUL
+    while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX && dfn < dfnEnd) {
+      if (ncclDebugFileEnv[c] != '%' || ncclDebugFileEnv[c+1] == '\0') { // Plain char, or a lone trailing %
+        *dfn++ = ncclDebugFileEnv[c++];
+        continue;
+      }
+      const char spec = ncclDebugFileEnv[c+1]; c += 2;
+      size_t room = dfnEnd-dfn; // Characters that still fit in front of the NUL slot
+      int n = 0;
+      char hostname[1024];
+      switch (spec) {
+        case '%': // Double %
+          *dfn++ = '%';
+          break;
+        case 'h': // %h = hostname
+          getHostName(hostname, 1024, '.');
+          n = snprintf(dfn, room+1, "%s", hostname);
+          break;
+        case 'p': // %p = pid
+          n = snprintf(dfn, room+1, "%d", getpid());
+          break;
+        default: // Echo everything we don't understand
+          *dfn++ = '%';
+          if (dfn < dfnEnd) *dfn++ = spec;
+          break;
+      }
+      if (n > 0) dfn += ((size_t)n < room) ? (size_t)n : room; // snprintf reports the untruncated length
+    }
+    *dfn = '\0';
+    FILE *file = (debugFn[0] != '\0') ? fopen(debugFn, "w") : NULL;
+    if (file != NULL) { ncclDebugFile = file; announceFile = 1; }
+  }
+
+#ifdef ENABLE_TRACE
+  ncclEpoch = std::chrono::high_resolution_clock::now();
+#endif
+  pthread_mutex_unlock(&ncclDebugLock);
+  // Announce the file only after unlocking: ncclDebugLog() acquires ncclDebugLock itself.
+  if (announceFile) INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
+}
+
+/* Common logging function used by the INFO, WARN and TRACE macros.
+ * Also exported to the dynamically loadable Net transport modules so they can share the
+ * debugging mechanisms and output files. A WARN also calls abort() when the level is ABORT.
+ */
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
+  if (ncclDebugLevel == -1) ncclDebugInit();
+  if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;
+
+  char hostname[1024];
+  getHostName(hostname, 1024, '.');
+  int cudaDev = -1; // Printed as -1 if cudaGetDevice() leaves it untouched
+  cudaGetDevice(&cudaDev);
+
+  char buffer[1024];
+  size_t len = 0;
+  pthread_mutex_lock(&ncclDebugLock);
+  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
+    len = snprintf(buffer, sizeof(buffer),
+                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
+  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
+    len = snprintf(buffer, sizeof(buffer),
+                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
+#ifdef ENABLE_TRACE
+  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
+    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
+    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
+    len = snprintf(buffer, sizeof(buffer),
+                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
+  }
+#endif
+  if (len >= sizeof(buffer)) len = sizeof(buffer)-1; // snprintf returns the untruncated length; keep sizeof(buffer)-len from wrapping
+  if (len) {
+    va_list vargs;
+    va_start(vargs, fmt);
+    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
+    va_end(vargs);
+    fprintf(ncclDebugFile,"%s\n", buffer);
+    fflush(ncclDebugFile);
+  }
+  pthread_mutex_unlock(&ncclDebugLock);
+
+  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
+  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
+    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
+            hostname, getpid(), gettid(), cudaDev, filefunc, line);
+    abort();
+  }
+}
@@ -5,19 +5,17 @@
 ************************************************************************/

 #include "enqueue.h"
-#include "checks.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
+#include "argcheck.h"

 // Only generate inline kernels for LL
 #define NCCL_FUNC5(coll, op, dtype) \
+  (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
  (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
  (void*)NCCL_KERN_NAME(coll##LL, op, dtype)

 #define NCCL_FUNC4(coll, op, dtype) \
-  (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
-  (void*)NCCL_FUNC5(coll##Tree, op, dtype)
+  (void*)NCCL_FUNC5(coll##Tree, op, dtype), \
+  (void*)NCCL_FUNC5(coll##Ring, op, dtype)

 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -54,7 +52,7 @@
  NCCL_FUNCS3B(coll, copy)

 // Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
  NCCL_FUNCS2B(ncclBroadcast),
  NCCL_FUNCS2A(ncclReduce),
  NCCL_FUNCS2B(ncclAllGather),
@@ -207,6 +205,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
    channel->collCount = 0;
  }
  params->gridDim.x = params->blockDim.x = 0;
+  comm->lastOpCount = comm->opCount;
  NCCLCHECK(transportStartProxy(comm));
  return ncclSuccess;
 }
@@ -228,20 +227,70 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
 /* Enqueueing system : computation of kernel and proxy operations parameters */
 /*****************************************************************************/

-static ncclResult_t getPatternInfo(struct ncclInfo* info) {
-  if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
-  else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
-  else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
-  else if (info->coll == ncclCollAllReduce) {
-    if (info->nBytes <= info->comm->treeThreshold)
-      info->pattern = ncclPatternTreeUpDown;
-    else
-      info->pattern = ncclPatternRingTwice;
+// Trees do not stick perfectly to the model for medium sizes. Applying a static correction factor is not ideal
+// but works quite well. Indexed [protocol][log2(nBytes/64)]: 22 columns, 64 B up to 64 B << 21 = 128 MB.
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
+  { 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .7,  .7,  .7,  .7,  .6,  .5,  .5,  .5,  .6,  .7,  .8,  .9,  .9, 1.0, 1.0, 1.0 },
+  { 1.0, 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .8,  .8,  .8,  .7,  .7,  .7,  .6,  .6,  .7,  .7,  .8,  .8,  .9,  .9, 1.0 },
+  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .5,  .5,  .6,  .6,  .7,  .8,  .9 }
+};
+
+static ncclResult_t getAlgoInfo(struct ncclInfo* info) { // Pick the algorithm/protocol with the lowest estimated time, then size nChannels/nThreads.
+  struct ncclComm* comm = info->comm;
+  float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
+  // Find algorithm / protocol.
+  info->algorithm = -1;
+  info->protocol = -1;
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      float bw = comm->bandwidths[info->coll][a][p];
+      if (bw == 0) continue; // A zero bandwidth entry excludes this combination
+      int logSize = log2i(info->nBytes>>6); // Column of treeCorrectionFactor (size counted in 64-byte units)
+      if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize]; // Sizes past the table are left uncorrected
+      float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw); // Latency + size/bandwidth; units presumably us and GB/s - TODO confirm
+      if (time < minTime) {
+        info->algorithm = a;
+        info->protocol = p;
+        minTime = time;
+      }
+    }
   }
-  else {
-    WARN("Unknown collective %d", info->coll);
+  if (info->algorithm == -1 || info->protocol == -1) {
+    WARN("Error : no algorithm/protocol available");
     return ncclInternalError;
   }
+  //if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime);
+  TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
+
+  int nc = comm->nChannels;
+  int nt = comm->maxThreads[info->protocol];
+  int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
+  while (info->nBytes < nc*nt*threadThreshold) { // Shrink the launch while the payload is below nc*nt*threadThreshold
+    if (nc >= 2) nc--; // First give up channels, down to a single one
+    else if ((nt % 128) == 0) nt/=2; // Then halve the threads for as long as nt is a multiple of 128
+    else break;
+  }
+  if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
+  info->nChannels = nc;
+  info->nThreads = nt;
+  return ncclSuccess;
+}
+
+static ncclResult_t getPatternInfo(struct ncclInfo* info) { // Derive info->pattern from the collective and the algorithm already stored in info.
+  switch (info->coll) {
+    case ncclCollBroadcast:
+      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
+    case ncclCollReduce:
+      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
+    case ncclCollReduceScatter:
+    case ncclCollAllGather:
+      info->pattern = ncclPatternRing; break; // Ring pattern whatever the selected algorithm
+    case ncclCollAllReduce:
+      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
+    default: // Any other collective value is reported as an internal error
+      WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
+      return ncclInternalError;
+  }
   return ncclSuccess;
 }

@@ -264,40 +313,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
  return ncclSuccess;
 }

-static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
-  // Compute thresholds and limits that users can override
-  ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
-  int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
-
-  // First compute nThreads
-  int nt = NCCL_LL_MIN_NTHREADS;
-  while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
-
-  // Then compute nChannels
-  int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
-  if (nc == 0) nc = 1;
-  if (nc > info->comm->nChannels) nc = info->comm->nChannels;
-
-  // Check if we have a fixed LL threshold, otherwise compute it.
-  int perThreadThreshold = info->comm->threadThreshold;
-  if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
-  ssize_t llThreshold = info->comm->llThreshold >= 0 ?
-    info->comm->llThreshold :
-    nc*nt*info->nchunksPerLoop*perThreadThreshold;
-
-  if (info->nBytes <= llThreshold) {
-    *llMode = 1;
-    *nChannels = nc;
-    *nThreads = nt;
-  } else {
-    *llMode = 0;
-    *nChannels = info->comm->nChannels;
-    *nThreads = info->comm->nThreads+1;
-  }
-}
-
 static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
  // Set nstepsPerLoop and nchunksPerLoop
+  NCCLCHECK(getAlgoInfo(info));
  NCCLCHECK(getPatternInfo(info));
  NCCLCHECK(getLoopInfo(info));

@@ -307,48 +325,52 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
  coll->args.ThisOutput = info->recvbuff;
  coll->args.comm = info->comm->devComm;
  coll->args.opCount = info->comm->opCount;
+  coll->args.nChannels = info->nChannels;
+  coll->args.nThreads = info->nThreads;

-  // Compute llMode, nChannels, nThreads
-  int llMode;
-  getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
+  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);

-  int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
-  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
-
-  int stepSize   = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
-  int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
-  int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
+  int stepSize   = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
+  int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
+  int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
  int chunkSize  = stepSize*chunkSteps;

  // Compute lastChunkSize
-  if (treeMode == 1 && llMode == 0) {
+  if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
    if (info->pattern == ncclPatternTreeUpDown) {
      // Optimize chunkSize / nSteps
-      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
-      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
-      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
    }
    // Use lastChunkSize as chunkSize
    coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
-  } else if (llMode == 1) {
+  } else if (info->protocol == NCCL_PROTO_LL) {
    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
-    const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
-    coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
-    ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
+    const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
+    coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+    ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t));
    coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+  } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
+    int nstepsInter = 1+log2i(info->comm->nNodes);
+    while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2;
+    // Use lastChunkSize as chunkSize
+    coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
  }

  // Compute nSteps for proxies
-  size_t nBytes  = llMode ? info->nBytes*2 : info->nBytes;
-
-  int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
+  int chunkEffectiveSize = chunkSize;
+  if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
+  if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
+  //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
+  int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
  proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
  proxyArgs->sliceSteps = sliceSteps;
  proxyArgs->chunkSteps = chunkSteps;
-  proxyArgs->llMode = llMode;
+  proxyArgs->protocol = info->protocol;
  proxyArgs->opCount = info->comm->opCount;
-  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
-      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
+  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
      nLoops, proxyArgs->nsteps, info->comm);
  return ncclSuccess;
 }
@@ -401,7 +423,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
    channel->collFifoTail = opIndex;
    channel->collCount++;
  }
-  /*if (llMode == 0)*/ info->comm->opCount++;
+  info->comm->opCount++;
  return ncclSuccess;
 }

@@ -0,0 +1,268 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "graph.h"
+#include "trees.h"
+#include "rings.h"
+
+/******************************************************************/
+/********************* Internode connection ***********************/
+/******************************************************************/
+
+ncclResult_t ncclTopoPreset(struct ncclComm* comm,
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+    struct ncclTopoRanks* topoRanks) { // Set this rank's per-channel ring/tree neighbours from the graphs' intra arrays and fill topoRanks.
+  int rank = comm->rank;
+  int localRanks = comm->localRanks;
+  int nChannels = comm->nChannels;
+
+  for (int c=0; c<nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    channel->ring.prev = channel->ring.next = -1; // Every link starts as -1 (no neighbour)
+    channel->treeUp.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
+    channel->treeDn.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
+
+    int* ringIntra = ringGraph->intra+c*localRanks; // Channel c's slice: localRanks entries of the ring order
+    int* treeIntra = treeGraph->intra+c*localRanks; // Channel c's slice: localRanks entries of the tree order
+
+    for (int i=0; i<localRanks; i++) {
+      if (ringIntra[i] == rank) {
+        topoRanks->ringRecv[c] = ringIntra[0]; // First entry of the slice
+        topoRanks->ringSend[c] = ringIntra[localRanks-1]; // Last entry of the slice
+        channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1]; // The two ends keep -1 on their outer side
+        channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
+      }
+      if (treeIntra[i] == rank) {
+        int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
+        int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks; // Neighbouring positions, wrapping around the slice
+
+        // Tree loop always flows in the same direction. Other trees are symmetric, i.e.
+        // up/down go in reverse directions
+        int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
+
+        // Down tree is common
+        topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
+        topoRanks->treeDnSend[c] = treeIntra[sendIndex];
+        channel->treeDn.up       = treeIntra[prev];
+        channel->treeDn.down[0]  = treeIntra[next];
+        // Up tree depends on the pattern
+        topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
+        topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
+        channel->treeUp.down[0]  = sym ? channel->treeDn.down[0]  : channel->treeDn.up ;
+        channel->treeUp.up       = sym ? channel->treeDn.up       : channel->treeDn.down[0];
+      }
+    }
+    topoRanks->ringPrev[c] = channel->ring.prev;
+    topoRanks->ringNext[c] = channel->ring.next;
+  }
+  // Duplicate channels rings/trees
+  struct ncclChannel* channel0 = comm->channels;
+  struct ncclChannel* channel1 = channel0+nChannels;
+  memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); // Channels [0, nChannels) are copied into slots [nChannels, 2*nChannels)
+  return ncclSuccess;
+}
+
+// Link the per-node ring segments of every channel into one ring across all nodes, updating both
+// the gathered ringPrev/ringNext tables and this rank's own channel (plus its duplicate).
+static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
+  const int nChannels = comm->nChannels, nNodes = comm->nNodes, nRanks = comm->nRanks;
+  for (int c=0; c<nChannels; c++) {
+    const int base = c*nRanks; // Offset of channel c inside the [channel][rank] tables
+    struct ncclChannel* channel0 = comm->channels+c;
+    struct ncclChannel* channel1 = channel0+nChannels;
+    for (int n=0; n<nNodes; n++) {
+      const int self = firstRanks[n];
+      const int before = firstRanks[(n-1+nNodes)%nNodes];
+      const int after = firstRanks[(n+1)%nNodes];
+      // Inbound link: node n's receiving rank takes the previous node's sending rank as prev
+      const int recvRank = ringRecv[base+self];
+      const int prevSendRank = ringSend[base+before];
+      ringPrev[base+recvRank] = prevSendRank;
+      if (comm->rank == recvRank) {
+        channel0->ring.prev = channel1->ring.prev = prevSendRank;
+      }
+      // Outbound link: node n's sending rank takes the next node's receiving rank as next
+      const int sendRank = ringSend[base+self], nextRecvRank = ringRecv[base+after];
+      ringNext[base+sendRank] = nextRecvRank;
+      if (comm->rank == sendRank) {
+        channel0->ring.next = channel1->ring.next = nextRecvRank;
+      }
+    }
+    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
+    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) { // indexes[n] = ranks[firstRanks[n]] for each node n
+  for (int node=0; node<nNodes; node++) { const int first = firstRanks[node]; indexes[node] = ranks[first]; }
+  return ncclSuccess;
+}
+
+static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) { // An index of -1 leaves that tree's up link as it is
+  tree0->up = (u0 != -1) ? indexes[u0] : tree0->up;
+  tree1->up = (u1 != -1) ? indexes[u1] : tree1->up;
+  return ncclSuccess;
+}
+
+static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
+  // Append indexes[r0] / indexes[r1] (skipping -1) behind at most one child that is already set (>= 0).
+  int slot = (down[0] >= 0) ? 1 : 0;
+  if (down[slot] >= 0) {
+    WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
+    return ncclInternalError;
+  }
+  const int added[2] = { r0, r1 };
+  for (int i=0; i<2; i++) if (added[i] != -1) down[slot++] = indexes[added[i]];
+  return ncclSuccess;
+}
+
+static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) { // Add children to both trees
+  NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1)); // d0_* are appended to tree0
+  NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1)); // d1_* are appended to tree1
+  return ncclSuccess;
+}
+
+static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) { // Clear (set to -1) the links that involve upRank
+  if (upRank == rank) tree->up = -1;
+  if (upRank == tree->down[0]) tree->down[0] = -1;
+  return ncclSuccess;
+}
+
+static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) { // Add the node-to-node links to this rank's up/down trees
+  const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
+  int* indexesSend, *indexesRecv;
+  NCCLCHECK(ncclCalloc(&indexesSend, nNodes)); // NOTE(review): if NCCLCHECK returns early on failure, the free() calls below are skipped - confirm
+  NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
+
+  // Compute tree depth. Not an exact value but a good approximation in most
+  // cases
+  int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
+
+  int u0, d0_0, d0_1, u1, d1_0, d1_1;
+  NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); // u0/d0_* go to channel0's trees, u1/d1_* to channel1's; -1 entries are skipped by the setters
+  for (int c=0; c<nChannels; c++) {
+     struct ncclChannel* channel0 = comm->channels+c;
+     struct ncclChannel* channel1 = channel0+nChannels;
+     NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); // indexesSend[n]: treeUpSend entry of node n's first rank
+     NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks)); // indexesRecv[n]: treeUpRecv entry of node n's first rank
+     NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
+     NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
+     int root = indexesSend[node]; // Saved now: indexesSend is overwritten with the treeDn values below
+     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
+     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
+     NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); // Same two lookups, now for the down tree
+     NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
+     NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
+     NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
+     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
+     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
+     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c,           channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
+     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
+     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c,           channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
+     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
+     channel0->treeUp.depth = channel1->treeUp.depth = depth; // Only treeUp.depth is written here
+  }
+  free(indexesSend);
+  free(indexesRecv);
+  return ncclSuccess;
+}
+
+// Legacy naming
+NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
+NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
+// New naming
+NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
+NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
+
+int ncclMinNchannels() { // User-requested minimum channel count, clamped to [0, MAXCHANNELS]; a parameter equal to -2 is ignored
+  const auto legacy = ncclParamMinNrings();
+  const auto current = ncclParamMinNchannels();
+  int requested = (current != -2) ? current : (legacy != -2) ? legacy : 0; // New naming wins over the legacy one
+  if (requested > MAXCHANNELS) {
+    WARN("User asked for a minimum of %d channels, limiting to %d\n", requested, MAXCHANNELS);
+    requested = MAXCHANNELS;
+  }
+  if (requested < 0) requested = 0;
+  return requested;
+}
+int ncclMaxNchannels() { // User-requested maximum channel count, clamped to [1, MAXCHANNELS]; a parameter equal to -2 is ignored
+  const auto legacy = ncclParamMaxNrings();
+  const auto current = ncclParamMaxNchannels();
+  int requested = (current != -2) ? current : (legacy != -2) ? legacy : MAXCHANNELS; // New naming wins over the legacy one
+  if (requested > MAXCHANNELS) requested = MAXCHANNELS;
+  if (requested < 1) {
+    WARN("User asked for a maximum of %d channels, setting it to 1\n", requested);
+    requested = 1;
+  }
+  return requested;
+}
+
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) { // Connect rings/trees across nodes, settle the final channel count and build the rings array.
+  // Gather data from all ranks
+  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
+  int nranks = comm->nRanks;
+  int nChannels = comm->nChannels;
+  NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS)); // NOTE(review): if NCCLCHECK returns early on failure, buffers allocated so far are not freed - confirm
+  NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
+  for (int i=0; i<nranks; i++) {
+    for (int c=0; c<nChannels;c++) { // Transpose [rank][channel] into [channel][rank] tables
+      ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
+      ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
+      ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
+      ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
+      treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
+      treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
+      treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
+      treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
+    }
+  }
+
+  // Connect rings and trees. This should also duplicate the channels.
+  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
+  NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
+
+  // Duplicate ringPrev/ringNext for ncclBuildRing
+  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); // Assumes 2*nChannels <= MAXCHANNELS at this point - TODO confirm against callers
+  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
+
+  // Duplication should be complete now
+  nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
+
+  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
+  // We permit combining max, then min, to only use the first channels, then duplicate them.
+  nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
+  int c = nChannels; // Kept outside the loop: its final value becomes the channel count
+  for (const int minNchannels = ncclMinNchannels(); c<minNchannels; c++) { // Minimum evaluated once, so any clamping WARN is emitted once
+    memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
+    memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
+    memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
+  }
+  nChannels = comm->nChannels = c;
+
+  // Create rings array and check all is fine
+  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
+
+  free(ringRecv);
+  free(ringSend);
+  free(ringPrev);
+  free(ringNext);
+  free(treeUpRecv);
+  free(treeUpSend);
+  free(treeDnRecv);
+  free(treeDnSend);
+
+  return ncclSuccess;
+}
@@ -0,0 +1,363 @@
+/*************************************************************************
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "net.h"
+
+// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
+
+// Fixed-capacity list of topology node pointers (at most NCCL_TOPO_MAX_NODES).
+// ncclTopoSetPaths uses two of these as the current and next frontier of its
+// breadth-first search.
+struct ncclTopoNodeList {
+  struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES];
+  int count; // Number of valid entries at the front of list
+};
+
+// Locate, inside `node`, the path entry leading to the node of type `t` whose
+// id equals `id`. On success *path points into node->paths[t]. When no node of
+// that type carries the id, a warning is logged and ncclInternalError returned.
+static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
+  const int count = system->nodes[t].count;
+  int index = 0;
+  while (index < count && system->nodes[t].nodes[index].id != id) index++;
+  if (index == count) {
+    WARN("Could not find node of type %d id %lx\n", t, id);
+    return ncclInternalError;
+  }
+  *path = node->paths[t]+index;
+  return ncclSuccess;
+}
+
+// Compute, for every node reachable from baseNode, the path from that node to
+// baseNode and store it in node->paths[baseNode->type] at baseNode's index.
+//
+// The search is breadth-first, starting at baseNode. A node's stored path is
+// replaced whenever a strictly wider one (min of link widths) is found, and
+// the node is then queued again for the next round. GPUs are never queued, so
+// they are never used as intermediate steps.
+//
+// Returns ncclInternalError if a node cannot be looked up or if a link has no
+// matching reverse link; allocation errors are propagated through NCCLCHECK.
+static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
+  if (baseNode->paths[baseNode->type] == NULL) {
+    NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+  }
+
+  // breadth-first search to set all paths to that node in the system
+  struct ncclTopoNodeList nodeList;
+  struct ncclTopoNodeList nextNodeList;
+  nodeList.count = 1; nodeList.list[0] = baseNode;
+  nextNodeList.count = 0;
+  // The path from baseNode to itself is empty and local.
+  struct ncclTopoLinkList* basePath;
+  NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
+  basePath->count = 0;
+  basePath->width = LOC_WIDTH;
+  basePath->type = LINK_LOC;
+
+  while (nodeList.count) {
+    nextNodeList.count = 0;
+    for (int n=0; n<nodeList.count; n++) {
+      struct ncclTopoNode* node = nodeList.list[n];
+      struct ncclTopoLinkList* path;
+      NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
+      for (int l=0; l<node->nlinks; l++) {
+        struct ncclTopoLink* link = node->links+l;
+        struct ncclTopoNode* remNode = link->remNode;
+        if (remNode->paths[baseNode->type] == NULL) {
+          NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+        }
+        struct ncclTopoLinkList* remPath;
+        NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
+        int width = std::min(path->width, link->width);
+        if (remPath->width < width) {
+          // Find reverse link. Track it in a local so that a stale list[0]
+          // left by a previous, narrower path cannot hide a failed lookup.
+          struct ncclTopoLink* revLink = NULL;
+          for (int r=0; r<remNode->nlinks; r++) {
+            if (remNode->links[r].remNode == node) {
+              revLink = remNode->links+r;
+              break;
+            }
+          }
+          if (revLink == NULL) {
+            // Node ids are 64-bit : print them with %lx like the rest of the file.
+            WARN("Failed to find reverse path from remNode id %lx type %d nlinks %d to node id %lx type %d",
+                 remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+            return ncclInternalError;
+          }
+          remPath->list[0] = revLink;
+          // Copy the rest of the path
+          for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
+          remPath->count = path->count + 1;
+          remPath->width = width;
+
+          // Consider the path is QPI when going through the CPU
+          // Also don't consider LINK_NET as we only care about the NIC->GPU path.
+          int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+          remPath->type = std::max(path->type, type);
+
+          // Add to the list for the next iteration if not already in the list
+          // Disallow GPUs as intermediate steps for now
+          if (remNode->type != GPU) {
+            int i;
+            for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
+            if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
+          }
+        }
+      }
+    }
+    memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
+  }
+  return ncclSuccess;
+}
+
+// Log the paths stored in `node` through INFO(NCCL_GRAPH, ...).
+// With ENABLE_TRACE, one line is printed per destination, listing every link
+// of the path followed by its width. Otherwise a single summary line is
+// printed, with "(count/width/type)" for each destination.
+// All formatting goes through snprintf so that a system with many nodes or
+// long paths truncates the output instead of overflowing `line`.
+static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
+  char line[1024];
+#ifdef ENABLE_TRACE
+  INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
+#else
+  snprintf(line, sizeof(line), "%s/%lX :", topoNodeTypeStr[node->type], node->id);
+  int offset = strlen(line);
+#endif
+  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+    if (node->paths[t] == NULL) continue;
+    for (int n = 0; n<system->nodes[t].count; n++) {
+#ifdef ENABLE_TRACE
+      line[0] = 0;
+      int offset = 0;
+      for (int i=0; i<node->paths[t][n].count; i++) {
+        struct ncclTopoLink* link = node->paths[t][n].list[i];
+        struct ncclTopoNode* remNode = link->remNode;
+        // offset is always < sizeof(line) since snprintf NUL-terminates within bounds
+        snprintf(line+offset, sizeof(line)-offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
+        offset = strlen(line);
+      }
+      INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+#else
+      // offset is always < sizeof(line) since snprintf NUL-terminates within bounds
+      snprintf(line+offset, sizeof(line)-offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+      offset = strlen(line);
+#endif
+    }
+  }
+#ifndef ENABLE_TRACE
+  INFO(NCCL_GRAPH, "%s", line);
+#endif
+}
+
+// Print the paths of every GPU node, then of every NET node.
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
+  const int printedTypes[2] = { GPU, NET };
+  for (int k=0; k<2; k++) {
+    const int type = printedTypes[k];
+    for (int idx=0; idx<system->nodes[type].count; idx++) {
+      printNodePaths(system, system->nodes[type].nodes+idx);
+    }
+  }
+  return ncclSuccess;
+}
+
+// Select the CPU with the fewest hops from GPU index `gpu` and return its
+// index in *retCpu. Fails with ncclInternalError when the system has no CPU.
+static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
+  struct ncclTopoLinkList* cpuPaths = system->nodes[GPU].nodes[gpu].paths[CPU];
+  int best = -1;
+  int bestHops = 0; // 0 : no hop count recorded yet
+  for (int cpu=0; cpu<system->nodes[CPU].count; cpu++) {
+    const int hops = cpuPaths[cpu].count;
+    // Keep the current choice unless nothing was recorded or this one is strictly closer
+    if (bestHops != 0 && hops >= bestHops) continue;
+    best = cpu;
+    bestHops = hops;
+  }
+  if (best == -1) {
+    WARN("Error : could not find CPU close to GPU %d", gpu);
+    return ncclInternalError;
+  }
+  *retCpu = best;
+  return ncclSuccess;
+}
+
+// Rewrite the path from node (t1,i1) to node (t2,i2) so that it goes through
+// CPU `c` : the source->CPU path followed by the CPU->destination path.
+// The resulting path is typed LINK_QPI and its width is the min of both halves.
+static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
+  struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c;
+  struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
+  struct ncclTopoLinkList* toCpu = srcNode->paths[CPU]+c;
+  struct ncclTopoLinkList* fromCpu = cpuNode->paths[t2]+i2;
+  struct ncclTopoLinkList* combined = srcNode->paths[t2]+i2;
+
+  int len = 0;
+  // First half : source -> CPU
+  for (int k=0; k<toCpu->count; k++) combined->list[len++] = toCpu->list[k];
+  // Second half : CPU -> destination
+  for (int k=0; k<fromCpu->count; k++) combined->list[len++] = fromCpu->list[k];
+
+  combined->count = len;
+  combined->type = LINK_QPI;
+  combined->width = std::min(toCpu->width, fromCpu->width);
+  return ncclSuccess;
+}
+
+// Free, on every node of the system, the array of paths leading to nodes of
+// type `nodeType`, and reset the pointer to NULL.
+static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
+  for (int type=0; type<NCCL_TOPO_NODE_TYPES; type++) {
+    struct ncclTopoNode* nodes = system->nodes[type].nodes;
+    for (int idx=0; idx<system->nodes[type].count; idx++) {
+      free(nodes[idx].paths[nodeType]);
+      nodes[idx].paths[nodeType] = NULL;
+    }
+  }
+}
+
+// Recompute all paths of the system : first to every CPU, then to every GPU,
+// then to every NIC.
+// When peerInfos is not NULL, paths are further adjusted :
+//  - GPU->GPU paths where the P2P transport reports it cannot connect are
+//    rerouted through a CPU if SHM can connect, or emptied (count = 0) otherwise;
+//  - NIC<->GPU paths are rerouted through a CPU when the GPU's gdrSupport mask
+//    does not have the bit of that NIC set.
+// Any error from the helpers or transport callbacks is returned via NCCLCHECK.
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
+  // Precompute paths between GPUs/NICs.
+
+  // Remove everything in case we're re-computing
+  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+
+  // Set direct paths from/to CPUs. We need them in many cases.
+  for (int c=0; c<system->nodes[CPU].count; c++) {
+    NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
+  }
+
+  // Set direct paths from/to GPUs.
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    // Compute paths to GPU g
+    NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
+
+    if (peerInfos == NULL) continue;
+    // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
+    struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
+    for (int p=0; p<system->nodes[GPU].count; p++) {
+      if (p == g) continue;
+      struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
+      int p2p;
+      NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+      if (p2p == 0) {
+        int shm;
+        NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+        if (shm == 1) {
+          // We cannot use GPU Direct, so we need all traffic to go through a CPU
+          int cpu;
+          NCCLCHECK(getLocalCpu(system, g, &cpu));
+          NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+        } else {
+          // We cannot communicate with that peer.
+          system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
+        }
+      }
+    }
+  }
+
+  // Set direct paths from/to NICs.
+  for (int n=0; n<system->nodes[NET].count; n++) {
+    struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
+    NCCLCHECK(ncclTopoSetPaths(netNode, system));
+
+    if (peerInfos == NULL) continue;
+    for (int g=0; g<system->nodes[GPU].count; g++) {
+      // Bit n of gdrSupport is tested for NIC index n
+      if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
+        // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
+        // to go through a CPU
+        int localCpu;
+        NCCLCHECK(getLocalCpu(system, g, &localCpu));
+        NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
+        NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
+      }
+    }
+  }
+
+  return ncclSuccess;
+}
+
+// Remove from the system every GPU which is not in the same domain as the GPU
+// whose rank is comm->rank.
+// Each GPU g gets a domain number : the smallest domain among itself and the
+// lower-indexed GPUs it has a non-empty path to. GPUs whose domain differs
+// from ours are removed from the GPU array, together with every link pointing
+// at them; links pointing at GPUs stored after a removed one are shifted down.
+// comm->localRanks is then set to the number of remaining GPUs, and if that
+// equals comm->nRanks the NET nodes and the paths to them are dropped as well.
+// Returns ncclInternalError if a recorded GPU id can no longer be found.
+ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
+  int *domains;
+  int64_t *ids;
+  NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
+  // Do not use NCCLCHECK here : an early return would leak `domains`.
+  ncclResult_t allocRes = ncclCalloc(&ids, system->nodes[GPU].count);
+  if (allocRes != ncclSuccess) {
+    free(domains);
+    return allocRes;
+  }
+  int myDomain = 0;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    domains[g] = g;
+    ids[g] = gpu->id;
+    for (int p=0; p<g; p++) {
+      if (gpu->paths[GPU][p].count > 0) {
+        domains[g] = std::min(domains[g], domains[p]);
+      }
+    }
+    if (gpu->rank == comm->rank) myDomain = domains[g];
+  }
+
+  int ngpus = system->nodes[GPU].count;
+  for (int i=0; i<ngpus; i++) {
+    if (domains[i] == myDomain) continue;
+    // GPUs move as others get removed, so look the GPU up again by id
+    struct ncclTopoNode* gpu = NULL;
+    int g;
+    for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
+      gpu = system->nodes[GPU].nodes+g;
+      if (gpu->id == ids[i]) break; else gpu=NULL;
+    }
+    if (gpu == NULL) {
+      WARN("Could not find id %lx", ids[i]);
+      free(domains);
+      free(ids);
+      return ncclInternalError;
+    }
+
+    // Remove GPUs I can't access (even indirectly) from my view of the node
+    for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+      for (int n=0; n<system->nodes[t].count; n++) {
+        struct ncclTopoNode* node = system->nodes[t].nodes+n;
+        if (node == gpu) continue;
+        for (int l=0; l<node->nlinks; l++) {
+          // Drop every link to the removed GPU, compacting the link array
+          while (l<node->nlinks && node->links[l].remNode == gpu) {
+            if (l<node->nlinks-1)
+              memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+            node->nlinks--;
+          }
+          // GPUs stored after the removed one are about to move down by one slot
+          if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
+            node->links[l].remNode--;
+          }
+        }
+      }
+    }
+    if (g != system->nodes[GPU].count-1)
+      memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
+    system->nodes[GPU].count--;
+  }
+
+  comm->localRanks = system->nodes[GPU].count;
+  if (system->nodes[GPU].count == comm->nRanks) {
+    // Trim network
+    ncclTopoRemovePathType(system, NET);
+    system->nodes[NET].count = 0;
+  }
+  free(domains);
+  free(ids);
+  return ncclSuccess;
+}
+
+// Lower *speed to what this GPU can sustain : the larger of the sum of its
+// NVLink link widths and the width of its PCI link (the last one found).
+// *speed is left untouched when it is already lower.
+static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
+  int nvlSpeed = 0;
+  int pciSpeed = 0;
+  for (int l=0; l<node->nlinks; l++) {
+    if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
+    if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
+  }
+  *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
+  return ncclSuccess;
+}
+
+// Compute system->maxSpeed, an upper bound used to speed up the search.
+// Start from LOC_WIDTH, cap it by each GPU's own speed, then, if there are
+// NICs, by NET_WIDTH times the number of NICs sharing the best NIC->GPU width.
+ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
+  system->maxSpeed = LOC_WIDTH;
+
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
+  }
+  if (system->nodes[NET].count == 0) return ncclSuccess;
+
+  // Try to assign one NIC per GPU
+  int bestWidth = 0;
+  int bestWidthCount = 0;
+  for (int n=0; n<system->nodes[NET].count; n++) {
+    struct ncclTopoLinkList* gpuPaths = system->nodes[NET].nodes[n].paths[GPU];
+    // Widest path from this NIC to any GPU
+    int widest = 0;
+    for (int g=0; g<system->nodes[GPU].count; g++) {
+      widest = std::max(widest, gpuPaths[g].width);
+    }
+    if (widest > bestWidth) {
+      bestWidth = widest;
+      bestWidthCount = 1;
+    } else if (widest == bestWidth) {
+      bestWidthCount++;
+    }
+  }
+  system->maxSpeed = std::min(system->maxSpeed, bestWidthCount*NET_WIDTH);
+  return ncclSuccess;
+}
+
+// Release every path array of the system, then the system structure itself.
+void ncclTopoFree(struct ncclTopoSystem* system) {
+  int type = 0;
+  while (type < NCCL_TOPO_NODE_TYPES) {
+    ncclTopoRemovePathType(system, type);
+    type++;
+  }
+  free(system);
+}
@@ -0,0 +1,57 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+
+#define MAXWIDTH 20
+#define PREFIXLEN 15
+#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
+// Log one line made of `prefix` followed by up to MAXWIDTH values, each
+// printed as " %3d". The line is capped at STRLENGTH characters : a prefix or
+// value that does not fit is dropped rather than written out of bounds.
+// Values are appended at their real printed width, so numbers wider than
+// three digits no longer get overwritten by the next one.
+void dumpLine(int* values, int nranks, const char* prefix) {
+  char line[STRLENGTH+1];
+  // snprintf always NUL-terminates within the given size
+  int offset = snprintf(line, sizeof(line), "%s", prefix);
+  if (offset < 0) { line[0] = '\0'; offset = 0; }
+  if (offset > STRLENGTH) offset = STRLENGTH; // Prefix was truncated and fills the line
+  for (int i=0; i<nranks && i<MAXWIDTH; i++) {
+    int remaining = STRLENGTH+1-offset;
+    int written = snprintf(line+offset, remaining, " %3d", values[i]);
+    if (written < 0 || written >= remaining) {
+      // No room left (or formatting error) : drop the partial value and stop
+      line[offset] = '\0';
+      break;
+    }
+    offset += written;
+  }
+  INFO(NCCL_INIT,"%s", line);
+}
+
+// Build the rank ordering of each ring, as seen from `rank`, and validate it.
+//  nrings : number of rings to build
+//  rings  : output, nrings*nranks entries; ring r starts at rings[r*nranks]
+//           with `rank` and follows next[] from there
+//  next   : input, next[r*nranks+i] is the rank following rank i in ring r
+//  prev   : unused by this function
+// Returns ncclInternalError if a ring references a rank outside [0, nranks),
+// does not loop back to `rank` after nranks steps, or misses a rank.
+// Rank 0 also logs each ring through dumpLine.
+ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+  for (int r=0; r<nrings; r++) {
+    char prefix[30];
+
+    int current = rank;
+    for (int i=0; i<nranks; i++) {
+      rings[r*nranks+i] = current;
+      current = next[r*nranks+current];
+      // current is used as an index on the next step : reject bad values now
+      if (current < 0 || current >= nranks) {
+        WARN("Error : ring %d contains invalid rank %d", r, current);
+        return ncclInternalError;
+      }
+    }
+    snprintf(prefix, sizeof(prefix), "Channel %02d/%02d : ", r, nrings);
+    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
+    if (current != rank) {
+      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+      return ncclInternalError;
+    }
+    // Check that all ranks are there
+    for (int i=0; i<nranks; i++) {
+      int found = 0;
+      for (int j=0; j<nranks; j++) {
+        if (rings[r*nranks+j] == i) {
+          found = 1;
+          break;
+        }
+      }
+      if (found == 0) {
+        WARN("Error : ring %d does not contain rank %d", r, i);
+        return ncclInternalError;
+      }
+    }
+  }
+  return ncclSuccess;
+}
@@ -0,0 +1,7 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+// Fill `rings` (nrings*nranks entries) with the rank order of each ring, starting
+// from `rank` and following `next`; returns an error if a ring is not a valid loop.
+ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
@@ -0,0 +1,594 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+
+// Reserve (width > 0) or release (width < 0) `width` units of bandwidth on
+// every link of `path`, and update graph->type / graph->nHops accordingly.
+//
+// On return, *node tells the caller whether the path was taken :
+//  - empty path (count == 0) : nothing is done and *node is left untouched;
+//  - path type above graph->type, or a link without enough width left :
+//    *node is NULL and neither the link widths nor graph->nHops are changed;
+//  - otherwise *node is the node at the end of the path.
+// When releasing, graph->type is restored to typeSave.
+// Always returns ncclSuccess.
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
+  if (path->count == 0) return ncclSuccess;
+
+  *node = NULL;
+  if (width > 0) {
+    if (path->type > graph->type) return ncclSuccess;
+    graph->type = std::max(graph->type, path->type);
+    graph->nHops += path->count;
+  } else {
+    graph->type = typeSave;
+    graph->nHops -= path->count;
+  }
+
+  for (int i=0; i<path->count; i++) {
+    if (path->list[i]->width < width) {
+      // Can't follow this path, rewind and exit
+      for (int j=0; j<i; j++) path->list[j]->width += width;
+      // Callers do not issue the matching release when *node is NULL, so the
+      // hops counted above must be given back here.
+      if (width > 0) graph->nHops -= path->count;
+      return ncclSuccess;
+    }
+    path->list[i]->width -= width;
+  }
+  *node = path->list[path->count-1]->remNode;
+  return ncclSuccess;
+}
+
+// Width of the GPU's PCI connection : min of the GPU->PCI link width and the
+// width of the link coming back from that PCI node to the GPU.
+// Returns -1 when the GPU has no PCI link with a matching reverse link.
+static int gpuPciWidth(struct ncclTopoNode* gpu) {
+  for (int out=0; out<gpu->nlinks; out++) {
+    struct ncclTopoLink* down = gpu->links+out;
+    if (down->type == LINK_PCI) {
+      struct ncclTopoNode* peer = down->remNode;
+      for (int back=0; back<peer->nlinks; back++) {
+        struct ncclTopoLink* up = peer->links+back;
+        if (up->remNode == gpu) return std::min(down->width, up->width);
+      }
+    }
+  }
+  return -1;
+}
+
+/* Choose the order in which we try next GPUs. This is critical for the search
+   to quickly converge to the best solution even if it eventually times out.
+   One ncclGpuScore is filled per candidate GPU and the array is sorted with
+   cmpScore; fields are listed from least to most important sort key. */
+struct ncclGpuScore {
+  int g;             // Retain the index
+  int startIndex;    // Least important
+  int intraNhops;    // Hop count of the GPU->GPU path (lower sorts first)
+  int intraWidth;    // Width of the GPU->GPU path (higher sorts first)
+  int interNhops;    // Hop count of the NIC->GPU path (lower sorts first)
+  int interPciWidth; // gpuPciWidth() of the candidate (higher sorts first)
+  int interWidth;    // Most important
+};
+
+// qsort comparator for struct ncclGpuScore. Keys, by decreasing priority :
+// interWidth (descending), interPciWidth (descending), interNhops (ascending),
+// intraWidth (descending), intraNhops (ascending), startIndex (ascending).
+static int cmpScore(const void * g1, const void * g2) {
+  struct ncclGpuScore *a = (struct ncclGpuScore*)g1;
+  struct ncclGpuScore *b = (struct ncclGpuScore*)g2;
+  // Each key is only evaluated when all the more important ones are equal
+  int diff = b->interWidth - a->interWidth;
+  if (diff == 0) diff = b->interPciWidth - a->interPciWidth;
+  if (diff == 0) diff = a->interNhops - b->interNhops;
+  if (diff == 0) diff = b->intraWidth - a->intraWidth;
+  if (diff == 0) diff = a->intraNhops - b->intraNhops;
+  if (diff == 0) diff = a->startIndex - b->startIndex;
+  return diff;
+}
+
+// Return 0 when all `count` scores share the same intraWidth and intraNhops
+// as the first one, 1 as soon as one differs.
+static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
+  const int refWidth = scores[0].intraWidth;
+  const int refNhops = scores[0].intraNhops;
+  int idx = 1;
+  while (idx < count) {
+    const bool same = scores[idx].intraWidth == refWidth && scores[idx].intraNhops == refNhops;
+    if (!same) return 1;
+    idx++;
+  }
+  return 0;
+}
+
+// Find the first NET node whose `used` mask has a bit in common with `flag`
+// and return its array of paths to GPUs in *netPaths.
+// Returns ncclInternalError when no NET node matches.
+static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
+  struct ncclTopoNode* nets = system->nodes[NET].nodes;
+  const int nnets = system->nodes[NET].count;
+  int n = 0;
+  while (n < nnets && (nets[n].used & flag) == 0) n++;
+  if (n == nnets) return ncclInternalError;
+  *netPaths = nets[n].paths[GPU];
+  return ncclSuccess;
+}
+
+// List, in the order they should be tried, the GPUs which can follow `gpu` in
+// the channel being built.
+//  next     : output, GPU indexes sorted by cmpScore
+//  countPtr : output, number of entries written to next
+//  sortNet  : 0 to ignore the network; non-zero to also score each candidate
+//             on its path from the NIC used by this channel; -1 additionally
+//             reverses the order when all candidates have the same intra-node score
+// Candidates are the GPUs with a non-empty path from `gpu` which are not
+// already used by the current channel.
+ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
+  const uint64_t flag = 1ULL<<(graph->nChannels);
+  const int ngpus = system->nodes[GPU].count;
+  struct ncclTopoLinkList* gpuPaths = gpu->paths[GPU];
+  struct ncclTopoLinkList* netPaths = NULL;
+  if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths));
+
+  struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
+  memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));
+  const int start = gpu-system->nodes[GPU].nodes;
+  int count = 0;
+  // Walk the other GPUs in circular order, starting right after `gpu`
+  for (int i=1; i<ngpus; i++) {
+    const int g = (start+i)%ngpus;
+    const bool reachable = gpuPaths[g].count != 0;
+    if (!reachable) continue; // There is no path to that GPU
+    if (system->nodes[GPU].nodes[g].used & flag) continue;
+    struct ncclGpuScore* entry = scores+count;
+    entry->g = g;
+    entry->startIndex = i;
+    entry->intraNhops = gpuPaths[g].count;
+    entry->intraWidth = gpuPaths[g].width;
+    if (netPaths) {
+      entry->interNhops = netPaths[g].count;
+      entry->interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
+      entry->interWidth = netPaths[g].width;
+    }
+    count++;
+  }
+
+  // Sort GPUs
+  qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore);
+
+  // Check if all have the same intra-node score in which case we go reverse for sortNet = -1
+  const bool reverse = (sortNet == -1) && (cmpIntraScores(scores, count) == 0);
+  for (int i=0; i<count; i++) {
+    next[i] = scores[reverse ? count-1-i : i].g;
+  }
+  *countPtr = count;
+  return ncclSuccess;
+}
+
+// Forward declaration : ncclTopoSearchRecGpu calls back into ncclTopoSearchRec.
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
+
+#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should contain the whole search within a second or so.
+
+// Values for the forcedOrder argument of the search functions (0 means normal search)
+#define FORCED_ORDER_PCI 1
+#define FORCED_ORDER_REPLAY 2
+
+// Find which GPU the previous channel used at position step+1 and return its
+// index in *g. *g is -1 and ncclInternalError is returned when there is no
+// previous channel or when no GPU has the recorded rank.
+ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) {
+  *g = -1;
+  if (graph->nChannels == 0) return ncclInternalError;
+  const int ngpus = system->nodes[GPU].count;
+  const int wantedRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
+  int idx = 0;
+  while (idx < ngpus && system->nodes[GPU].nodes[idx].rank != wantedRank) idx++;
+  if (idx == ngpus) return ncclInternalError;
+  *g = idx;
+  return ncclSuccess;
+}
+
+// Forward declaration : ncclTopoSearchTryGpu and ncclTopoSearchRecGpu call each other.
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
+
+// Try to continue the current channel with GPU index `g`.
+// When `paths` is not NULL, bandwidth `speed` is first reserved along paths[g];
+// if that fails the GPU is skipped. Otherwise the GPU is flagged as used by the
+// current channel for the duration of the recursive search, then everything is
+// restored (flag cleared, bandwidth released).
+ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
+  const int savedType = graph->type;
+  const uint64_t channelBit = 1ULL<<(graph->nChannels);
+  struct ncclTopoNode* target = system->nodes[GPU].nodes+g;
+  if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &target, speed, savedType));
+  if (target == NULL) return ncclSuccess; // Path could not be followed
+
+  target->used ^= channelBit;
+  NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, target, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
+  target->used ^= channelBit;
+  if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &target, -speed, savedType));
+  return ncclSuccess;
+}
+
+// Decide whether `graph` should replace `refGraph` as the best solution so far.
+// *copy is set to 1 when it should; it is never reset to 0 here.
+ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
+  // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels
+  // since it would likely impact the rings algorithms too.
+  const bool raisingIntra = graph->speedIntra > graph->speedInter;
+  if (raisingIntra && graph->nChannels < refGraph->nChannels) return ncclSuccess;
+
+  // 1. Try to get better bandwidth
+  const int bandwidth = graph->nChannels*graph->speedIntra;
+  const int refBandwidth = refGraph->nChannels*refGraph->speedIntra;
+  if (bandwidth != refBandwidth) {
+    if (bandwidth > refBandwidth) *copy = 1;
+    return ncclSuccess;
+  }
+
+  // 2. Give an advantage when all channels are the same
+  if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) {
+    *copy = 1;
+  } else if (graph->nHops < refGraph->nHops) {
+    // 3. Less hops
+    *copy = 1;
+  }
+  return ncclSuccess;
+}
+
+// One step of the recursive channel search, with `gpu` at position `step` of
+// the channel being built (graph->nChannels is the index of that channel).
+//  backToNet       : step at which the channel must go back to a NIC (-1 : never / already done)
+//  backToFirstRank : step at which the channel must loop back to its first GPU (-1 : never / already done)
+//  forcedOrder     : 0, FORCED_ORDER_PCI or FORCED_ORDER_REPLAY
+//  time            : remaining budget, decremented on every call; the search
+//                    stops as soon as it is <= 0
+// When step reaches the number of GPUs the channel is complete : the graph is
+// compared to saveGraph (and copied if better), then the search continues with
+// an additional channel.
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
+  if ((*time) <= 0) return ncclSuccess;
+  (*time)--;
+
+  int ngpus = system->nodes[GPU].count;
+  if (step == ngpus) {
+    // Determine whether we found a better solution or not
+    int copy = 0;
+    int sameChannels = graph->sameChannels;
+    if (graph->nChannels > 0) {
+      // Compare this channel's GPU order with the previous channel's
+      int* intra = graph->intra+graph->nChannels*ngpus;
+      for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0;
+    }
+    graph->nChannels++;
+    NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
+    if (copy) {
+      memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
+      // Reached the target speed : a negative budget ends the whole search
+      if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1;
+    }
+    if (graph->nChannels < MAXCHANNELS/2) {
+      NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time));
+    }
+    // Backtrack
+    graph->nChannels--;
+    graph->sameChannels = sameChannels;
+    return ncclSuccess;
+  }
+  graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
+  if (step == backToNet) {
+    // first get back to NIC
+    if (system->nodes[NET].count) {
+      // Only consider NICs with the widest path from this GPU; unless crossNic
+      // is 1, only the NIC the channel came in from is eligible.
+      int maxWidth = 0;
+      struct ncclTopoLinkList* paths = gpu->paths[NET];
+      for (int n=0; n<system->nodes[NET].count; n++) {
+        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
+        maxWidth = std::max(paths[n].width, maxWidth);
+      }
+      for (int n=0; n<system->nodes[NET].count; n++) {
+        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
+        if (paths[n].width == maxWidth) {
+          struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+          int typeSave = graph->type;
+          NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
+          if (net) {
+            graph->inter[graph->nChannels*2+1] = net->id;
+            // Same GPU, same step, but with the "back to NIC" requirement cleared
+            NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time));
+            NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave));
+          }
+        }
+      }
+    }
+  } else if (step < system->nodes[GPU].count-1) {
+    // Go to next GPU
+    struct ncclTopoLinkList* paths = gpu->paths[GPU];
+    int next[NCCL_TOPO_MAX_NODES];
+    int count;
+    if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
+      next[0] = step+1;
+      count = 1;
+    } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
+      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
+      count = 1;
+    } else { // Normal search
+      NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
+    }
+    for (int i=0; i<count; i++) {
+      int g = next[i];
+      // graph->nvlink stays set only while every hop is of type LINK_NVL or below
+      int nvlink = graph->nvlink;
+      graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
+      int speed = graph->speedIntra;
+      if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
+      graph->nvlink = nvlink;
+    }
+  } else if (step == backToFirstRank) {
+    // Find first GPU and loop back to it
+    int g;
+    int rank = graph->intra[graph->nChannels*ngpus];
+    for (g=0; g<ngpus; g++) {
+      if (system->nodes[GPU].nodes[g].rank == rank) break;
+    }
+    if (g == ngpus) {
+      WARN("Could not find GPU with rank %d\n", rank);
+      return ncclInternalError;
+    }
+    struct ncclTopoLinkList* paths = gpu->paths[GPU];
+    struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
+    int typeSave = graph->type;
+    NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
+    if (firstGpu) {
+      NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
+      NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave));
+    }
+  } else {
+    // Next path
+    // Calling with step == ngpus takes the "channel complete" branch above
+    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time));
+  }
+  return ncclSuccess;
+}
+
+// Start the search of a new channel from the network : for every NIC whose
+// `used` mask is 0, record it as the channel's entry NIC, flag all NET nodes
+// with the same rank as used by this channel, then try the possible first GPUs
+// in turn :
+//  1. GPU 0 with the PCI order forced,
+//  2. if there is a previous channel, its first GPU with the replay order forced,
+//  3. the GPUs with the widest (then shortest) path from the NIC, in normal search mode.
+// The flag is cleared again before moving on to the next NIC.
+ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
+  const uint64_t flag = 1ULL<<(graph->nChannels);
+  const int speed = graph->speedInter;
+  for (int n=0; n<system->nodes[NET].count; n++) {
+    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+    struct ncclTopoNode* gpu;
+    if (net->used == 0) {
+      graph->inter[graph->nChannels*2] = net->id;
+      // Mark every NET node sharing this rank as used by the current channel
+      for (int i=0; i<system->nodes[NET].count; i++) {
+        if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+      }
+      struct ncclTopoLinkList* paths = net->paths[GPU];
+
+      // First try the PCI order to set a reference
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed));
+      // Then try to replay the last channel
+      if (graph->nChannels > 0) {
+        int g;
+        // step -1 : look up the GPU at position 0 of the previous channel
+        NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed));
+      }
+
+      // Then try the most local GPUs
+      int maxWidth = 0, minHops = 0xfffffff;
+      for (int g=0; g<system->nodes[GPU].count; g++) {
+        if (paths[g].width > maxWidth) {
+          maxWidth = paths[g].width;
+          minHops = paths[g].count;
+        } else if (paths[g].width == maxWidth && paths[g].count < minHops) {
+          minHops = paths[g].count;
+        }
+      }
+      if (maxWidth >= speed) {
+        // In the first loop, avoid using GPUs in both directions between channels (one channel
+        // sending from that GPU and one channel receiving to that GPU), since that usually leads
+        // to lower BW.
+        for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
+          for (int g=0; g<system->nodes[GPU].count; g++) {
+            if (paths[g].width == maxWidth && paths[g].count == minHops) {
+              gpu = system->nodes[GPU].nodes+g;
+              // A GPU counts as "used" here when gpuPciWidth() is not positive
+              int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
+              if (tryGpuBidir == gpuUsed) {
+                NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed));
+              }
+            }
+          }
+        }
+      }
+      // Release the NET nodes flagged above
+      for (int i=0; i<system->nodes[NET].count; i++) {
+        if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+/* Search Patterns
+ *
+ *     Intra-node
+ * Ring            : GPU a -> GPU b -> .. -> GPU x -> GPU a
+ * (=Split Tree Loop)
+ * Tree            : GPU a -> GPU b -> .. -> GPU x
+ * (=Split Tree)
+ *
+ *     Inter-node
+ * Ring            : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
+ * Tree            : NET n -> GPU a -> GPU b -> .. -> GPU x
+ *                              `--> NET n (or m if crossNic)
+ * Split Tree      : NET n -> GPU a -> GPU b -> .. -> GPU x
+ *                                       `--> NET n (or m if crossNic)
+ * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
+ *                                       `--> NET n (or m if crossNic)
+ */
+// Translate a search pattern into the two steps driving the search :
+//  *backToNet       : step at which to return to the NIC (-1 when there is no NIC)
+//  *backToFirstRank : step at which to loop back to the first GPU (-1 : never)
+ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
+  const int lastStep = system->nodes[GPU].count-1;
+  const bool isRing = pattern == NCCL_TOPO_PATTERN_RING;
+  const bool isSplitTreeLoop = pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP;
+
+  if (system->nodes[NET].count == 0) {
+    // Intra-node only : never go to the network, loop back for ring-like patterns
+    *backToNet = -1;
+    *backToFirstRank = (isRing || isSplitTreeLoop) ? lastStep : -1;
+    return ncclSuccess;
+  }
+
+  *backToNet = isRing ? lastStep : (pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1);
+  *backToFirstRank = isSplitTreeLoop ? lastStep : -1;
+  return ncclSuccess;
+}
+
+/* Run one search for graph->pattern.
+ * The step parameters are derived from the pattern, then the search starts from
+ * the NET nodes when the system has any and from GPU 0 otherwise. Results are
+ * accumulated in saveGraph by the callees; *time is the remaining search budget.
+ * Returns the first error reported by any sub-search, ncclSuccess otherwise.
+ */
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
+  int backToNet, backToFirstRank;
+  NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
+  if (system->nodes[NET].count) {
+    // Start from NET. The result must be checked like every other sub-search,
+    // otherwise a failure in the NET-rooted search is silently dropped.
+    NCCLCHECK(ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time));
+  } else {
+    // Start from GPU 0, trying three orderings in turn:
+    // 1. forced PCI order
+    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
+    // 2. replay mode, only once at least one channel has been found
+    if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
+    // 3. no forced order
+    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
+  }
+  return ncclSuccess;
+}
+
+/* Parse user defined rings. Format is like :
+ * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
+ * Rings are separated by '|'; any non-digit character separates two numbers.
+ * Rings with a non-matching number of ranks are ignored so we can provide
+ * rings for multiple cases. Rings containing an out-of-range or duplicated
+ * entry are ignored as well.
+ *
+ * str          : NUL-terminated description.
+ * nChannelsRet : receives the number of rings accepted.
+ * ngpus        : number of entries every accepted ring must have.
+ * channels     : output, ngpus entries per accepted ring, stored back to back.
+ *                The caller must provide enough room; no capacity is checked here.
+ * Always returns ncclSuccess. Parsing stops early once MAX_ENV_RANKS numbers
+ * have been read for a single ring.
+ */
+#define MAX_ENV_RANKS 512
+static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
+  int ranks[MAX_ENV_RANKS];
+  int nChannels = 0;
+  int rank = 0;   // Number of complete numbers read for the current ring
+  int offset = 0;
+  int status = 0; // 0 : between numbers, 1 : inside number
+  do {
+    int digit = str[offset] - '0';
+    if (digit >= 0 && digit <= 9) {
+      if (status == 0) {
+        ranks[rank] = digit;
+        status = 1;
+      } else if (ranks[rank] < MAX_ENV_RANKS) {
+        // Stop accumulating once the value can no longer be a valid entry : a
+        // ring holds fewer than MAX_ENV_RANKS numbers, so anything at or above
+        // that bound is rejected below anyway. This keeps an arbitrarily long
+        // run of digits from overflowing the int.
+        ranks[rank] = ranks[rank]*10+digit;
+      }
+    } else {
+      if (status == 1) {
+        rank++;
+        if (rank == MAX_ENV_RANKS) goto end;
+      }
+      status = 0;
+      if (str[offset] == '|' || str[offset] == '\0') {
+        // Ignore if ngpus doesn't match
+        if (rank != ngpus) goto newchannel;
+
+        for (int r=0; r<ngpus; r++) {
+          int gpu = ranks[r];
+          // Ignore if ranks are out of bounds
+          if (gpu < 0 || gpu >= ngpus) goto newchannel;
+          // Ignore if ranks are duplicate
+          for (int i=0; i<r; i++)
+            if (ranks[i] == gpu) goto newchannel;
+
+          channels[nChannels*ngpus+r] = gpu;
+        }
+        // Only count the ring once all its entries have been validated; a
+        // rejected ring leaves nChannels unchanged so its slot is reused.
+        nChannels++;
+newchannel:
+        rank = 0;
+      }
+    }
+  } while (str[offset++] != 0);
+end:
+  *nChannelsRet = nChannels;
+  return ncclSuccess;
+}
+
+/* Compute the channels (graph->intra / graph->inter), channel count and speeds
+ * for graph->pattern on the given system.
+ *
+ * If the NCCL_GRAPH environment variable yields at least one valid ring, that
+ * description is used as is. Otherwise the search is run repeatedly, relaxing
+ * the constraints step by step (simpler pattern, wider link type, crossNic,
+ * lower speed), then trying to raise the intra-node speed alone. If nothing is
+ * found, a single channel listing the GPUs in system order is returned.
+ *
+ * Returns ncclSuccess, or the error reported by the search.
+ */
+ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  int ngpus = system->nodes[GPU].count;
+  // crossNic is only meaningful with more than one NET node
+  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
+  graph->speedIntra = graph->speedInter = 0;
+  if (graph->crossNic == 2) graph->crossNic = 0;
+  graph->nvlink = 0;
+  graph->type = LINK_LOC;
+  graph->nChannels = 0;
+  graph->sameChannels = 1;
+
+  char* str = getenv("NCCL_GRAPH");
+  if (str) {
+    NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
+    for (int i=0; i<graph->nChannels*ngpus; i++) {
+      // Translate gpu numbers into ranks
+      graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
+    }
+    // TODO : let user specify NICs
+    graph->inter[0] = graph->inter[1] = 0;
+    graph->speedIntra = graph->speedInter = PCI_WIDTH+2;
+    graph->nvlink = 0;
+    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+      // Reverse the loop : keep the first entry in place and mirror the others.
+      for (int c=0; c<graph->nChannels; c++) {
+        for (int i=0; i<=ngpus/2; i++) {
+          // Both sides of the swap use the same wrapped index. For i == 0 it is
+          // the first entry itself; without the modulo the store would land on
+          // the first entry of the next channel (or past the last channel).
+          int mirror = (ngpus-i)%ngpus;
+          int tmp = graph->intra[ngpus*c+i];
+          graph->intra[ngpus*c+i] = graph->intra[ngpus*c+mirror];
+          graph->intra[ngpus*c+mirror] = tmp;
+        }
+      }
+    }
+    if (graph->nChannels) return ncclSuccess;
+  }
+
+  // With a single GPU every non-ring pattern is reduced to a plain tree
+  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
+
+  // tmpGraph holds the constraints of the current attempt; the search saves
+  // its results into graph.
+  struct ncclTopoGraph tmpGraph;
+  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
+  int bestSpeed = 0;
+
+  // First try crossnic, then decrease speed and finally increase speedIntra.
+  tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
+  int maxSpeed = system->maxSpeed;
+  tmpGraph.pattern = graph->pattern;
+
+search:
+  // Every attempt starts with a fresh time budget
+  int time = NCCL_SEARCH_TIMEOUT;
+  tmpGraph.nvlink = 1;
+  tmpGraph.nChannels = 0;
+  tmpGraph.sameChannels = 1;
+  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));
+#if 0
+  printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+  for (int c=0; c<graph->nChannels; c++) {
+    printf("%2d : ", c);
+    for (int g=0; g<ngpus; g++) {
+      printf("%d ", graph->intra[c*ngpus+g]);
+    }
+    printf("\n");
+  }
+#endif
+  // time == -1 is treated as "stop searching" (it is also the value set below
+  // once the first pass is over).
+  if (time == -1) goto done;
+  // We already have a solution and we timed out so lower speed will just timeout as well
+  if (time == 0 && graph->nChannels > 0) goto done;
+  // Remember the speed of the first solution found
+  if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;
+
+  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+    // First pass, we don't have a solution yet ; try to go slower.
+
+    // Try a simpler tree
+    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
+      tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+      goto search;
+    }
+    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
+      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+      goto search;
+    }
+    tmpGraph.pattern = graph->pattern;
+
+    // Allow one more link type, up to LINK_QPI
+    if (tmpGraph.type < LINK_QPI) {
+      tmpGraph.type += 1;
+      goto search;
+    }
+    tmpGraph.type = graph->type;
+
+    if (crossNic && tmpGraph.crossNic == 0) {
+      // Try again with crossNic if permitted
+      tmpGraph.crossNic = crossNic;
+      goto search;
+    }
+    tmpGraph.crossNic = graph->crossNic;
+
+    // Try to reduce speed per channel
+    tmpGraph.speedIntra = tmpGraph.speedInter -= 3;
+    if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search;
+  }
+
+done:
+  // We have a solution now. See if we can increase speedIntra
+  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+    // End of the first pass : restart from the saved solution
+    time = -1;
+    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
+  }
+  if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
+    // Try to increase the intra speed only but keeping nChannels the same
+    tmpGraph.speedIntra += 3;
+    maxSpeed = tmpGraph.speedIntra * graph->nChannels;
+    if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
+  }
+
+  if (graph->nChannels == 0) {
+    // Nothing found : fall back to one channel going through the GPUs in system order
+    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
+    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
+    graph->inter[0] = graph->inter[1] = 0;
+    graph->speedIntra = graph->speedInter = 3;
+    graph->nvlink = 0;
+    graph->nChannels = 1;
+  }
+  return ncclSuccess;
+}
+
+/* Log a graph at INFO/NCCL_GRAPH level : one summary line, then one line per
+ * channel listing the entry NET (when the system has NET nodes), every GPU of
+ * the channel and the exit NET. A line that would not fit in the local buffer
+ * is truncated instead of overflowing it. Always returns ncclSuccess.
+ */
+ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
+  int ngpus = system->nodes[GPU].count;
+
+  char line[1024];
+  for (int c=0; c<graph->nChannels; c++) {
+    // snprintf always terminates the buffer, so strlen(line) < sizeof(line)
+    // and the remaining size passed below is always at least 1.
+    snprintf(line, sizeof(line), "%2d :", c);
+    size_t offset = strlen(line);
+    if (system->nodes[NET].count > 0) {
+      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]);
+      offset = strlen(line);
+    }
+    for (int i=0; i<ngpus; i++) {
+      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]);
+      offset = strlen(line);
+    }
+    if (system->nodes[NET].count > 0) {
+      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]);
+      offset = strlen(line);
+    }
+    INFO(NCCL_GRAPH, "%s", line);
+  }
+  return ncclSuccess;
+}
+
+/* Return in *dev the entry of graph->inter for a channel : channelId is wrapped
+ * onto the graph's channels and dir selects one of the two entries stored per
+ * channel. graph->nChannels must not be 0. Always returns ncclSuccess.
+ */
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
+  const int channel = channelId%graph->nChannels;
+  const int slot = channel*2+dir;
+  *dev = graph->inter[slot];
+  return ncclSuccess;
+}
@@ -0,0 +1,641 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "nvmlwrap.h"
+#include "net.h"
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// Size of a full PCI bus id string "domain:bus:device.function", including the
+// terminating NUL.
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+// Size of the "domain:bus" prefix of a bus id, including the terminating NUL.
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+
+// Names of the path distance classes.
+// NOTE(review): presumably indexed by the PATH_* constants (PATH_PIX ...
+// PATH_SYS) defined elsewhere - confirm the order matches.
+const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
+
+// Printable names of the node types, in the order of the GPU..NET ids of topo.h
+const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
+// Printable names of the link types, in the order of the LINK_* ids of topo.h
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+
+/******************************************************************/
+/******************* Graph Creation Functions *********************/
+/******************************************************************/
+// Read the integer stored in "<path>/numa_node".
+// Returns that value, or -1 when the file cannot be opened or holds no number.
+static int getNumaId(char *path) {
+  char npath[PATH_MAX];
+  snprintf(npath, PATH_MAX, "%s/numa_node", path);
+  npath[PATH_MAX-1] = '\0';
+
+  int numaId = -1;
+  FILE *numaFile = fopen(npath, "r");
+  if (numaFile != NULL) {
+    // On a failed conversion numaId keeps its -1 default
+    if (fscanf(numaFile, "%d", &numaId) == EOF) numaId = -1;
+    fclose(numaFile);
+  }
+  return numaId;
+}
+
+// Resolve the sysfs path of a PCI device.
+// busId : "domain:bus:device.function" string, at least BUSID_SIZE bytes; it is
+//         converted to lower case in place.
+// path  : receives a string allocated by realpath(), to be released with free().
+// Returns ncclSystemError (after a WARN) when the path cannot be resolved.
+static ncclResult_t getPciPath(char* busId, char** path) {
+  for (char* c = busId; c < busId+BUSID_SIZE; c++) *c = tolower(*c);
+
+  // Template in which the "domain:bus" and the full bus id are patched in
+  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+  const size_t busOffset = sizeof("/sys/class/pci_bus/")-1;
+  const size_t devOffset = sizeof("/sys/class/pci_bus/0000:00/../../")-1;
+  memcpy(busPath+busOffset, busId, BUSID_REDUCED_SIZE-1);
+  memcpy(busPath+devOffset, busId, BUSID_SIZE-1);
+
+  char* resolved = realpath(busPath, NULL);
+  *path = resolved;
+  if (resolved != NULL) return ncclSuccess;
+
+  WARN("Could not find real path of %s", busPath);
+  return ncclSystemError;
+}
+
+// Get an int64 from one component of a PCI path.
+// The component is the one ending at path[offset] : a "/" at that position is
+// skipped, then the string is walked back to the previous "/" and the text after
+// it is handed to busIdToInt64(). For example, with offset at the end of
+// ".../0000:00:02.0/0000:02:00.0/" the component parsed is "0000:02:00.0".
+// minOffset is the lowest position the backward walk may reach, so that a path
+// without a "/" in [minOffset, offset] cannot make it run before that position.
+// Callers are expected to pass minOffset <= offset.
+ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
+  char* first = path+minOffset;
+  char* str = path+offset;
+  // Remove trailing "/"
+  if (*str == '/' && str > first) str--;
+  // Find previous "/", without going below minOffset
+  while (str > first && *str != '/') str--;
+  // Step over the separator; when the walk stopped on the lower bound instead,
+  // the component starts right there.
+  if (*str == '/') str++;
+  NCCLCHECK(busIdToInt64(str, id));
+  return ncclSuccess;
+}
+
+// Find the GPU node whose id matches.
+// *index receives its position in system->nodes[GPU].nodes, or -1 when no GPU
+// has that id. If several GPUs share the id, the highest position is returned.
+// Always returns ncclSuccess.
+static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
+  // Walk down from the last GPU : the first hit is the highest matching
+  // position, and running off the front leaves -1.
+  int found = system->nodes[GPU].count-1;
+  while (found >= 0 && system->nodes[GPU].nodes[found].id != id) found--;
+  *index = found;
+  return ncclSuccess;
+}
+
+
+// Resolve the sysfs path of the PCI device with the given numeric id.
+// The id is first converted to a bus id string, then resolved by getPciPath();
+// *path is owned by the caller (see getPciPath).
+static ncclResult_t getPath(int64_t id, char** path) {
+  char busId[BUSID_SIZE] = "0000:00:00.0";
+  NCCLCHECK(int64ToBusId(id, busId));
+  NCCLCHECK(getPciPath(busId, path));
+  return ncclSuccess;
+}
+
+// Resolve the sysfs path of a CUDA device : the bus id reported by CUDA for
+// cudaDev is resolved by getPciPath(); *path is owned by the caller.
+ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
+  char busId[BUSID_SIZE];
+  CUDACHECK(cudaDeviceGetPCIBusId(busId, sizeof(busId), cudaDev));
+  NCCLCHECK(getPciPath(busId, path));
+  return ncclSuccess;
+}
+
+
+// Cached link widths; both stay 0 until getCpuWidths() has run once.
+int interCpuWidth = 0;
+int cpuPciWidth = 0;
+
+// Detect the CPU and fill interCpuWidth / cpuPciWidth.
+// Both default to PCI_WIDTH. On PPC the inter-CPU width is P9_WIDTH. On x86_64
+// Intel CPUs it is SKL_QPI_WIDTH for family 6 / model 0x55 and above, QPI_WIDTH
+// otherwise. The result is cached, so detection and logging happen once.
+// Always returns ncclSuccess.
+static ncclResult_t getCpuWidths() {
+  // Check if already detected
+  if (interCpuWidth + cpuPciWidth) return ncclSuccess;
+
+  // Defaults
+  char cpu[256];
+  sprintf(cpu, "Generic");
+  cpuPciWidth = interCpuWidth = PCI_WIDTH;
+
+#ifdef __PPC__
+  sprintf(cpu, "ppc64");
+  interCpuWidth = P9_WIDTH;
+#endif
+#ifdef __x86_64__
+  sprintf(cpu, "x86_64");
+  union {
+    struct {
+      // CPUID 0 String register order
+      uint32_t ebx;
+      uint32_t edx;
+      uint32_t ecx;
+    };
+    char vendor[12];
+  } cpuid0;
+
+  // CPUID overwrites EAX, EBX, ECX and EDX. EAX is both the leaf selector and
+  // an output, so it is declared read-write rather than input only.
+  uint32_t leaf = 0;
+  asm volatile("cpuid" : "+a" (leaf), "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx));
+  if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
+
+  if (strcmp(cpu, "Intel") == 0) {
+    // Layout of EAX for CPUID leaf 1. The fields are unsigned so that values
+    // with the top bit set are not read back as negative numbers.
+    union {
+      struct {
+        unsigned steppingId:4;
+        unsigned modelId:4;
+        unsigned familyId:4;
+        unsigned processorType:2;
+        unsigned resv0:2;
+        unsigned extModelId:4;
+        unsigned extFamilyId:8;
+        unsigned resv1:4;
+      };
+      uint32_t val;
+    } cpuid1;
+    // Only EAX is needed, but the other three registers are overwritten too
+    asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1) : "ebx", "ecx", "edx");
+    // The displayed family adds the extended family for family 0xF; the
+    // displayed model uses the extended model as high nibble for families 6
+    // and 0xF.
+    int familyId = cpuid1.familyId;
+    int modelId = cpuid1.modelId;
+    if (cpuid1.familyId == 0xF) familyId += cpuid1.extFamilyId;
+    if (cpuid1.familyId == 0x6 || cpuid1.familyId == 0xF) modelId += cpuid1.extModelId << 4;
+    if (familyId == 6 && modelId >= 0x55) { // Skylake
+      sprintf(cpu, "Intel/Skylake (or later)");
+      interCpuWidth = SKL_QPI_WIDTH;
+    } else {
+      interCpuWidth = QPI_WIDTH;
+    }
+  }
+#endif
+  INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
+  return ncclSuccess;
+}
+
+// Width of a link between two CPU nodes, as detected by getCpuWidths().
+static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
+  NCCLCHECK(getCpuWidths());
+  *width = interCpuWidth;
+  return ncclSuccess;
+}
+// Width of a PCI link attached to a CPU node, as detected by getCpuWidths().
+static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
+  NCCLCHECK(getCpuWidths());
+  *width = cpuPciWidth;
+  return ncclSuccess;
+}
+// Width of a PCI link : the PCI_WIDTH constant.
+static ncclResult_t ncclTopoGetPciWidth(int* width) {
+  *width = PCI_WIDTH;
+  return ncclSuccess;
+}
+// Width of a network link : the NET_WIDTH constant.
+static ncclResult_t ncclTopoGetNetWidth(int* width) {
+  *width = NET_WIDTH;
+  return ncclSuccess;
+}
+
+// Kind of device found at the remote end of an NVLink, derived from its PCI
+// class by ncclDeviceType().
+enum ncclNvLinkDeviceType {
+  ncclNvLinkDeviceUnknown, // PCI class not recognized
+  ncclNvLinkDeviceGpu,     // "3D Controller" or "VGA Controller"
+  ncclNvLinkDeviceSwitch,  // "Bridge / Other Bridge Device" (NVswitch)
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+// Classify the PCI device with the given (lower case) bus id from the class
+// value exposed in /sys/bus/pci/devices/<busId>/class.
+// Returns ncclSystemError when the class file cannot be resolved or opened;
+// *type is not written in that case.
+static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
+  char classPath[] =  "/sys/bus/pci/devices/0000:00:00.0/class";
+  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
+  char* rPath = realpath(classPath, NULL);
+  // realpath() returns NULL when the device is not there; do not hand that to open()
+  int fd = (rPath == NULL) ? -1 : open(rPath, O_RDONLY);
+  if (fd == -1) {
+    // Could not find device. It might be because we're in a VM and
+    // we don't see the whole machine. This is handled silently so
+    // we don't want to print an INFO error.
+    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath ? rPath : classPath, strerror(errno));
+    // The resolved path is heap allocated; release it on this path too
+    free(rPath);
+    return ncclSystemError;
+  }
+  free(rPath);
+  // Pre-filled so that a short read still leaves a terminated string
+  char pciClass[9] = "0x000000";
+  int len;
+  // NOTE(review): if SYSCHECKVAL returns on failure, fd is not closed - confirm
+  // against the macro definition.
+  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
+  SYSCHECK(close(fd), "close");
+  if (strcmp(pciClass, "0x068000") == 0) {
+    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
+    *type = ncclNvLinkDeviceSwitch;
+  } else if (strcmp(pciClass, "0x068001") == 0) {
+    // PCI device is of type "Bridge: IBM Device 04ea"
+    *type = ncclNvLinkDeviceBridge;
+  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
+      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
+    *type = ncclNvLinkDeviceGpu;
+  } else {
+    *type = ncclNvLinkDeviceUnknown;
+  }
+  return ncclSuccess;
+}
+
+// Connect node to the CPU node whose id is numaId, in both directions, with the
+// given link type and width. The CPU node is created if it does not exist yet.
+ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
+  // Look for an existing CPU node; scanning from the end picks the same node as
+  // a full forward scan would (the last match).
+  struct ncclTopoNode* cpuNode = NULL;
+  for (int c=system->nodes[CPU].count-1; c>=0; c--) {
+    struct ncclTopoNode* candidate = system->nodes[CPU].nodes+c;
+    if (candidate->id == numaId) {
+      cpuNode = candidate;
+      break;
+    }
+  }
+  if (cpuNode == NULL) { // Create CPU
+    NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId));
+  }
+  // node -> CPU first, then CPU -> node
+  NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth));
+  NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth));
+  return ncclSuccess;
+}
+
+// Add the NVLink links of every GPU node and compute system->maxSpeed /
+// system->maxWidth.
+// nvmlDevs[g] must be the NVML handle of system->nodes[GPU].nodes[g].
+// For each usable NVLink the remote device is classified : a GPU of this system
+// gets a GPU->GPU link, an IBM bridge gets a link to the closest CPU, anything
+// else is attached to a single NVS node shared by all GPUs.
+ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
+  struct ncclTopoNode* nvsNode = NULL;
+
+  // Minimum over all GPUs of the number of links connected and of the link width
+  int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    int cudaMajor, cudaMinor;
+    NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
+    // Number of links to probe and width per link, from the compute capability
+    int maxNvLinks, width;
+    if (cudaMajor < 6) {
+      maxNvLinks = 0;
+      width = 0;
+    } else if (cudaMajor == 6) {
+      maxNvLinks = 4;
+      width = PASCAL_NVLINK_WIDTH;
+    } else {
+      maxNvLinks = 6;
+      width = VOLTA_NVLINK_WIDTH;
+    }
+
+    int nvlinks = 0;
+    for (int l=0; l<maxNvLinks; ++l) {
+      // Check whether we can use this NVLink for P2P
+      unsigned canP2P;
+      if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+      // Make sure the Nvlink is up. The previous call should have trained the link.
+      nvmlEnableState_t isActive;
+      if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+      // Try to figure out what's on the other side of the NVLink
+      nvmlPciInfo_t remoteProc;
+      if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
+
+      // Make a lower case copy of the bus ID for calling ncclDeviceType
+      // PCI system path is in lower case
+      char* p = remoteProc.busId;
+      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+        lowerId[c] = tolower(p[c]);
+        if (p[c] == 0) break;
+      }
+
+      // NOTE(review): ncclDeviceType() returns ncclSystemError when the device
+      // is not visible, and NCCLCHECK then leaves this function; the "Unknown"
+      // handling below is only reached for a visible device with an
+      // unrecognized class - confirm this is intended.
+      enum ncclNvLinkDeviceType type;
+      NCCLCHECK(ncclDeviceType(lowerId, &type));
+      if (type == ncclNvLinkDeviceGpu) {
+        int64_t remoteId;
+        NCCLCHECK(busIdToInt64(lowerId, &remoteId));
+        int peer;
+        NCCLCHECK(idToIndex(system, remoteId, &peer));
+        // Remote GPUs that are not part of this system are ignored
+        if (peer != -1) {
+          NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
+          nvlinks++;
+        }
+      } else if (type == ncclNvLinkDeviceBridge) {
+        // Nvlink between GPU and CPU (PPC)
+        // Since the remote bridge does not have a valid numa_node, assume we
+        // are connected to the closest CPU.
+        char* path;
+        NCCLCHECK(getPath(gpu->id, &path));
+        int numaId = getNumaId(path);
+        free(path);
+        NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
+        nvlinks++;
+      } else { // Nvswitch
+        if (type == ncclNvLinkDeviceUnknown) {
+          // The NVLink is up but we couldn't find the PCI device on the other
+          // side. Assume it's an NVswitch outside a VM.
+          if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+        }
+        if (nvsNode == NULL) { // Create nvswitch
+          NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+        }
+        // Links to the switch always use VOLTA_NVLINK_WIDTH, not the per-GPU width
+        NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
+        NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
+        nvlinks++;
+      }
+    }
+    minNvlinks = std::min(minNvlinks, nvlinks);
+    minWidth = std::min(minWidth, width);
+  }
+  // If at least one GPU has no NVLink connected, fall back to the PCI width
+  int pciWidth;
+  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+  system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
+  system->maxWidth = minNvlinks ? minWidth : pciWidth;
+  return ncclSuccess;
+}
+
+// Attach endNode to the PCI tree described by its sysfs path.
+// The path is walked from the device towards the root; a PCI node is created
+// (or an existing one reused) at every second "/" met, and the last node of the
+// chain is finally connected to the CPU node given by the path's numa_node.
+// All links are created in both directions.
+ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
+  struct ncclTopoNode* lastNode = endNode;
+  int pciWidth;
+  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+  // Find intermediate PCI switches
+  // offsetRC : position of the 4th "/" of the path (or the path length if there
+  // are fewer); the backward walk below stops there.
+  int slashCount = 0;
+  int offsetRC = 0;
+  while (offsetRC < strlen(path)) {
+    if (path[offsetRC] == '/') slashCount++;
+    if (slashCount == 4) break;
+    offsetRC++;
+  }
+  int offset = strlen(path);
+  slashCount = 0;
+  while (--offset > offsetRC) {
+    if (path[offset] == '/') {
+      slashCount++;
+      // Find if already existing
+      if ((slashCount%2) == 0) {
+        // The id is parsed from the path component ending at this "/"
+        int64_t pciId;
+        NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
+        for (int p=0; p<system->nodes[PCI].count; p++) {
+          if (system->nodes[PCI].nodes[p].id == pciId) {
+            // Found our PCI switch. Attach and stop since the rest should already
+            // be connected
+            NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
+            NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
+            return ncclSuccess;
+          }
+        }
+        // Not known yet : create the PCI node and keep walking up from it
+        struct ncclTopoNode* pciNode;
+        NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
+        NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
+        NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
+        lastNode = pciNode;
+      }
+    }
+  }
+  // Then attach to a CPU node
+  // getNumaId() returns -1 when numa_node cannot be read; that value is used as
+  // the CPU id as is.
+  int numaId = getNumaId(path);
+  int width;
+  NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+  NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
+  return ncclSuccess;
+}
+
+// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
+#include <glob.h>
+#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
+// Read the system image GUID of the mlx5 device found under the given PCI path.
+// Returns the four 16-bit groups of the file packed into 64 bits, or 0 when no
+// matching file exists, it cannot be opened, or it does not hold four groups.
+uint64_t getIbGuid(char* path) {
+  uint64_t guid = 0ULL;
+  char guidPath[PATH_MAX];
+  snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
+  // PATH has a wildcard in it so use glob()
+  glob_t globbuf;
+  int found = (glob(guidPath, 0, NULL, &globbuf) == 0) && (globbuf.gl_pathc > 0);
+  // Use the first match; snprintf always terminates the copy
+  if (found) snprintf(guidPath, PATH_MAX, "%s", globbuf.gl_pathv[0]);
+  globfree(&globbuf);
+  // Without a match guidPath still holds the wildcard pattern, not a file name
+  if (!found) return guid;
+
+  FILE *file = fopen(guidPath, "r");
+  if (file != NULL) {
+    uint64_t a = 0, b = 0, c = 0, d = 0;
+    // All four groups must be converted, otherwise some of them are not set
+    if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) == 4) {
+      guid = (a << 48) + (b << 32) + (c<<16) + d;
+      TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
+    }
+    fclose(file);
+  }
+  return guid;
+}
+
+// Per network device information gathered before creating the NIC/NET nodes.
+struct netInfo {
+  char* path;     // PCI path of the device, NULL when it could not be obtained
+  int64_t nic;    // Id of the NIC node; devices with the same value share one NIC node
+  uint64_t asic;  // IB system image GUID when available, device index otherwise
+  int port;       // Number of earlier devices having the same nic value
+  int net;        // Value stored in the rank field of the NET node
+};
+
+// Fill the nic/asic/port/net fields of the ndev entries of netInfos; the path
+// field must already be set (NULL when unknown).
+// By default every device is its own NIC, asic and network. Devices exposing an
+// IB GUID are merged : same PCI device (function ignored) -> ports of one NIC,
+// same GUID and same port number -> same network as the earlier device.
+ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
+  for (int n=0; n<ndev; n++) {
+    struct netInfo* info = netInfos+n;
+    uint64_t ibGuid;
+    info->nic = n;
+    info->asic = n;
+    info->port = 0;
+    info->net = n;
+    if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
+      info->asic = ibGuid;
+
+      // Ignore PCI subdevice when computing the ID to merge multi-port cards
+      // and make them use the same PCI link.
+      char* path = strdup(info->path);
+      if (path == NULL) {
+        WARN("Could not duplicate path %s", info->path);
+        return ncclSystemError;
+      }
+      size_t len = strlen(path);
+      if (len > 0) path[len-1]='0';
+      // Release the copy before checking the result so that it is not leaked on error
+      ncclResult_t ret = pciPathToInt64(path, len, 0, &info->nic);
+      free(path);
+      NCCLCHECK(ret);
+
+      // Same PCI path -> different ports of the same NIC
+      for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
+
+      // Same GUID -> same network links as the other NIC
+      for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
+    }
+    // nic is 64 bits wide, so it needs %lx like asic; a missing path is printed explicitly
+    INFO(NCCL_GRAPH, "%s -> %lx/%lx/%d/%d", info->path ? info->path : "(null)", info->nic, info->asic, info->port, info->net);
+  }
+  return ncclSuccess;
+}
+
+// Build the PCI side of the topology : attach every GPU to the PCI tree, create
+// the NIC and NET nodes of every network device, then link all CPU nodes
+// together.
+// NOTE(review): the early returns taken by NCCLCHECK on failure do not release
+// path / netInfos - confirm whether that matters to callers.
+ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    char* path;
+    NCCLCHECK(getPath(gpu->id, &path));
+    NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
+    free(path);
+  }
+
+  // Connect the NICs
+  int netDevCount;
+  NCCLCHECK(ncclNetDevices(&netDevCount));
+  int netWidth;
+  NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
+
+  struct netInfo* netInfos;
+  NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
+
+  // A device whose PCI path cannot be obtained gets a NULL path
+  for (int n=0; n<netDevCount; n++) {
+    ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
+    if (res != ncclSuccess) netInfos[n].path = NULL;
+  }
+
+  NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
+
+  for (int n=0; n<netDevCount; n++) {
+    struct netInfo* info = netInfos+n;
+    // Create NIC and attach it to the PCI tree
+    // Devices sharing the same nic id reuse the NIC node created earlier
+    struct ncclTopoNode* nicNode = NULL;
+    for (int i=0; i<system->nodes[NIC].count; i++) {
+      if (system->nodes[NIC].nodes[i].id == info->nic) {
+        nicNode = system->nodes[NIC].nodes+i;
+        break;
+      }
+    }
+    if (!nicNode) {
+      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
+      if (info->path) {
+        // Create the PCI path
+        NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
+      } else {
+        // This is probably a virtual NIC. Just attach it directly to CPU 0
+        int width;
+        NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+        NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
+      }
+    }
+    // The path is not needed past this point (free(NULL) is a no-op)
+    free(info->path);
+
+    // Create the network side
+    // One NET node per device, with the device index as id
+    struct ncclTopoNode* netNode;
+    NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
+
+    // Use rank to store the net information
+    netNode->rank = info->net;
+
+    NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
+    NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
+  }
+  free(netInfos);
+
+  // And connect all CPU nodes together
+  // Every ordered pair (n, p) is visited, so both directions get a link
+  for (int n=0; n<system->nodes[CPU].count; n++) {
+    for (int p=0; p<system->nodes[CPU].count; p++) {
+      if (n == p) continue;
+      int width;
+      NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
+      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+    }
+  }
+  return ncclSuccess;
+}
+
+// Log node and, recursively, what is reachable from it through PCI links.
+// node     : node to print.
+// prevNode : node we came from; links going back to it are skipped.
+// line     : shared scratch buffer holding the text of the current line.
+// offset   : position in line where the description of node starts.
+// NOTE(review): writes into line are not bounded; the caller provides 1024 bytes.
+static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
+  if (node->type == GPU) {
+    sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+  } else {
+    sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
+  }
+  INFO(NCCL_GRAPH, "%s", line);
+  // Blank what precedes this node so the following lines are indented under it
+  for (int i=0; i<offset; i++) line[i] = ' ';
+
+  for (int l=0; l<node->nlinks; l++) {
+    struct ncclTopoLink* link = node->links+l;
+    if (link->type == LINK_LOC) continue;
+    if (link->remNode != prevNode) {
+      sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+      int nextOffset = strlen(line);
+      if (link->type == LINK_PCI) {
+        // PCI links are followed recursively
+        NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
+      } else {
+        // Other links only print their remote node
+        if (link->remNode->type == NET) {
+          sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+        } else {
+          sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
+        }
+        INFO(NCCL_GRAPH, "%s", line);
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+// Log the whole system : a header with maxWidth/maxSpeed, the tree below every
+// CPU node, then the paths.
+ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
+  INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+  char line[1024];
+  struct ncclTopoNodeSet* cpus = s->nodes+CPU;
+  for (int n=0; n<cpus->count; n++) {
+    // Every CPU node is the root of one printed tree
+    NCCLCHECK(ncclTopoPrintRec(cpus->nodes+n, NULL, line, 0));
+  }
+  INFO(NCCL_GRAPH, "==========================================");
+  NCCLCHECK(ncclTopoPrintPaths(s));
+  return ncclSuccess;
+}
+
+// Reorder the links of node so that the link going back to upNode comes last,
+// then do the same for every node reachable through PCI links.
+// upNode is NULL for the root of the traversal, in which case node's own links
+// are left untouched.
+static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
+  // Shift all links to have upLink as last link
+  if (upNode) {
+    // Locate the link to upNode.
+    // NOTE(review): assumes node always has a link to upNode; the scan has no bound.
+    int l=0;
+    while (node->links[l].remNode != upNode) l++;
+    struct ncclTopoLink upLink;
+    memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
+    // Move the following links down by one slot.
+    // NOTE(review): the end is detected by a NULL remNode in the slot after the
+    // last link, so this relies on unused slots being zeroed and on at least
+    // one unused slot - confirm.
+    while (node->links[l+1].remNode) {
+      memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
+      l++;
+    }
+    memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
+  }
+
+  // Recursively sort the PCI tree
+  for (int l=0; l<node->nlinks; l++) {
+    struct ncclTopoLink* link = node->links+l;
+    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
+  }
+  return ncclSuccess;
+}
+
+// Sort the links of every node to ease/accelerate traversal. The wanted order is :
+// 1. NVLinks (already the case)
+// 2. PCI down
+// 3. PCI up
+// 4. QPI (already the case)
+// Each CPU node is used as the root of one sort.
+ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
+  struct ncclTopoNodeSet* cpus = system->nodes+CPU;
+  for (int n=0; n<cpus->count; n++) {
+    NCCLCHECK(ncclTopoSort(cpus->nodes+n, NULL));
+  }
+  return ncclSuccess;
+}
+
+// Build the topology of the local node for a communicator.
+// A GPU node is created for every rank located on the same host as ours and
+// visible through NVML, then NVLink and PCI links are added and the links are
+// sorted. *system receives the newly allocated system.
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+  struct ncclTopoSystem* s;
+  NCCLCHECK(ncclCalloc(&s, 1));
+  nvmlDevice_t* nvmlDevs;
+  NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));
+
+  int g = 0; // Number of GPU nodes created so far; also the next free nvmlDevs slot
+  for (int r=0; r<comm->nRanks; r++) {
+    // Only ranks running on our host are part of the local system
+    if (comm->peerInfo[r].hostHash != comm->peerInfo[comm->rank].hostHash) continue;
+
+    // Consider the GPU as outside of our node if we can't see it through NVML.
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
+    if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
+    g++;
+
+    struct ncclTopoNode* gpuNode;
+    NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId));
+    gpuNode->rank = r;
+  }
+
+  NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s));
+  NCCLCHECK(ncclTopoConnectPCI(s));
+
+  free(nvmlDevs);
+  NCCLCHECK(ncclTopoSortSystem(s));
+  *system = s;
+  return ncclSuccess;
+}
+
+// Set *nvlink to 1 when both bus ids are GPUs of the system and the path from
+// the first to the second is of type LINK_NVL, to 0 otherwise.
+ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
+  int g1, g2;
+  NCCLCHECK(idToIndex(system, busId1, &g1));
+  NCCLCHECK(idToIndex(system, busId2, &g2));
+  if (g1 == -1 || g2 == -1) {
+    // At least one of the GPUs is unknown
+    *nvlink = 0;
+  } else {
+    *nvlink = (system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL) ? 1 : 0;
+  }
+  return ncclSuccess;
+}
+
+// Set *nvlink to 1 when the GPU with the given bus id has a path of type
+// LINK_NVL to at least one other GPU of the system, to 0 otherwise. A bus id
+// that is not a GPU of the system yields 0, as in ncclTopoGetNvlink().
+ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
+  int g;
+  NCCLCHECK(idToIndex(system, busId, &g));
+  *nvlink = 0;
+  // idToIndex() reports an unknown GPU with -1, which must not be used as an index
+  if (g == -1) return ncclSuccess;
+  for (int i=0; i<system->nodes[GPU].count; i++) {
+    if (i == g) continue;
+    if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
+      *nvlink = 1;
+      return ncclSuccess;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Classify a path into one of the PATH_* distances.
+// Without CPU on the way : PATH_PIX for up to 2 links, PATH_PXB beyond.
+// Every link arriving on a CPU node then switches the result to PATH_PHB, or to
+// PATH_SYS when it already was PATH_PHB.
+static int pathDistance(struct ncclTopoLinkList* links) {
+  int distance = (links->count > 2) ? PATH_PXB : PATH_PIX;
+  for (int l=0; l<links->count; l++) {
+    if (links->list[l]->remNode->type != CPU) continue;
+    // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
+    if (distance == PATH_PHB) {
+      distance = PATH_SYS;
+    } else {
+      distance = PATH_PHB;
+    }
+  }
+  return distance;
+}
+
+// Compute the PATH_* distance between two GPUs given by bus id.
+// When one of them is not a GPU of the system, no path exists to classify and
+// the farthest distance, PATH_SYS, is reported.
+// NOTE(review): PATH_SYS was picked as the conservative answer for an unknown
+// GPU; returning an error instead may be preferred - confirm with callers.
+ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
+  int g1, g2;
+  NCCLCHECK(idToIndex(system, busId1, &g1));
+  NCCLCHECK(idToIndex(system, busId2, &g2));
+  // idToIndex() reports an unknown GPU with -1, which must not be used as an index
+  if (g1 == -1 || g2 == -1) {
+    *distance = PATH_SYS;
+    return ncclSuccess;
+  }
+  *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2);
+  return ncclSuccess;
+}
+
+// Compute the PATH_* distance between the GPU given by bus id and the NET node
+// at index netDev.
+// When the GPU is not part of the system or netDev is not a valid NET index, no
+// path exists to classify and the farthest distance, PATH_SYS, is reported.
+// NOTE(review): PATH_SYS was picked as the conservative answer for an unknown
+// endpoint; returning an error instead may be preferred - confirm with callers.
+ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
+  int g;
+  NCCLCHECK(idToIndex(system, busId, &g));
+  // Neither -1 from idToIndex() nor an out of range netDev may be used as an index
+  if (g == -1 || netDev < 0 || netDev >= system->nodes[NET].count) {
+    *distance = PATH_SYS;
+    return ncclSuccess;
+  }
+  *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
+  return ncclSuccess;
+}
+
+// Report in *count how many CPU nodes the system holds. Always returns ncclSuccess.
+ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
+  const struct ncclTopoNodeSet* cpus = system->nodes+CPU;
+  *count = cpus->count;
+  return ncclSuccess;
+}
@@ -0,0 +1,138 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TOPO_H_
+#define NCCL_TOPO_H_
+
+#include "graph.h"
+#include "core.h"
+
+// Link widths per link technology.
+// NOTE(review): units are not stated here; the annotations below (PCI Gen3 x16, 100Gbit)
+// suggest GB/s — confirm.
+#define LOC_WIDTH 5000         // Link of a GPU to itself (see ncclTopoCreateNode)
+#define PASCAL_NVLINK_WIDTH 18
+#define VOLTA_NVLINK_WIDTH 21
+#define PCI_WIDTH 12           // PCI Gen3 x16
+#define QPI_WIDTH 8
+#define SKL_QPI_WIDTH 12
+#define P9_WIDTH 32
+#define NET_WIDTH 12           // 100Gbit
+
+// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU
+// to GPU traffic consumes more PCI bandwidth.
+// The argument is parenthesized so that expressions such as INTEL_P2P(a+b) scale the
+// whole expression and not only its first operand.
+#define INTEL_P2P(speed) ((speed)*9/12)
+#define INTEL_P2P_OVERHEAD(speed) ((speed)*12/9)
+
+// Node types. The values are used as indices into ncclTopoSystem::nodes[] and
+// ncclTopoNode::paths[], both of which have NCCL_TOPO_NODE_TYPES entries.
+#define NCCL_TOPO_NODE_TYPES 6
+#define GPU 0
+#define PCI 1
+#define NVS 2
+#define CPU 3 // Actually NUMA domains
+#define NIC 4
+#define NET 5
+// Defined elsewhere; presumably the printable name of each node type — confirm.
+extern const char* topoNodeTypeStr[];
+
+// Link types, as stored in ncclTopoLink::type and ncclTopoLinkList::type.
+#define LINK_LOC 0
+#define LINK_NVL 1
+#define LINK_PCI 2
+#define LINK_QPI 3
+#define LINK_NET 4
+// Defined elsewhere; presumably the printable name of each link type — confirm.
+extern const char* topoLinkTypeStr[];
+
+struct ncclTopoNode;
+// A directed link from the node owning it to remNode.
+struct ncclTopoLink {
+  int type;                     // One of the LINK_* constants
+  int width;                    // Accumulated width; ncclTopoConnectNodes adds to it when the same link is connected again
+  struct ncclTopoNode* remNode; // Remote end of the link; NULL marks an unused slot in ncclTopoNode::links
+};
+// Size of ncclTopoNode::links
+#define NCCL_TOPO_MAX_LINKS 32
+// Size of ncclTopoLinkList::list, i.e. the maximum number of links in a path
+#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
+// NOTE(review): SELECT_PATH / SELECT_LAST are not used in this header; presumably selection
+// modes for the graph search — confirm at their point of use.
+#define SELECT_PATH 1
+#define SELECT_LAST 2
+
+// NOTE(review): not used in this header; meaning to be confirmed at its point of use.
+#define NET_GDR_MASK 0x70000000
+
+// A path between two nodes, stored as the sequence of links to follow.
+struct ncclTopoLinkList {
+  struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; // Links of the path; the first `count` entries are valid
+  int count;                                     // Number of links in the path
+  int width;                                     // NOTE(review): presumably the width of the path as a whole — confirm where paths are computed
+  int type;                                      // Compared against LINK_* constants (e.g. LINK_NVL) by the NVLink queries
+};
+
+// A node of the topology graph.
+struct ncclTopoNode {
+  int type;    // Node type (GPU, PCI, NVS, CPU, NIC or NET)
+  int64_t id;  // Identifier, unique among nodes of the same type (see ncclTopoCreateNode)
+  int rank;    // NOTE(review): presumably the NCCL rank attached to a GPU node — confirm
+  int nlinks;  // Number of used entries in links[]
+  struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // Kept sorted by descending width by ncclTopoConnectNodes
+  // Pre-computed paths to GPUs and NICs
+  // paths[t] is an array indexed by the position of the destination node within nodes of type t.
+  struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
+  // Used during search
+  uint64_t used;
+};
+
+// All the nodes of one type; only the first `count` entries of nodes[] are in use.
+struct ncclTopoNodeSet {
+  int count;
+  struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
+};
+
+// The whole topology : one node set per node type, indexed by the node type constants.
+struct ncclTopoSystem {
+  struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
+  // NOTE(review): the three fields below are not accessed in this header; presumably state
+  // of the graph search — confirm at their point of use.
+  int maxSpeed;
+  int maxWidth;
+  int searchInitDone;
+};
+
+// Returns through *node the node of the given type and id, creating it when it does not
+// exist yet.
+//  - If a node with that type and id already exists, it is returned as is.
+//  - Otherwise the next free slot of system->nodes[type] is used; only type and id are
+//    written (plus the self link for GPUs), so the rest of the slot keeps whatever it
+//    contained before.
+// Returns ncclInternalError when type is not a valid node type or when NCCL_TOPO_MAX_NODES
+// nodes of that type already exist.
+static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+  // type is used as an index into system->nodes[] : reject anything out of range
+  if (type < 0 || type >= NCCL_TOPO_NODE_TYPES) {
+    WARN("Error : tried to create a node of invalid type %d", type);
+    return ncclInternalError;
+  }
+  for (int i=0; i<system->nodes[type].count; i++) {
+    if (system->nodes[type].nodes[i].id == id) {
+      *node = system->nodes[type].nodes+i;
+      return ncclSuccess;
+    }
+  }
+  if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
+    WARN("Error : tried to create too many nodes of type %d\n", type);
+    return ncclInternalError;
+  }
+  struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
+  system->nodes[type].count++;
+  n->type = type;
+  n->id = id;
+  if (type == GPU) {
+    // Create link to itself (used in some corner cases)
+    n->nlinks=1;
+    n->links[0].type = LINK_LOC;
+    n->links[0].remNode = n;
+    n->links[0].width = LOC_WIDTH;
+  }
+  *node = n;
+  return ncclSuccess;
+}
+
+// Adds a link of the given type and width from node to remNode.
+// If node already has a link of that type to remNode, the width is added to the existing
+// link; otherwise the first unused slot (remNode == NULL) is taken. The link is then moved
+// towards the front of node->links so that links stay sorted by descending width.
+// Returns ncclInternalError when a new link is needed but all NCCL_TOPO_MAX_LINKS slots
+// are already in use.
+static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
+  // Aggregate links into higher width for NVLink
+  struct ncclTopoLink* link;
+  // The scan is bounded : when every slot is in use there is no NULL remNode left to stop
+  // on, and an unbounded scan would run past the end of node->links.
+  struct ncclTopoLink* linksEnd = node->links+NCCL_TOPO_MAX_LINKS;
+  for (link = node->links; link != linksEnd && link->remNode; link++) {
+    if (link->remNode == remNode && link->type == type) break;
+  }
+  if (link == linksEnd) {
+    WARN("Error : tried to create too many links on a node of type %d (max %d)", node->type, NCCL_TOPO_MAX_LINKS);
+    return ncclInternalError;
+  }
+  if (link->remNode == NULL) node->nlinks++;
+  link->type = type;
+  link->remNode = remNode;
+  link->width += width;
+
+  // Sort links in BW descending order
+  struct ncclTopoLink linkSave;
+  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
+  while (link != node->links) {
+    if ((link-1)->width >= linkSave.width) break;
+    memcpy(link, link-1, sizeof(struct ncclTopoLink));
+    link--;
+  }
+  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
+
+#endif
@@ -4,9 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
-#include "net.h"
-#include "param.h"
+#include "nccl.h"

 #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)

@@ -0,0 +1,212 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "topo.h"
+
+// User overrides for the maximum number of threads, read through ncclParamNthreads() and
+// ncclParamLl128Nthreads() in ncclSetThresholds. The -2 default is not a positive value,
+// so getNthreads() falls back to its built-in default when the parameter is not set.
+// NOTE(review): presumably backed by the NCCL_NTHREADS / NCCL_LL128_NTHREADS environment
+// variables — confirm against the NCCL_PARAM macro.
+NCCL_PARAM(Nthreads, "NTHREADS", -2);
+NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
+
+// Validates a user-provided thread count.
+//  - env <= 0                      : not set, def is returned.
+//  - env not a multiple of a warp  : warning, max is returned.
+//  - env above max / below min     : warning, the value is clamped to max / min.
+// name is only used in the warning messages.
+static int getNthreads(const char* name, int env, int min, int max, int def) {
+  if (env <= 0) return def;
+
+  if (env % WARP_SIZE != 0) {
+    WARN("Invalid %s %d (must be a multiple of %d)", name, env, WARP_SIZE);
+    return max;
+  }
+  if (env > max) {
+    WARN("Invalid %s %d (maximum %d).", name, env, max);
+    return max;
+  }
+  if (env < min) {
+    WARN("Invalid %s %d (minimum %d).", name, env, min);
+    return min;
+  }
+  return env;
+}
+
+// Parses a comma-separated list of element names into an array of flags.
+//   str    : the list, e.g. "LL,Simple". A leading '^' inverts the meaning : all elements
+//            start enabled and the listed ones are disabled.
+//   elems  : the nelems known element names (compared case-insensitively).
+//   list   : output, nelems entries set to 1 (enabled) or 0 (disabled).
+// Tokens which do not match any element are ignored.
+// Returns ncclSystemError if the working copy of str cannot be allocated; list then holds
+// its default values.
+ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
+  int def, set;
+  if (str[0] == '^') {
+    def = 1; set = 0; str++;
+  } else {
+    def = 0; set = 1;
+  }
+  for (int i=0; i<nelems; i++) list[i] = def;
+  // strtok_r modifies its input, hence the copy
+  char* tokStr = strdup(str);
+  if (tokStr == NULL) {
+    WARN("Failed to allocate memory to parse list %s", str);
+    return ncclSystemError;
+  }
+  char* tmpStr;
+  char* token = strtok_r(tokStr, ",", &tmpStr);
+  while (token) {
+    for (int i=0; i<nelems; i++)
+      if (strcasecmp(token, elems[i]) == 0) list[i] = set;
+    token = strtok_r(NULL, ",", &tmpStr);
+  }
+  free(tokStr);
+  return ncclSuccess;
+}
+
+// Printable names, in the index order used by the tuning tables (collective, algorithm, protocol).
+static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
+static const char* ncclAlgoStr[] = { "Tree", "Ring" };
+static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
+
+// Latencies in us, Bandwidths in GB/s
+// Base latency, indexed [algorithm][protocol] :
+// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
+static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4,  0 }, { 3.6, 3.6, 8.4 } };
+
+// Hardware types, first index of hwLat :
+// NVLink, PCI, Network
+#define NCCL_HW_NVLINK 0
+#define NCCL_HW_PCI 1
+#define NCCL_HW_NET 2
+// Per-step latency, indexed [hardware][algorithm][protocol].
+// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
+static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
+{ /* NVLINK */
+  { /* Tree (LL/LL128/Simple)*/ {  .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ {  .4, 2.5, 5.7 } },
+  /* PCI */
+  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } },
+  /* NET */
+  { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ {  .9, 2.5, 6.6 } }
+};
+
+// LL128 max BW for the different collectives
+// Indexed by collective; used to cap the Ring/LL128 bus bandwidth in ncclSetThresholds.
+static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
+
+// Fills the tuning data of a communicator :
+//  - comm->maxThreads[protocol], from built-in defaults and user parameters;
+//  - comm->latencies / comm->bandwidths [collective][algorithm][protocol], from the tree and
+//    ring graphs and the model tables above (a bandwidth of 0 marks a disabled combination);
+//  - comm->threadThresholds[algorithm][protocol], from built-in defaults and the
+//    NCCL_THREAD_THRESHOLDS environment variable.
+// NCCL_PROTO and NCCL_ALGO can enable/disable protocols and algorithms (see parseList).
+// minCompCap / maxCompCap are only used to decide whether LL128 is enabled by default.
+// With a single rank, only maxThreads is set.
+ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
+  int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
+  comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+  comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+  comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
+
+  INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);
+
+  if (comm->nRanks <= 1) return ncclSuccess;
+
+  // graphs[] is indexed by algorithm (NCCL_ALGO_TREE, NCCL_ALGO_RING)
+  struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
+  // Hardware type used inside a node, and hardware type used overall (network as soon as
+  // there is more than one node)
+  int intraHw[2], hw[2];
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
+
+  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
+    int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
+      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
+      comm->nRanks;
+
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      // The tree algorithm is only modelled for AllReduce; other entries are left untouched
+      if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;
+
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
+        float busBw = graphs[a]->nChannels * speed * 1.0;
+
+        // Various model refinements
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL)    busBw *= 1.0/4.0;
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
+        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
+
+        // Convert bus BW to algorithm BW
+        float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
+        comm->bandwidths[coll][a][p] = busBw * ratio;
+
+        comm->latencies[coll][a][p] = baseLat[a][p];
+        if (a == NCCL_ALGO_RING) {
+          float lat = hwLat[hw[a]][a][p];
+          if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
+            if (ringGraph->sameChannels) {
+              comm->latencies[coll][a][p] += lat;
+            } else {
+              if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
+              comm->latencies[coll][a][p] += nsteps*lat;
+            }
+          } else {
+            comm->latencies[coll][a][p] += nsteps*lat;
+          }
+        } else {
+          // Tree : intra-node steps for the ranks of a node, plus log2(nNodes) network steps,
+          // counted twice
+          float intraLat = hwLat[intraHw[a]][a][p];
+          float interLat = hwLat[NCCL_HW_NET][a][p];
+          comm->latencies[coll][a][p] +=
+            2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+        }
+      }
+    }
+  }
+
+  // Protocols/Algorithms enable/disable, and user overrides.
+  // All are enabled except ll128 which is enabled by default only in certain cases.
+  // 0 = disabled, 1 = enabled, 2 = LL128 default, resolved per algorithm below.
+  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
+  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };
+
+  const char *protoStr = getenv("NCCL_PROTO");
+  if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+  const char *algoStr = getenv("NCCL_ALGO");
+  if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+
+  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    int pEnable = protoEnable[p];
+    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
+      // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
+      pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
+    }
+    // A bandwidth of 0 marks the combination as disabled
+    if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
+  }
+
+  // Rank 0 prints the resulting latency/bandwidth table
+  if (comm->rank == 0) {
+    char line[1024];
+    int offset = 0;
+    sprintf(line, "Latency/AlgBw |");
+    offset = strlen(line);
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
+        offset = strlen(line);
+      }
+    }
+    INFO(NCCL_TUNING, "%s", line);
+    for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
+      sprintf(line, "%13s |", ncclFuncStr[c]);
+      offset = strlen(line);
+      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+          sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+          offset = strlen(line);
+        }
+      }
+      INFO(NCCL_TUNING, "%s", line);
+    }
+  }
+
+  // Set per-thread amount of work before we increase nThreads and nChannels
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
+    comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
+    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
+  }
+  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
+
+  // Override defaults with user env
+  char* str = getenv("NCCL_THREAD_THRESHOLDS");
+  if (str) {
+    // Every entry starts at -2 (= keep the default). A partial initializer such as { -2 }
+    // only sets the first entry and zero-fills the others, so that values missing from the
+    // string would override the defaults with 0.
+    ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) t[a][p] = -2;
+    }
+    // Expected order : Tree LL/LL128/Simple, then Ring LL/LL128/Simple. Negative or missing
+    // values leave the default untouched.
+    sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
+      }
+    }
+  }
+
+  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+  return ncclSuccess;
+}
@@ -51,11 +51,6 @@ struct ncclAsyncArgs {

 thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];

-ncclResult_t ncclSetDevice(int cudaDev) {
-  CUDACHECK(cudaSetDevice(cudaDev));
-  return ncclSuccess;
-}
-
 #define CHECK(a) do { \
  if ((args->ret = (a)) != ncclSuccess) { \
    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
@@ -65,15 +60,14 @@ ncclResult_t ncclSetDevice(int cudaDev) {

 void* ncclAsyncThreadMain(void* args_) {
  struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
-  CHECK(ncclSetDevice(args->init.cudaDev));
-  CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank));
+  CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
  return args;
 }

-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) {
  if (ncclGroupIndex >= MAX_ASYNC_OPS) {
    WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
-    return ncclAsyncErrCheck(ncclInternalError);
+    return ncclAsyncErrCheck(ncclInvalidUsage);
  }
  int index = ncclGroupIndex++;
  struct ncclAsyncArgs* args = ncclGroupArgs+index;
@@ -84,8 +78,6 @@ ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm
  args->init.ndev = ndev;
  memcpy(&args->init.commId, &commId, sizeof(commId));
  args->init.myrank = myrank;
-  // We need to use threads for Init
-  pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args);
  return ncclSuccess;
 }

@@ -97,7 +89,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
  }
  if (ncclGroupIndex >= MAX_ASYNC_OPS) {
    WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
-    return ncclAsyncErrCheck(ncclInternalError);
+    return ncclAsyncErrCheck(ncclInvalidUsage);
  }
  ncclGroupIndex++;
  args->funcType = ASYNC_FUNC_COLL;
@@ -124,6 +116,14 @@ ncclResult_t ncclGroupEnd() {
  ncclResult_t ret = ncclGroupError;
  if (ret != ncclSuccess) goto group_cleanup;

+  /* Launch async ncclCommInitRank */
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_INIT) {
+      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
+    }
+  }
+
  /* Collectives are done in three steps :
   * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
   * 2. Barrier Wait. No CUDA call is permitted
@@ -166,8 +166,8 @@ ncclResult_t ncclGroupEnd() {
      if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
        int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
        if (err == EBUSY) continue;
-        if (err != 0) { ret = ncclSystemError; goto end; }
-        if (args->ret != ncclSuccess) { ret = args->ret; goto end; }
+        if (err != 0) ret = ncclSystemError;
+        if (args->ret != ncclSuccess) ret = args->ret;
        doneArray[i] = 1;
        done--;
      }
@@ -175,20 +175,47 @@ ncclResult_t ncclGroupEnd() {
  }
  goto end;
 group_cleanup:
-  // At least one call in the group failed. Since we want to make that group
-  // an atomic operation, we need to cancel all operations.
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclChannel* channel = comm->channels+c;
-      for (int i=0; i<channel->collCount; i++) {
-        channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+  if (ret != ncclSuccess) {
+    // At least one call in the group failed. Since we want to make that group
+    // an atomic operation, we need to cancel all operations.
+    for (int i=0; i<ncclGroupIndex; i++) {
+      struct ncclAsyncArgs* args = ncclGroupArgs+i;
+      if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
+        if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm));
+        *args->init.newcomm = NULL;
+      } else {
+        struct ncclComm* comm = args->coll.comm;
+        for (int c=0; c<comm->nChannels; c++) {
+          struct ncclChannel* channel = comm->channels+c;
+          for (int i=0; i<channel->collCount; i++) {
+            channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+          }
+          channel->collFifoTail = channel->collStart;
+          channel->collCount = 0;
+        }
+        /* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
+        struct ncclProxyState* state = &comm->proxyState;
+        struct ncclProxyArgs *op, *start;
+        pthread_mutex_lock(&state->mutex);
+        op = start = state->ops;
+        while (op) {
+          if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
+          struct ncclProxyArgs* peerOp = op->nextPeer;
+          while (peerOp) {
+            if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
+            peerOp = peerOp->nextPeer;
+          }
+          op = op->next;
+          if (op == start) break;
+        }
+        comm->opCount = comm->lastOpCount;
+        pthread_cond_signal(&state->cond);
+        pthread_mutex_unlock(&state->mutex);
+
+        comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
+        comm->userStreamSet = false;
      }
-      channel->collFifoTail = channel->collStart;
-      channel->collCount = 0;
    }
-    comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
-    comm->userStreamSet = false;
  }
 end:
  ncclGroupError = ncclSuccess;
@@ -8,6 +8,7 @@
 #define NCCL_ARGCHECK_H_

 #include "core.h"
+#include "info.h"

 ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
 ncclResult_t ArgsCheck(struct ncclInfo* info);
@@ -17,4 +17,5 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapClose(void* commState);
+ncclResult_t bootstrapAbort(void* commState);
 #endif
@@ -6,7 +6,7 @@

 #ifndef NCCL_CHANNEL_H_
 #define NCCL_CHANNEL_H_
-#include "core.h"
+#include "comm.h"

 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
@@ -7,7 +7,10 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_

-#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
+#include "core.h"
+#include "info.h"
+
+#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))

 #define NCCL_COLL_NAME(coll, op, dtype) \
  coll##_##op##_##dtype
@@ -22,7 +25,8 @@

 #define DECL_COLL4(coll, op, dtype) \
  DECL_COLL5(coll, op, dtype) \
-  DECL_COLL5(coll##LL, op, dtype)
+  DECL_COLL5(coll##LL, op, dtype) \
+  DECL_COLL5(coll##LL128, op, dtype)

 #define DECL_COLL3(coll, op, dtype) \
  DECL_COLL4(coll##Ring, op, dtype) \
@@ -7,6 +7,8 @@
 #ifndef NCCL_COMM_H_
 #define NCCL_COMM_H_

+#include "transport.h"
+
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
  void *func;
@@ -18,13 +20,17 @@ struct cudaLaunchParams {
 };
 #endif

-#define MAXCHANNELS 16
 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */

 #define CACHE_LINE_SIZE 128
 #define MEM_ALIGN 4096
 #define CUDA_IPC_MIN 2097152UL

+// Channels / LL tuning
+// Default per-thread amount of work, per protocol, before the number of threads and
+// channels is increased (initial values of ncclComm::threadThresholds).
+#define NCCL_LL_THREAD_THRESHOLD 8
+#define NCCL_LL128_THREAD_THRESHOLD 8
+#define NCCL_SIMPLE_THREAD_THRESHOLD 64
+
 struct ncclSendMem {
  union {
    struct {
@@ -50,6 +56,7 @@ struct ncclRecvMem {
    char pad4[MEM_ALIGN];
  };
  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+  uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
  char buff[1]; // Actually larger than that
 };

@@ -57,13 +64,18 @@ struct ncclComm {
  struct ncclChannel channels[MAXCHANNELS];

  struct ncclPeerInfo* peerInfo;
+  struct ncclTopoSystem* topo;

  void* bootstrap;

  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
  int cudaDev; // my cuda device index
-  int nvmlDev; // my NVML device number
+  int64_t busId;   // my PCI bus ID in int format
+
+  int node;
+  int nNodes;
+  int localRanks;

  enum { GROUP, PARALLEL } launchMode;
  cudaStream_t userStream;
@@ -74,17 +86,19 @@ struct ncclComm {
  // Counter to make sure collectives match (needed for bcast/reduce
  // where syncs are not symmetric).
  uint64_t opCount;
+  uint64_t lastOpCount;

  // Channels for collectives
  int nChannels;
-  int nThreads;

-  // Low-latency algorithm threshold
-  ssize_t llThreshold;
-  ssize_t threadThreshold;
+  // Only nvlink is used for inter-GPU communication
+  int nvlink;

-  // Tree algorithm threshold
-  ssize_t treeThreshold;
+  // Algorithm/Protocols thresholds
+  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  int maxThreads[NCCL_NUM_PROTOCOLS];

  // An internal CUDA stream for NCCL kernel CGMD launches
  int groupCudaStream;
@@ -8,19 +8,11 @@
 #define NCCL_CORE_H_

 #include <pthread.h>
-#include <algorithm>
-#include "nccl.h"
-#include "debug.h"
-#include "checks.h"
-#include "alloc.h"
-#include "transport.h"
-#include "devcomm.h"
-#include "comm.h"
-#include "info.h"
-#include "argcheck.h"
-#include <cstdio>
 #include <unistd.h>
 #include <stdlib.h>
+#include <stdint.h>
+#include <algorithm> // For std::min/std::max
+#include "nccl.h"

 #ifdef PROFAPI
 #define NCCL_API(ret, func, args...)        \
@@ -38,10 +30,6 @@
    ret func(args)
 #endif // end PROFAPI

-int ncclCudaCompCap();
-ncclResult_t ncclNvlinkGpu(int* nvlink);
-int64_t ncclTreeThreshold();
-
 static __inline__ int ncclTypeSize(ncclDataType_t type) {
  switch (type) {
    case ncclInt8:
@@ -62,4 +50,22 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
  }
 }

+// Collective functions. NCCL_NUM_FUNCTIONS must match the number of values in ncclFunc_t.
+#define NCCL_NUM_FUNCTIONS 5
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
+
+#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+
+#define NCCL_NUM_PROTOCOLS 3 // LL/LL128/Simple (in index order)
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+
+#include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "utils.h"
+#include "param.h"
+
 #endif // end include guard
@@ -7,15 +7,14 @@
 #ifndef NCCL_DEBUG_H_
 #define NCCL_DEBUG_H_

-#include <pthread.h>
+#include "core.h"
+
 #include <stdio.h>
 #include <chrono>

-#include <unistd.h>
 #include <sys/syscall.h>
 #include <limits.h>
 #include <string.h>
-#include "nccl.h"
 #include "nccl_net.h"

 #define gettid() (pid_t) syscall(SYS_gettid)
@@ -25,9 +24,16 @@ extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
-extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);

-extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+
+// Let code temporarily downgrade WARN into INFO
+// NOWARN(a, ret) evaluates a with ncclDebugNoWarn set and stores its result in ret.
+// ncclDebugNoWarn is reset to 0 afterwards (not restored to its previous value), so
+// NOWARN calls do not nest.
+// The expression is parenthesized so that it is assigned as a whole, whatever operators
+// it contains.
+extern thread_local int ncclDebugNoWarn;
+#define NOWARN(a, ret) do { \
+  ncclDebugNoWarn = 1; \
+  ret = (a); \
+  ncclDebugNoWarn = 0; \
+} while (0)

 #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
@@ -39,101 +45,4 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
 #define TRACE(...)
 #endif

-#include <stdlib.h>
-
-static inline void initDebug() {
-  const char* nccl_debug = getenv("NCCL_DEBUG");
-  if (nccl_debug == NULL) {
-    ncclDebugLevel = NCCL_LOG_NONE;
-  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
-    ncclDebugLevel = NCCL_LOG_VERSION;
-  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
-    ncclDebugLevel = NCCL_LOG_WARN;
-  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
-    ncclDebugLevel = NCCL_LOG_INFO;
-  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
-    ncclDebugLevel = NCCL_LOG_ABORT;
-  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
-    ncclDebugLevel = NCCL_LOG_TRACE;
-  }
-
-  /* Parse the NCCL_DEBUG_SUBSYS env var
-   * This can be a comma separated list such as INIT,COLL
-   * or ^INIT,COLL etc
-   */
-  char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
-  if (nccl_debug_subsys != NULL) {
-    char *subsys = strtok(nccl_debug_subsys, ",");
-    while (subsys != NULL) {
-      int invert = 0;
-      uint64_t mask = 0;
-      if (subsys[0] == '^') { invert = 1; subsys++; }
-      if (strcasecmp(subsys, "INIT") == 0) {
-        mask = NCCL_INIT;
-      } else if (strcasecmp(subsys, "COLL") == 0) {
-        mask = NCCL_COLL;
-      } else if (strcasecmp(subsys, "P2P") == 0) {
-        mask = NCCL_P2P;
-      } else if (strcasecmp(subsys, "SHM") == 0) {
-        mask = NCCL_SHM;
-      } else if (strcasecmp(subsys, "NET") == 0) {
-        mask = NCCL_NET;
-      } else if (strcasecmp(subsys, "ALL") == 0) {
-        mask = NCCL_ALL;
-      }
-      if (mask) {
-        if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
-      }
-      subsys = strtok(NULL, ",");
-    }
-  }
-
-  /* Parse and expand the NCCL_DEBUG_FILE path and
-   * then create the debug file. But don't bother unless the
-   * NCCL_DEBUG level is > VERSION
-   */
-  const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
-  if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
-    int c = 0;
-    char debug_fn[PATH_MAX+1] = "";
-    char *dfn = debug_fn;
-    while (nccl_debug_file[c] != '\0' && c < PATH_MAX) {
-      if (nccl_debug_file[c++] != '%') {
-        *dfn++ = nccl_debug_file[c-1];
-        continue;
-      }
-      switch (nccl_debug_file[c++]) {
-        case '%': // Double %
-          *dfn++ = '%';
-          break;
-        case 'h': // %h = hostname
-          char hostname[1024];
-          getHostName(hostname, 1024, '.');
-          dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
-          break;
-        case 'p': // %p = pid
-          dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
-          break;
-        default: // Echo everything we don't understand
-          *dfn++ = '%';
-          *dfn++ = nccl_debug_file[c-1];
-          break;
-      }
-    }
-    *dfn = '\0';
-    if (debug_fn[0] != '\0') {
-      FILE *file = fopen(debug_fn, "w");
-      if (file != NULL) {
-        INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
-        ncclDebugFile = file;
-      }
-    }
-  }
-  pthread_mutex_init(&ncclDebugOutputLock, NULL);
-
-#ifdef ENABLE_TRACE
-  ncclEpoch = std::chrono::high_resolution_clock::now();
-#endif
-}
-
 #endif
@@ -13,8 +13,6 @@
 #define NCCL_MAX_OPS 2048
 #define NCCL_STEPS 8

-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
 #define DIVUP(x, y) \
    (((x)+(y)-1)/(y))
 #define ROUNDUP(x, y) \
@@ -38,16 +36,18 @@ union ncclLLFifoLine {
  int4 i4;
 };

-#define MAXTHREADS 256
-#define NCCL_LL_MAX_NTHREADS MAXTHREADS
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define WARP_SIZE 32
+#define MAXCHANNELS 32
+#define NCCL_MAX_NTHREADS 512
+#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
+#define NCCL_LL_LINES_PER_THREAD 8
+#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
 #define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
 #define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
-#ifdef DEBUG_LL
-#define NCCL_LL_CLEAN_MASK 0x00000ff8
-#define NCCL_LL_FLAG_MAX   0x00001000
-#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
+#ifdef TEST_LL_CLEANUP
+#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
+#define NCCL_LL_FLAG_MAX   0x100
+#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
 #else
 #define NCCL_LL_CLEAN_MASK 0x7ffffff8
 #define NCCL_LL_FLAG(a) ((uint32_t)(a))
@@ -55,6 +55,24 @@ union ncclLLFifoLine {
 // Make sure the clean mask will last for at least NCCL_NSTEPS
 static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");

+// LL128 protocol : data is handled in lines of NCCL_LL128_LINESIZE bytes, i.e.
+// NCCL_LL128_LINEELEMS 64-bit elements, of which NCCL_LL128_DATAELEMS carry data.
+// NOTE(review): the remaining element of each line is presumably the flag — confirm in
+// the LL128 primitives.
+#define NCCL_LL128_LINESIZE 128
+#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
+#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
+
+#define NCCL_LL128_MAX_NTHREADS 640
+#define NCCL_LL128_ELEMS_PER_THREAD 120
+
+// Receiving from up to 3 sources is more compute intensive than sending
+// to 3 dests. Use 70% for reduce and 30% for bcast.
+// Yields 70% of nt, rounded down to a multiple of 32. nt is parenthesized so that
+// expressions can be passed safely.
+#define NCCL_LL128_SPLIT(nt) (((nt)*7/(10*32))*32)
+
+// Buffer sizing, in 64-bit elements unless stated otherwise
+#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS)
+#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t)) // In bytes
+
+#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
+#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+
 struct ncclConnInfo {
  // Regular comm mechanism
  char *buff;         // Local for recv, remote for send
@@ -73,6 +91,9 @@ struct ncclConnInfo {
  // Low latency mechanism
  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
  uint64_t llLastCleaning;
+
+  // High bandwidth, low latency protocol
+  uint64_t* ll128Buff; // Local for recv, remote for send
 };

 struct ncclConnector {
@@ -148,7 +169,8 @@ struct ncclChannel {
  union {
    struct {
      struct ncclRing ring;
-      struct ncclTree tree;
+      struct ncclTree treeUp;
+      struct ncclTree treeDn;

      int id;
      int nthreads;
@@ -171,8 +193,6 @@ struct ncclChannel {
 };
 static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");

-#define MAXCHANNELS 16
-
 typedef enum {
  ncclDevSuccess,
  ncclDevAssertedMismatch,
@@ -7,14 +7,9 @@
 #ifndef NCCL_ENQUEUE_H_
 #define NCCL_ENQUEUE_H_

-#include "core.h"
+#include "comm.h"
 #include "group.h"
-
-// Channels / LL tuning
-#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64  // Per thread size before we switch to non-LL
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MIN_NTHREADS 64
+#include "collectives.h"

 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
@@ -0,0 +1,94 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_GRAPH_H_
+#define NCCL_GRAPH_H_
+
+#include "nccl.h"
+#include "devcomm.h"
+#include <limits.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdio.h>
+
+enum ncclPathDist {
+  PATH_PIX  = 0,
+  PATH_PXB  = 1,
+  PATH_PHB  = 2,
+  PATH_NODE = 3,
+  PATH_SYS  = 4,
+  PATH_ARRAY_SIZE = 5
+};
+
+extern const char* pathDists[PATH_ARRAY_SIZE];
+
+ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
+
+struct ncclTopoSystem;
+// Build the topology
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
+ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
+
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
+void ncclTopoFree(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
+ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);
+
+// Query topology
+ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
+ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
+ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
+ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
+ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
+
+#define NCCL_TOPO_MAX_NODES 256
+
+#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
+#define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Split tree (send/recv from different ranks) flowing in both directions
+#define NCCL_TOPO_PATTERN_TREE 3            // Simple tree (send/recv from same rank) flowing in both directions
+#define NCCL_TOPO_PATTERN_RING 4            // Ring
+struct ncclTopoGraph { // Request/result record passed to ncclTopoCompute()
+  // Input / output
+  int pattern; // presumably one of the NCCL_TOPO_PATTERN_* values defined above — confirm
+  int crossNic;
+  // Output
+  int nChannels;
+  int speedIntra;
+  int speedInter;
+  int type;
+  int nvlink;
+  int sameChannels;
+  int nHops;
+  int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; // sized for NCCL_TOPO_MAX_NODES entries on each of MAXCHANNELS channels
+  int inter[MAXCHANNELS*2]; // sized for two entries per channel
+};
+ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+
+ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+
+struct ncclTopoRanks {
+  int ringRecv[MAXCHANNELS];
+  int ringSend[MAXCHANNELS];
+  int ringPrev[MAXCHANNELS];
+  int ringNext[MAXCHANNELS];
+  int treeUpRecv[MAXCHANNELS];
+  int treeUpSend[MAXCHANNELS];
+  int treeDnRecv[MAXCHANNELS];
+  int treeDnSend[MAXCHANNELS];
+};
+
+ncclResult_t ncclTopoPreset(struct ncclComm* comm,
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+    struct ncclTopoRanks* topoRanks);
+
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
+    struct ncclTopoRanks** allTopoRanks, int* rings);
+
+ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
+
+#endif
@@ -8,14 +8,14 @@
 #define NCCL_GROUP_H_

 #include "nccl.h"
-#include "core.h"
+#include "comm.h"

 bool ncclAsyncMode();
 ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);

-typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);

-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);

 typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
@@ -8,6 +8,7 @@
 #define NCCL_INFO_H_

 #include "nccl.h"
+#include "core.h"

 typedef enum {
  ncclPatternRing,
@@ -21,7 +22,7 @@ typedef enum {

 // Used to pass NCCL call information between functions
 struct ncclInfo {
-  ncclColl_t coll;
+  ncclFunc_t coll;
  const char* opName;
  // NCCL Coll Args
  const void* sendbuff;
@@ -36,7 +37,11 @@ struct ncclInfo {
  int chunkSteps;
  int sliceSteps;
  // Computed later
+  int algorithm;
+  int protocol;
  ncclPattern_t pattern;
+  int nChannels;
+  int nThreads;
  size_t nBytes;
  int nstepsPerLoop;
  int nchunksPerLoop;
@@ -15,7 +15,7 @@
 #define NCCL_PTR_CUDA 0x2

 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys;

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

@@ -17,7 +17,6 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 static const char* ncclNetName() { return ncclNet->name; }
 static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
 static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
-static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
 static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
@@ -31,6 +30,37 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }

+#define GPU_BUF_SIZE (2*1024*1024)
+static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { // Reports pointer types for net device `dev`; NCCL_PTR_CUDA is probed rather than trusted
+  int support;
+  NCCLCHECK(ncclNet->ptrSupport(dev, &support));
+  *supportedTypes = support & ~NCCL_PTR_CUDA; // start with CUDA cleared; it is re-added below only if the probe succeeds
+  // The network supports GPU Direct RDMA ; verify the GPU supports it as well.
+  if (support & NCCL_PTR_CUDA) {
+    void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+    ncclNetHandle_t handle;
+    void* gpuPtr = NULL;
+    void* mHandle = NULL;
+    ncclResult_t res;
+    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup); // loopback on the same device: listen, connect to our own handle, accept
+    NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
+    NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
+    CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup);
+    NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res); // NOTE(review): NOWARN presumably suppresses the warning because failure is an expected outcome here — confirm against its definition
+    if (res != ncclSuccess) goto cleanup;
+    NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
+    NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup); // repeat the registration test on the receive side
+    NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
+    *supportedTypes |= NCCL_PTR_CUDA; // GPU memory registered and deregistered on both send and recv comms
+cleanup:
+    if (gpuPtr) cudaFree(gpuPtr);
+    if (rComm) ncclNetCloseRecv(rComm);
+    if (sComm) ncclNetCloseSend(sComm);
+    if (lComm) ncclNetCloseListen(lComm);
+  }
+  return ncclSuccess; // a failed probe is not an error: it only leaves NCCL_PTR_CUDA cleared
+}
+
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;

@@ -1,133 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_NVLINK_H_
-#define NCCL_NVLINK_H_
-
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "nvmlwrap.h"
-#include "topo.h"
-
-#define CONNECT_NVLINK 0x10
-#define CONNECT_NVSWITCH 0x100
-
-enum ncclNvLinkDeviceType {
-  ncclNvLinkDeviceGpu,
-  ncclNvLinkDeviceSwitch,
-  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
-};
-
-static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
-  char classPath[] =  "/sys/bus/pci/devices/0000:00:00.0/class";
-  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
-  char* rPath = realpath(classPath, NULL);
-  int fd;
-  if ((fd = open(rPath, O_RDONLY)) == -1) {
-    // Could not find device. It might be because we're in a VM and
-    // we don't see the whole machine. This is handled silently so
-    // we don't want to print an INFO error.
-    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
-    return ncclSystemError;
-  }
-  free(rPath);
-  char pciClass[9];
-  strncpy(pciClass, "0x000000", 9);
-  int len;
-  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
-  SYSCHECK(close(fd), "close");
-  if (strcmp(pciClass, "0x068000") == 0) {
-    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
-    *type = ncclNvLinkDeviceSwitch;
-  } else if (strcmp(pciClass, "0x068001") == 0) {
-    // PCI device is of type "Bridge: IBM Device 04ea"
-    *type = ncclNvLinkDeviceBridge;
-  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
-      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
-    *type = ncclNvLinkDeviceGpu;
-  } else {
-    // Ignore if we don't know what's on the other side.
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-/* Get the maximum number of NVLinks based on the GPU generation */
-static ncclResult_t getMaxNvlinks(int* maxLinks) {
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  int ccMajor;
-  CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
-  // 6 for Volta, 4 for Pascal
-  *maxLinks = (ccMajor > 6) ? 6 : 4;
-  // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
-  return ncclSuccess;
-}
-
-static int getNvlinkGpu(const char* busId1, const char* busId2) {
-  // Determine if that connection is through NVLink
-  int links = 0;
-  int nvswitch_links = 0;
-  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
-  nvmlDevice_t nvmlDev;
-  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
-  if (res != ncclSuccess) return 0;
-
-  for(int l=0; l<maxNvLinks; ++l) {
-    // Check whether we can use this NVLink for P2P
-    unsigned canP2P;
-    if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
-
-    // Make sure the Nvlink is up. The previous call should have trained the link.
-    nvmlEnableState_t isActive;
-    if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
-
-    // Try to figure out what's on the other side of the NVLink
-    nvmlPciInfo_t remoteProc;
-    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
-
-    // Old versions of NVML return a lowercase PCI ID
-    char* p = remoteProc.busId;
-    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
-      if (p[c] == 0) break;
-      p[c] = toupper(p[c]);
-    }
-
-    if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
-      links++;
-    } else {
-      // Make a lower case copy of the bus ID for calling ncclDeviceType
-      // PCI system path is in lower case
-      char* p = remoteProc.busId;
-      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
-        if (p[c] == 0) break;
-        lowerId[c] = tolower(p[c]);
-      }
-
-      // Determine if the remote side is NVswitch or a GPU
-      enum ncclNvLinkDeviceType type;
-      ncclResult_t ret = ncclDeviceType(lowerId, &type);
-      if (ret == ncclSuccess) {
-        if (type == ncclNvLinkDeviceSwitch) {
-          //TODO: we are making an assumption that all GPUs are connected to this switch
-          //This assumption may change for future architectures
-          nvswitch_links++;
-        } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
-          links++;
-        }
-      } else {
-        // The NVLink is up but we couldn't find the PCI device on the other
-        // side. Assume it's an NVswitch outside a VM.
-        if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
-        nvswitch_links++;
-      }
-    }
-  }
-  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
-}
-
-#endif
@@ -9,18 +9,31 @@

 #include "nccl.h"

-//#define NVML_DIRECT 1
-#ifdef NVML_DIRECT
-#include "nvml.h"
+// The NVML library doesn't appear to be thread safe
+#include <pthread.h>
+extern pthread_mutex_t nvmlLock;
+#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
+#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
+
+#define NVMLLOCKCALL(cmd, ret) do {                      \
+    NVMLLOCK();                                          \
+    ret = cmd;                                           \
+    NVMLUNLOCK();                                        \
+} while(false)

 #define NVMLCHECK(cmd) do {                              \
-    nvmlReturn_t e = cmd;                                \
+    nvmlReturn_t e;                                      \
+    NVMLLOCKCALL(cmd, e);                                \
    if( e != NVML_SUCCESS ) {                            \
      WARN("NVML failure '%s'", nvmlErrorString(e));     \
      return ncclSystemError;                            \
    }                                                    \
 } while(false)

+//#define NVML_DIRECT 1
+#ifdef NVML_DIRECT
+#include "nvml.h"
+
 static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
 static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
 static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
@@ -57,6 +70,10 @@ static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned i
  NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
  return ncclSuccess;
 }
+static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+  NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
+  return ncclSuccess;
+}
 #else
 // Dynamically handle dependencies on NVML

@@ -139,6 +156,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult);
 ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
+ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);

 #endif // NVML_DIRECT

@@ -1,17 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_RINGS_H_
-#define NCCL_RINGS_H_
-
-static int getDefaultThreads() {
-  // On Kepler, rings are doubled later.
-  return ncclCudaCompCap() == 3 ? 128 : 256;
-}
-
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
-
-#endif
@@ -66,7 +66,9 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
 #endif
  struct netIf userIfs[MAX_IFS];
  bool searchNot = prefixList && prefixList[0] == '^';
+  if (searchNot) prefixList++;
  bool searchExact = prefixList && prefixList[0] == '=';
+  if (searchExact) prefixList++;
  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);

  int found = 0;
@@ -118,17 +120,17 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
  return found;
 }

-static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
+static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
  /* Check family first */
  int family = local_if.ifa_addr->sa_family;
-  if (family != remote.sa.sa_family) {
+  if (family != remote->sa.sa_family) {
    return false;
  }

  if (family == AF_INET) {
    struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
    struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
-    struct sockaddr_in& remote_addr = remote.sin;
+    struct sockaddr_in& remote_addr = remote->sin;
    struct in_addr local_subnet, remote_subnet;
    local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
    remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
@@ -136,7 +138,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
  } else if (family == AF_INET6) {
    struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
    struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
-    struct sockaddr_in6& remote_addr = remote.sin6;
+    struct sockaddr_in6& remote_addr = remote->sin6;
    struct in6_addr& local_in6 = local_addr->sin6_addr;
    struct in6_addr& mask_in6 = mask->sin6_addr;
    struct in6_addr& remote_in6 = remote_addr.sin6_addr;
@@ -161,7 +163,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
  }
 }

-static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
+static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
 #ifdef ENABLE_TRACE
  char line[1024];
 #endif
@@ -189,13 +191,13 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
    // Store the interface name
    strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);

-    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr->sa), line_a));
    found++;
    if (found == maxIfs) break;
  }

  if (found == 0) {
-    WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
+    WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr->sa), line_a));
  }
  freeifaddrs(interfaces);
  return found;
@@ -300,7 +302,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
        // Try to find interface that is in the same subnet as the IP in comm id
        union socketAddress idAddr;
        GetSocketAddrFromString(&idAddr, commId);
-        nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
+        nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
      }
    }
    // Then look for anything else (but not docker or lo)
@@ -387,7 +389,7 @@ retry:
  if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
    if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
        (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
-      INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+      if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
      usleep(SLEEP_INT);
      goto retry;
    }
@@ -1,45 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_TOPO_H_
-#define NCCL_TOPO_H_
-
-#include "nccl.h"
-#include <limits.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <stdio.h>
-
-ncclResult_t getCudaPath(int cudaDev, char** path);
-
-static int getNumaId(char *path) {
-  char npath[PATH_MAX];
-  snprintf(npath, PATH_MAX, "%s/numa_node", path);
-  npath[PATH_MAX-1] = '\0';
-
-  int numaId = -1;
-  FILE *file = fopen(npath, "r");
-  if (file == NULL) return -1;
-  if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
-  fclose(file);
-
-  return numaId;
-}
-
-enum ncclPathDist {
-  PATH_PIX  = 0,
-  PATH_PXB  = 1,
-  PATH_PHB  = 2,
-  PATH_NODE = 3,
-  PATH_SYS  = 4,
-  PATH_ARRAY_SIZE = 5
-};
-
-extern const char* pathDists[PATH_ARRAY_SIZE];
-
-int pciDistance(char* path1, char* path2);
-
-#endif
@@ -7,12 +7,15 @@
 #ifndef NCCL_TRANSPORT_H_
 #define NCCL_TRANSPORT_H_

-#include "nccl.h"
 #include "devcomm.h"
-#include <stdint.h>
+#include "graph.h"
 #include "nvmlwrap.h"
+#include "core.h"

 #define NTRANSPORTS 3
+#define TRANSPORT_P2P 0
+#define TRANSPORT_SHM 1
+#define TRANSPORT_NET 2

 extern struct ncclTransport ncclTransports[];

@@ -24,15 +27,13 @@ struct ncclComm;
 struct ncclPeerInfo {
  int rank;
  int cudaDev;
-  int nvmlDev;
+  int gdrSupport;
  uint64_t hostHash;
  uint64_t pidHash;
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  dev_t shmDev;
+  int64_t busId;
 };

-// Used to hold the transport connection values
-typedef int64_t ncclTvalue_t;
-
 #define CONNECT_SIZE 128
 struct ncclConnect {
  char data[CONNECT_SIZE];
@@ -51,7 +52,7 @@ struct ncclProxyArgs {
  int chunkSteps;
  int nsteps;
  uint64_t opCount;
-  int llMode;
+  int protocol;
  int state;   // add component before this line -- it is left out during initialization

  // Internal state
@@ -78,7 +79,7 @@ struct ncclProxyState {
 };

 struct ncclTransportComm {
-  ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
+  ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
  ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
  ncclResult_t (*free)(void*);
  ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -86,8 +87,7 @@ struct ncclTransportComm {

 struct ncclTransport {
  const char name[4];
-  ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
-  ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
+  ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
  struct ncclTransportComm send;
  struct ncclTransportComm recv;
 };
@@ -10,6 +10,14 @@
 #include "nccl.h"
 #include <stdint.h>

+int ncclCudaCompCap();
+
+// PCI Bus ID <-> int64 conversion functions
+ncclResult_t int64ToBusId(int64_t id, char* busId);
+ncclResult_t busIdToInt64(char* busId, int64_t* id);
+
+ncclResult_t getBusId(int cudaDev, int64_t *busId);
+
 ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 uint64_t getHash(const char* string, int n);
 uint64_t getHostHash();
@@ -23,4 +31,10 @@ struct netIf {
 int parseStringList(const char* string, struct netIf* ifList, int maxList);
 bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

+static long log2i(long n) { // floor(log2(n)) for n >= 1; returns 0 for n <= 1
+ long l = 0;
+ while (n > 1) { n >>= 1; l++; } // test n > 1 rather than (n>>=1) so a negative n cannot loop forever
+ return l;
+}
+
 #endif
@@ -5,6 +5,7 @@
 ************************************************************************/

 #include "argcheck.h"
+#include "comm.h"

 static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  cudaPointerAttributes attr;
@@ -16,6 +16,7 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
 static nvmlReturn_t (*nvmlInternalShutdown)(void);
 static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
 static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
+static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
 static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
@@ -23,7 +24,10 @@ static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t dev
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
    nvmlNvLinkCapability_t capability, unsigned int *capResult);
 static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);

+// Used to make the NVML library calls thread safe
+pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;

 ncclResult_t wrapNvmlSymbols(void) {
  if (nvmlState == nvmlInitialized)
@@ -70,12 +74,14 @@ ncclResult_t wrapNvmlSymbols(void) {
  LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
  LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
  LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
  LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
  LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);

  nvmlState = nvmlInitialized;
  return ncclSuccess;
@@ -85,6 +91,7 @@ teardown:
  nvmlInternalShutdown = NULL;
  nvmlInternalDeviceGetHandleByPciBusId = NULL;
  nvmlInternalDeviceGetIndex = NULL;
+  nvmlInternalDeviceGetHandleByIndex = NULL;
  nvmlInternalDeviceGetPciInfo = NULL;
  nvmlInternalDeviceGetMinorNumber = NULL;
  nvmlInternalDeviceGetNvLinkState = NULL;
@@ -130,7 +137,8 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
  if (ret != NVML_SUCCESS) {
    WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
        nvmlInternalErrorString(ret));
@@ -144,7 +152,8 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
  if (ret != NVML_SUCCESS) {
    WARN("nvmlDeviceGetIndex() failed: %s ",
        nvmlInternalErrorString(ret));
@@ -153,12 +162,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
  return ncclSuccess;
 }

+ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) { // Calls the loaded nvmlDeviceGetHandleByIndex while holding nvmlLock
+  if (nvmlInternalDeviceGetHandleByIndex == NULL) { // pointer not set, e.g. wrapNvmlSymbols() did not complete
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
+  if (ret != NVML_SUCCESS) {
+    WARN("nvmlDeviceGetHandleByIndex() failed: %s ",
+        nvmlInternalErrorString(ret));
+    return ncclSystemError; // NVML reported a failure; the message above carries its error string
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
  if (nvmlInternalDeviceGetPciInfo == NULL) {
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
  if (ret != NVML_SUCCESS) {
    WARN("nvmlDeviceGetPciInfo() failed: %s ",
        nvmlInternalErrorString(ret));
@@ -172,7 +197,8 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min
    WARN("lib wrapper not initialized.");
    return ncclInternalError;
  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
  if (ret != NVML_SUCCESS) {
    WARN("nvmlDeviceGetMinorNumber() failed: %s ",
        nvmlInternalErrorString(ret));
@@ -186,7 +212,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
    /* Do not warn, this symbol is optional. */
    return ncclInternalError;
  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
  if (ret != NVML_SUCCESS) {
    if (ret != NVML_ERROR_NOT_SUPPORTED)
      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
@@ -201,7 +228,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
    /* Do not warn, this symbol is optional. */
    return ncclInternalError;
  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
  if (ret != NVML_SUCCESS) {
    if (ret != NVML_ERROR_NOT_SUPPORTED)
      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
@@ -217,7 +245,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
    /* Do not warn, this symbol is optional. */
    return ncclInternalError;
  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
  if (ret != NVML_SUCCESS) {
    if (ret != NVML_ERROR_NOT_SUPPORTED)
      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
@@ -226,4 +255,19 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
  }
  return ncclSuccess;
 }
+
+ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { // Query a device's compute capability via NVML; major/minor are passed straight through to the NVML call
+  if (nvmlInternalDeviceGetCudaComputeCapability == NULL) { // guard the pointer that is actually called below, not the NvLink one
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret); // the call's NVML status ends up in ret
+  if (ret != NVML_SUCCESS) {
+    WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
+        nvmlInternalErrorString(ret));
+    return ncclSystemError; // any NVML failure is reported as a system error
+  }
+  return ncclSuccess;
+}
 #endif
@@ -1,391 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "param.h"
-
-#define NCCL_MAX_SCORE 7
-
-/* Parse user defined rings. Format is like :
- * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
- * Rings with a non-matching number of ranks are ignored so we can provide
- * rings for multiple cases.
- */
-#define MAX_ENV_RANKS 512
-static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
-  int ranks[MAX_ENV_RANKS];
-  int nrings = 0;
-  int rank = 0;
-  int offset = 0;
-  int status = 0; // 0 : between numbers, 1 : inside number
-  do {
-    int digit = str[offset] - '0';
-    if (digit >= 0 && digit <= 9) {
-      if (status == 0) {
-        ranks[rank] = digit;
-        status = 1;
-      } else {
-        ranks[rank] = ranks[rank]*10+digit;
-      }
-    } else {
-      if (status == 1) {
-        rank++;
-        if (rank == MAX_ENV_RANKS) goto end;
-      }
-      status = 0;
-      if (str[offset] == '|' || str[offset] == '\0') {
-        int prevRank = ranks[rank-1];
-        // Ignore rings if nranks doesn't match
-        if (rank != nranks) goto newring;
-
-        for (int r=0; r<nranks; r++) {
-          int rank = ranks[r];
-          // Ignore rings with ranks out of bounds
-          if (rank < 0 || rank >= nranks) goto newring;
-          // Ignore rings with duplicate ranks
-          for (int i=0; i<r; i++)
-            if (ranks[i] == rank) goto newring;
-
-          next[nrings*nranks+prevRank] = rank;
-          prev[nrings*nranks+rank] = prevRank;
-          prevRank = rank;
-        }
-        nrings++;
-newring:
-        rank = 0;
-      }
-    }
-  } while (str[offset++] != 0);
-end:
-  *nringsRet = nrings;
-  return ncclSuccess;
-}
-
-/*
- * Ring creation algorithm
- *
- * First, we establish hierarchical coordinates depending on the way ranks can
- * communicate. After fillCoords, we have for each rank a unique 3-int array
- * {   node, pci_domain,   rank } corresponding to the three transports :
- * { 2[NET],     1[SHM], 0[P2P] }.
- * Also, we renumber ranks (to indexes) based on their growing coordinates.
- *
- * Then, we ask transports to connect groups together. We start with net, then
- * shm, then p2p. We maintain two arrays, prev and next, where values are equal
- * to -1 when ranks are not yet connected, and a rank otherwise. We never
- * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
- * ranks, if we are rank 13, we should see something like (provided we have a
- * single net interface, hence a single ring) :
- *
- * Connecting all nodes                                <13>
- * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
- *
- * Connecting P2P domains with shared memory           <13>
- * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
- *
- * Connecting ranks (only inside the P2P domain)       <13>
- * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
- *
- * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net
- * which always sees the full world). That way, P2P can bruteforce all combinations inside the node without
- * risking to explode in terms of combinations, and we scale better.
- *
- * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
- * we get at least one ring.
- */
-
-static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
-  connected[rank] = 1;
-  for (int r=0; r<nranks; r++) {
-    if (connected[r] == 0 && matrix[rank*nranks+r] == transport) {
-      recIsConnected(r, connected, nranks, matrix, transport);
-    }
-  }
-}
-
-static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
-  for (int r=0; r<nranks; r++) connected[r] = 0;
-  recIsConnected(rank, connected, nranks, matrix, transport);
-}
-
-#define NEW_IDX(rank) do { \
-  rankToIdx[rank] = idx; \
-  idxToRank[idx] = rank; \
-  for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
-  idx++; \
-} while (0)
-
-int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
-  for (int r=0; r<nranks; r++) {
-    if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r;
-  }
-  return -1;
-}
-
-static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
-  int current[NTRANSPORTS];
-  int* p2pConnected;
-  NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
-  for (int i=0; i<NTRANSPORTS; i++) current[i] = 0;
-  int curRank = 0, idx = 0;
-  while (1) {
-    // P2P is handled separately as there is no level below it and we need to
-    // cover the case of being connected to another GPU indirectly.
-    // So we detect all GPUs in the same P2P domain once and add them all at
-    // once.
-    isConnected(curRank, p2pConnected, nranks, matrix, 0);
-    for (int r=0; r<nranks; r++) {
-      if (p2pConnected[r]) {
-        NEW_IDX(r);
-        curRank = r;
-        current[0]++;
-      }
-    }
-    current[0] = 0;
-
-    if (idx == nranks) {
-      free(p2pConnected);
-      return ncclSuccess;
-    }
-
-    // Find next group, either connected through SHM or NET.
-    int rank;
-    int transport = 1;
-    while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
-      current[transport] = 0;
-      transport++;
-      if (transport == NTRANSPORTS) {
-        WARN("Error : Could not find transport to connect next group\n");
-        free(p2pConnected);
-        return ncclInternalError; }
-    }
-    curRank = rank;
-    current[transport]++;
-  }
-}
-
-#ifdef __PPC__
-// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
-#define DEFAULT_MIN_NRINGS 4
-#else
-#define DEFAULT_MIN_NRINGS 0
-#endif
-NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS);
-NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
-
-/* Users can force the number of threads with an environment variable */
-NCCL_PARAM(Nthreads, "NTHREADS", -2);
-ncclResult_t getEnvThreads(int* nthreads) {
-  int64_t nt = ncclParamNthreads();
-  if (nt != -2)
-    *nthreads = nt;
-  return ncclSuccess;
-}
-
-static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
-  if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
-  for (int r=nrings; r<newNrings; r++) {
-    for (int i=0; i<nranks; i++) {
-      a[r*nranks+i] = a[(r-nrings)*nranks+i];
-      b[r*nranks+i] = b[(r-nrings)*nranks+i];
-      c[r*nranks+i] = c[(r-nrings)*nranks+i];
-      d[r*nranks+i] = d[(r-nrings)*nranks+i];
-    }
-  }
-  return newNrings;
-}
-/* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
-  *nrings = 0;
-
-  if (nranks == 1) return ncclSuccess;
-
-  char* str = getenv("NCCL_RINGS");
-  if (str && strlen(str)>0) {
-    int ret = parseRings(str, nrings, nranks, prev, next);
-    if (ret == ncclSuccess && *nrings > 0) {
-      if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
-      NCCLCHECK(getEnvThreads(nthreads));
-      for (int r = 0; r<*nrings; r++) {
-        for (int i = 0; i<nranks; i++) {
-          if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
-          if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
-        }
-      }
-      return ncclSuccess;
-    }
-    if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
-    *nrings = 0;
-  }
-
-  // Compute hierarchical topology groups, indexes, and rank<->index tables
-  int* coords, *globalIdxToRank, *globalRankToIdx;
-  NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
-  for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
-  NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
-  NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
-
-  NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
-
-  // Start with a high score, then decrease until we find rings
-  int minScore = NCCL_MAX_SCORE;
-  int nringsTmp;
-  int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
-  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&idxToRank, nranks));
-  NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
-  NCCLCHECK(ncclCalloc(&groups, nranks));
-  NCCLCHECK(ncclCalloc(&subgroups, nranks));
-
-  int nThreads;
-  do {
-    nThreads = *nthreads;
-    for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
-    nringsTmp = MAXCHANNELS;
-    // Loop over transports to connect groups
-    for (int t=NTRANSPORTS-1; t>=0; t--) {
-      for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
-
-      int nidx = 0;
-      for (int i=0; i<nranks; i++) {
-        // Extract only ranks in the same local area as rank
-        // We need to extract them in the topological order, hence we iterate over indexes, not ranks
-        int r = globalIdxToRank[i];
-        int sameLocal = 1;
-        for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
-        if (!sameLocal) continue;
-
-        groups[nidx] = coords[r*NTRANSPORTS+t];
-        subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
-        rankToIdx[r] = nidx;
-        idxToRank[nidx] = r;
-        nidx++;
-      }
-
-      int ngroups = groups[nidx-1] + 1; // Coords should be ordered
-
-      ncclTvalue_t* subvalues;
-      int *subprev, *subnext;
-      NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
-      NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
-      NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
-      if (ngroups > 1) {
-        /* Extract subvalues */
-        for (int i=0; i<nidx; i++) {
-          for (int j=0; j<nidx; j++) {
-            if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
-              subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
-            else
-              subvalues[i*nidx+j] = 0;
-          }
-        }
-        /* Extract subprev/subnext */
-        for (int i=0; i<nidx*nringsTmp; i++) {
-          subprev[i] = subnext[i] = -1;
-        }
-        for (int r=0; r<nringsTmp; r++) {
-          int start = -1, end = -1;
-          for (int i=0; i<nranks; i++) {
-            if (rankToIdx[i] == -1) continue;
-            if (prevTmp[r*nranks+i] != -1) start = i;
-            if (nextTmp[r*nranks+i] != -1) end = i;
-          }
-          if (start != -1 && end != -1) {
-            subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
-            subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
-          }
-        }
-        /* Get rings */
-        NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
-        /* Merge subprev/subnext into prev/next */
-        for (int r=0; r<nringsTmp; r++) {
-          for (int i=0; i<nidx; i++) {
-            if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
-            if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
-            if (t == NTRANSPORTS-1) {
-              // Save node-level masters for trees
-              treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
-              treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
-            }
-          }
-        }
-        //for (int r=0; r<nringsTmp; r++) {
-        //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
-        //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
-        //}
-      }
-      free(subvalues);
-      free(subprev);
-      free(subnext);
-      if (nringsTmp == 0) break;
-    }
-    minScore--;
-    if (nringsTmp > *nrings) {
-      *nrings = nringsTmp;
-      for (int i=0; i<nranks*(*nrings); i++) {
-        prev[i] = prevTmp[i];
-        next[i] = nextTmp[i];
-      }
-    }
-  } while (nringsTmp == 0 && minScore);
-
-  free(coords);
-  free(globalRankToIdx);
-  free(globalIdxToRank);
-  free(prevTmp);
-  free(nextTmp);
-  free(idxToRank);
-  free(rankToIdx);
-  free(groups);
-  free(subgroups);
-
-  *nthreads = nThreads;
-
-  /* Duplicate the rings in case of multinode+NVLink */
-  int nnodes = 0;
-  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
-  int nvlink;
-  NCCLCHECK(ncclNvlinkGpu(&nvlink));
-  if (nnodes > 1 && nvlink) {
-    *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
-  }
-
-  if (*nrings == 0) {
-    WARN("Could not create rings, falling back on simple ring");
-    *nrings = 1;
-    prev[rank] = (rank-1+nranks) % nranks;
-    next[rank] = (rank+1)%nranks;
-  }
-
-  int maxNrings = ncclParamMaxNrings();
-  int minNrings = ncclParamMinNrings();
-  if (maxNrings > 0 && minNrings > maxNrings) {
-    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
-    minNrings = 0;
-  }
-  if (minNrings > MAXCHANNELS) {
-    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
-    minNrings = MAXCHANNELS;
-  }
-  if (maxNrings > 0 && maxNrings <= *nrings) {
-    if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
-    *nrings = maxNrings;
-  } else {
-    int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
-    if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
-    if (minNrings > 0 && minNrings > *nrings) {
-      if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
-      *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
-    }
-  }
-
-  NCCLCHECK(getEnvThreads(nthreads));
-  return ncclSuccess;
-}
@@ -1,57 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "topo.h"
-
-#define BUSID_SIZE (sizeof("0000:00:00.0"))
-#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
-
-ncclResult_t getCudaPath(int cudaDev, char** path) {
-  char busId[BUSID_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
-  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
-  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
-  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
-  *path = realpath(busPath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", busPath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
-
-int pciDistance(char* path1, char* path2) {
-  int score = 0;
-  int depth = 0;
-  int same = 1;
-  for (int i=0; i<strlen(path1); i++) {
-    if (path1[i] != path2[i]) same = 0;
-    if (path1[i] == '/') {
-      depth++;
-      if (same == 1) score++;
-    }
-  }
-  if (score <= 3) {
-#ifdef __PPC__
-    // NUMA distance detection and PATH_SYS not supported on IBM/Power nodes
-    // nodes currently
-    return PATH_NODE;
-#else
-    /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
-    int numaId1 = getNumaId(path1);
-    int numaId2 = getNumaId(path2);
-    TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
-    return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
-#endif
-  }
-  if (score == 4) return PATH_PHB;
-  if (score == depth-1) return PATH_PIX;
-  return PATH_PXB;
-}
@@ -5,27 +5,53 @@
 ************************************************************************/

 #include "utils.h"
-#include "debug.h"
-#include "nccl_net.h"
-#include <unistd.h>
-#include <string.h>
-#include <stdarg.h>
-
-#include "nvmlwrap.h"
 #include "core.h"

+#include "nvmlwrap.h"
+
+// Get the compute capability of the current CUDA device, encoded as major*10+minor (0 if any CUDA query fails)
+int ncclCudaCompCap() {
+  int cudaDev;
+  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; // 0 doubles as the failure value; no error code is propagated
+  int ccMajor, ccMinor;
+  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
+  if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
+  return ccMajor*10+ccMinor; // e.g. major 7, minor 0 yields 70
+}
+
+ncclResult_t int64ToBusId(int64_t id, char* busId) { // Format a packed integer id as "xxxx:xx:xx.x" hex text into busId; the destination capacity is not checked here
+  sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); // fields come from bits 20+, 12-19, 4-11 and 0-3; presumably PCI domain:bus:device.function, confirm against callers
+  return ncclSuccess; // always succeeds
+}
+
+ncclResult_t busIdToInt64(char* busId, int64_t* id) { // Parse a bus id string into *id by concatenating its hex digits (skipping '.' and ':') and reading them as one base-16 number
+  const int size = strlen(busId);
+  char* hexStr;
+  NCCLCHECK(ncclCalloc(&hexStr, size+1)); // +1 keeps room for the terminator when every input character is a hex digit
+  int hexOffset = 0;
+  for (int i=0; i<size; i++) {
+    char c = busId[i];
+    if (c == '.' || c == ':') continue; // separators are dropped, not copied
+    if ((c >= '0' && c <= '9') ||
+        (c >= 'A' && c <= 'F') ||
+        (c >= 'a' && c <= 'f')) {
+      hexStr[hexOffset++] = busId[i];
+    } else break; // stop at the first character that is neither a separator nor a hex digit
+  }
+  hexStr[hexOffset] = '\0'; // hexOffset <= size, so this stays inside the size+1 allocation
+  *id = strtol(hexStr, NULL, 16);
+  free(hexStr);
+  return ncclSuccess;
+}
+
 // Convert a logical cudaDev index to the NVML device minor number
-ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  nvmlDevice_t nvmlDevice;
-  unsigned int dev;
-  *nvmlDev = -1;
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
-  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
-  NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
-
-  *nvmlDev = dev;
-
+ncclResult_t getBusId(int cudaDev, int64_t *busId) {
+  // On most systems, the PCI bus ID comes back in the 0000:00:00.0
+  // format. We still need to allocate enough space in case the PCI domain
+  // goes higher.
+  char busIdStr[] = "00000000:00:00.0"; // the literal only sizes the buffer (8-digit domain); its contents are overwritten by the CUDA call
+  CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
+  NCCLCHECK(busIdToInt64(busIdStr, busId)); // convert the textual id to the integer form stored in *busId
  return ncclSuccess;
 }

@@ -40,53 +66,6 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
  return ncclSuccess;
 }

-/* Common logging function used by the INFO, WARN and TRACE macros
- * Also exported to the dynamically loadable Net transport modules so
- * they can share the debugging mechanisms and output files
- */
-void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
-  if (ncclDebugLevel <= NCCL_LOG_NONE) return;
-
-  char hostname[1024];
-  getHostName(hostname, 1024, '.');
-  int cudaDev;
-  cudaGetDevice(&cudaDev);
-
-  char buffer[1024];
-  size_t len = 0;
-  pthread_mutex_lock(&ncclDebugOutputLock);
-  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
-    len = snprintf(buffer, sizeof(buffer),
-                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
-  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
-    len = snprintf(buffer, sizeof(buffer),
-                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
-#ifdef ENABLE_TRACE
-  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
-    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
-    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
-    len = snprintf(buffer, sizeof(buffer),
-                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
-  }
-#endif
-  if (len) {
-    va_list vargs;
-    va_start(vargs, fmt);
-    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
-    va_end(vargs);
-    fprintf(ncclDebugFile,"%s\n", buffer);
-    fflush(ncclDebugFile);
-  }
-  pthread_mutex_unlock(&ncclDebugOutputLock);
-
-  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
-  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
-    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
-            hostname, getpid(), gettid(), cudaDev, filefunc, line);
-    abort();
-  }
-}
-
 uint64_t getHash(const char* string, int n) {
  // Based on DJB2, result = result * 33 + char
  uint64_t result = 5381;
@@ -100,27 +79,39 @@ uint64_t getHash(const char* string, int n) {
 * that will be unique for both bare-metal and container instances
 * Equivalent of a hash of;
 *
- * $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt)
+ * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
+ *
+ * This string can be overridden by using the NCCL_HOSTID env var.
 */
+#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
 uint64_t getHostHash(void) {
-  char uname[1024];
-  // Start off with the full hostname
-  (void) getHostName(uname, sizeof(uname), '\0');
-  int offset = strlen(uname);
-  int len;
-  // $(readlink /proc/self/ns/uts)
-  len = readlink("/proc/self/ns/uts", uname+offset, sizeof(uname)-1-offset);
-  if (len < 0) len = 0;
-  offset += len;
-  // $(readlink /proc/self/ns/mnt)
-  len = readlink("/proc/self/ns/mnt", uname+offset, sizeof(uname)-1-offset);
-  if (len < 0) len = 0;
-  offset += len;
-  // Trailing '\0'
-  uname[offset]='\0';
-  TRACE(NCCL_INIT,"unique hostname '%s'", uname);
+  char hostHash[1024]; // string that gets hashed: hostname plus boot id, or NCCL_HOSTID alone
+  char *hostId;

-  return getHash(uname, strlen(uname));
+  // Fallback is the full hostname if something fails
+  (void) getHostName(hostHash, sizeof(hostHash), '\0');
+  int offset = strlen(hostHash);
+
+  if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
+    strncpy(hostHash, hostId, sizeof(hostHash)); // the env override replaces the hostname entirely; termination is enforced below
+  } else {
+    FILE *file = fopen(HOSTID_FILE, "r");
+    if (file != NULL) {
+      char *p;
+      if (fscanf(file, "%ms", &p) == 1) { // %ms makes fscanf allocate the token; it is freed right after the copy
+        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); // append the boot id after the hostname
+        free(p);
+      }
+      fclose(file); // close only a stream that was actually opened; fclose(NULL) is undefined
+    }
+  }
+
+  // Make sure the string is terminated
+  hostHash[sizeof(hostHash)-1]='\0';
+
+  TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
+
+  return getHash(hostHash, strlen(hostHash));
 }

 /* Generate a hash of the unique identifying string for this process
@@ -147,8 +138,6 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
  if (!string) return 0;

  const char* ptr = string;
-  // Ignore "^" or "=" prefix, will be detected outside of this function
-  if (ptr[0] == '^' || ptr[0] == '=') ptr++;

  int ifNum = 0;
  int ifC = 0;
@@ -41,7 +41,7 @@ typedef enum { ncclSuccess                 =  0,
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library
 */
-ncclResult_t ncclGetVersion(int *version);
+ncclResult_t  ncclGetVersion(int *version);
 ncclResult_t pncclGetVersion(int *version);

 /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
@@ -244,7 +244,8 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
 * Start a group call. All subsequent calls to NCCL may not block due to
 * inter-CPU synchronization.
 */
-ncclResult_t ncclGroupStart();
+ncclResult_t  ncclGroupStart();
+ncclResult_t pncclGroupStart();

 /*
 * Group End
@@ -252,7 +253,8 @@ ncclResult_t ncclGroupStart();
 * End a group call. Wait for all calls since ncclGroupStart to complete
 * before returning.
 */
-ncclResult_t ncclGroupEnd();
+ncclResult_t  ncclGroupEnd();
+ncclResult_t pncclGroupEnd();

 #ifdef __cplusplus
 } // end extern "C"
@@ -4,7 +4,8 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "comm.h"
+#include "info.h"

 extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
@@ -119,13 +120,13 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
  }
  if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
    // Tree up
-    struct ncclTree* tree = &args->channel->tree;
+    struct ncclTree* tree = &args->channel->treeUp;
    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
  }
  if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
    // Tree down
-    struct ncclTree* tree = &args->channel->tree;
+    struct ncclTree* tree = &args->channel->treeDn;
    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
  }
@@ -157,7 +158,9 @@ void* persistentThread(void *comm_) {
      }
    } while (op == NULL);
    op->idle = 0;
-    if (op->state != ncclProxyOpNone) ret = op->progress(op);
+    // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
+    // yet and might be cancelled before they even start. Hold off on those.
+    if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
    if (ret != ncclSuccess) {
      comm->fatalError = ret;
      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
@@ -4,39 +4,9 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
-#include "transport.h"
-#include "nvmlwrap.h"
+#include "comm.h"
 #include "net.h"
-#include "param.h"
-#include "topo.h"
-#include <cuda_runtime.h>
-#include <assert.h>
-
-#define NET_MAX_IFS 16
-#define NET_MAX_GPUS 32
-
-// Cache GPU-NIC distances to avoid re-computing them
-#define NET_TVALUE_UNKNOWN 0ULL
-static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
-static int ncclNetNDev;
-
-// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
-#define NET_BITS_PER_IF 3
-#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
-static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
-static ncclTvalue_t getTvalue(short* distances, int ndev) {
-  ncclTvalue_t tvalue = 0;
-  for (int d=0; d<ndev; d++) {
-    ncclTvalue_t score = 1 + PATH_SYS - distances[d];
-    // Keep 3 bits of score info per dev
-    tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
-  }
-  return tvalue;
-}
-static int getScore(ncclTvalue_t tvalue, int dev) {
-  return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;
-}
+#include "graph.h"

 struct netConnectInfo {
  ncclNetHandle_t netHandle;
@@ -53,6 +23,7 @@ struct netSendResources {
  int buffSize;
  void* mhandle;
  void* llMhandle;
+  void* ll128Mhandle;
  struct ncclRecvMem* devRecvMem;
  uint64_t step;
  uint64_t llLastCleaning;
@@ -70,228 +41,61 @@ struct netRecvResources {
  int buffSize;
  void* mhandle;
  void* llMhandle;
+  void* ll128Mhandle;
  struct ncclRecvMem* devRecvMem;
  uint64_t step;
  uint64_t llLastCleaning;
 };

-static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
-  char* cudaPath = NULL;
-  char* nicPath = NULL;
-  ncclResult_t err;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
-  err = ncclNetPciPath(dev, &nicPath);
-  *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
-  if (nicPath) free(nicPath);
-  if (cudaPath) free(cudaPath);
+/* Determine if two peers can communicate with NET */
+ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  *ret = 1;
  return ncclSuccess;
 }

-static ncclResult_t netDevices(int* ndev, short** distances) {
-  NCCLCHECK(ncclNetDevices(ndev));
-  if (*ndev == 0) {
-    WARN("Error : Network returned 0 device");
-    return ncclSystemError;
-  }
-  if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
-
-  *distances = (short*)malloc(*ndev*sizeof(short));
-  if (*distances == NULL) return ncclSystemError;
-
-  // Find distance with current GPU
-  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-  char line[1024];
-  sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
-  for (int d=0; d<*ndev; d++) {
-    NCCLCHECK(netDistance(cudaDev, d, *distances+d));
-    sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
-  }
-  INFO(NCCL_INIT|NCCL_NET, "%s", line);
-  return ncclSuccess;
-}
-
-/* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  ret[0] = ncclNetTvalues[cudaDev];
-  if (ret[0] == NET_TVALUE_UNKNOWN) {
-    if (cudaDev >= NET_MAX_GPUS) {
-      WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
-      return ncclInternalError;
-    }
-    int nDev;
-    short* distances;
-    NCCLCHECK(netDevices(&nDev, &distances));
-    ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
-    ncclNetNDev = nDev;
-    free(distances);
-  }
-  return ncclSuccess;
-}
-
-static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
-  int bestRank = -1;
-  int bestScore = 0;
-  for (int rank=0; rank<nranks; rank++) {
-    if (groups[rank] != group) continue;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t netValue = values[rank*nranks+i];
-      if (netValue != 0) {
-        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
-        if (score >= minScore && score > bestScore) {
-          bestScore = score;
-          bestRank = rank;
-        }
-        // All other values should be the same, stop here for this rank
-        break;
-      }
-    }
-  }
-  return bestRank;
-}
-static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
-  // For the last rank, we don't need the absolute best score, just to be within minScore.
-  for (int rank=nranks-1; rank>=0; rank--) {
-    if (groups[rank] != group) continue;
-    if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;
-    if (startRank == rank) continue;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t netValue = values[rank*nranks+i];
-      if (netValue != 0) {
-        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
-        if (score >= minScore) {
-          return rank;
-        }
-        // All other values should be the same, stop here for this rank
-        break;
-      }
-    }
-  }
-  return -1;
-}
-
-ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  int nGroups = groups[nranks-1] + 1;
-  int *cardUsed, *starts, *ends;
-  NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
-  NCCLCHECK(ncclCalloc(&starts, nGroups));
-  NCCLCHECK(ncclCalloc(&ends, nGroups));
-
-  for (int ring = 0; ring<*nringsRet; ring++) {
-    for (int group = 0; group<nGroups; group++) {
-      int nranksInGroup = 0;
-      int nsubGroups = 0;
-      for (int rank=0; rank<nranks; rank++)
-        if (groups[rank] == group) {
-          nranksInGroup++;
-          nsubGroups = std::max(subgroups[rank], nsubGroups);
-        }
-      starts[group] = ends[group] = -1;
-      // Receive on the rank closest to the NIC
-      for (int card=0; card<NET_MAX_IFS; card++) {
-        if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
-        int start = groupBestStart(nranks, groups, group, values, card, minScore);
-        // Send from any rank, but best on a different subgroup and close to the NIC also.
-        int end = (nranksInGroup == 1) ? start
-            : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
-        //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end);
-        if (start != -1 && end != -1) {
-          cardUsed[group*NET_MAX_IFS+card] = 1;
-          starts[group] = start;
-          ends[group] = end;
-          break;
-        }
-      }
-      if (starts[group] == -1 || ends[group] == -1) {
-        *nringsRet = ring;
-        goto done;
-      }
-    }
-    // Link groups together
-    for (int group = 0; group<nGroups; group++) {
-      int nextGroup = (group+1)%nGroups;
-      next[ring*nranks+ends[group]] = starts[nextGroup];
-      prev[ring*nranks+starts[nextGroup]] = ends[group];
-    }
-  }
-done:
-  free(cardUsed);
-  free(starts);
-  free(ends);
-  return ncclSuccess;
-}
-
-int getDev(int cudaDev, int ringId) {
-  ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
-
-  int dev = 0;
-  int maxScore = 0;
-  for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
-  int skip = ringId+1;
-  while (skip) {
-    for (int d=0; d<ncclNetNDev; d++) {
-      if (getScore(tvalues, d) == maxScore) {
-        skip--;
-        if (skip == 0) { dev = d; goto end; }
-      }
-    }
-  }
-end:
-  return dev;
-}
-
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);

-static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
+static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
  *useGdr = 0;

-  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-
  if (read) { // For reads (sends) only enable under certain conditions
    int gdrReadParam = ncclParamNetGdrRead();
    if (gdrReadParam == 0) return ncclSuccess;
    if (gdrReadParam < 0) {
       int nvlink;
-       NCCLCHECK(ncclNvlinkGpu(&nvlink));
+       NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
       if (!nvlink) return ncclSuccess;
    }
  }

  // Check if we are close enough that it makes sense to enable GDR
  int netGdrLevel = ncclParamNetGdrLevel();
-  short distance;
-  NCCLCHECK(netDistance(cudaDev, dev, &distance));
+  int distance;
+  NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
  if (distance >= netGdrLevel) {
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
    return ncclSuccess;
  }

  // Finally, check if the NIC supports it
  int flags;
-  NCCLCHECK(ncclNetPtrSupport(dev, &flags));
+  NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
  if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
  *useGdr = 1;
-  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
+  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
  return ncclSuccess;
 }

 /* Determine if we will use this transport for this peer and return connect
 * information for this peer */
-ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
  struct netSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;

-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  resources->netDev = getDev(cudaDev, channelId);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
+  NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));

  int sendSize = sizeof(struct ncclSendMem);
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -303,20 +107,18 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
  resources->buffSize = buffSize;

-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
      resources->useGdr ? "/GDRDMA" : "");
  return ncclSuccess;
 }

-ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
  struct netRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;

-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  resources->netDev = getDev(cudaDev, channelId);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
+  NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));

  int sendSize = sizeof(struct ncclSendMem);
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -328,7 +130,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
  resources->buffSize = buffSize;

-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
      resources->useGdr ? "/GDRDMA" : "");
  struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
  NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
@@ -343,6 +145,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
  send->conn.buff = recvMem->buff;
  send->conn.llBuff = resources->devHostRecvMem->llBuff;
+  send->conn.ll128Buff = recvMem->ll128Buff;

  // Head/Tail/Opcount/Fifos are always on host
  send->conn.tail = &resources->devHostRecvMem->tail;
@@ -360,6 +163,8 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
  NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
        NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));

  return ncclSuccess;
 }
@@ -373,6 +178,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
  recv->conn.buff = recvMem->buff;
  recv->conn.llBuff = recvMem->llBuff;
+  recv->conn.ll128Buff = recvMem->ll128Buff;

  // Head/Tail/Opcount are always on host
  recv->conn.tail = &resources->devHostRecvMem->tail;
@@ -388,6 +194,8 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));

  return ncclSuccess;
 }
@@ -397,6 +205,7 @@ ncclResult_t netSendFree(void* transportResources) {
  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
  if (resources->useGdr)
    CUDACHECK(cudaFree(resources->devRecvMem));
@@ -410,6 +219,7 @@ ncclResult_t netRecvFree(void* transportResources) {
  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
  if (resources->useGdr)
    CUDACHECK(cudaFree(resources->devRecvMem));
@@ -437,7 +247,39 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
-        if (args->llMode) {
+        if (args->protocol == NCCL_PROTO_LL128) {
+          int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
+          if (args->tail < *recvTail) {
+            int buffSlot = args->tail%NCCL_STEPS;
+            if (sizesFifo[buffSlot] != -1) {
+              struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+              char* localBuff = (char*)localMem->ll128Buff;
+              int ready = resources->useGdr;
+              if (!ready) {
+                // When data is in sysmem, we need to wait until all flags are correct since the GPU only
+                // called threadfence()
+                uint64_t flag = args->tail + 1;
+                int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
+                volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
+                ready = 1;
+                for (int i=0; i<nFifoLines; i++) {
+                  if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
+                }
+              }
+              if (ready) {
+                // Send through network
+                NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot));
+                if (args->requests[buffSlot] != NULL) {
+                  sizesFifo[buffSlot] = -1;
+                  // Make sure size is reset to -1 before we update the head.
+                  __sync_synchronize();
+                  args->tail += args->sliceSteps;
+                  args->idle = 0;
+                }
+              }
+            }
+          }
+        } else if (args->protocol == NCCL_PROTO_LL) {
          int buffSlot = args->tail%NCCL_STEPS;
          int size = sizesFifo[buffSlot];
          if (size != -1) {
@@ -463,17 +305,19 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
            }
          }
        } else if (args->tail < *recvTail) {
-          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
          int stepSize = args->channel->buffSize/NCCL_STEPS;
+          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
          // Send through network
          int buffSlot = args->tail%NCCL_STEPS;
-          NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
-          if (args->requests[buffSlot] != NULL) {
-            sizesFifo[buffSlot] = -1;
-            // Make sure size is reset to zero before we update the head.
-            __sync_synchronize();
-            args->tail += args->sliceSteps;
-            args->idle = 0;
+          if (sizesFifo[buffSlot] != -1) {
+            NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+            if (args->requests[buffSlot] != NULL) {
+              sizesFifo[buffSlot] = -1;
+              // Make sure size is reset to -1 before we update the head.
+              __sync_synchronize();
+              args->tail += args->sliceSteps;
+              args->idle = 0;
+            }
          }
        }
      }
@@ -512,11 +356,11 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
  }
  if (args->state == ncclProxyOpProgress) {
    args->idle = 1;
-    int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+    int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
    if (args->head < args->end) {
      struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
-      char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
-      void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
+      char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
+      void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
      volatile uint64_t* sendHead = &resources->hostSendMem->head;
      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
        int buffSlot = args->tail%NCCL_STEPS;
@@ -533,7 +377,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
        if (done) {
          args->head += args->sliceSteps;
-          if (args->llMode == 0) {
+          if (args->protocol == NCCL_PROTO_SIMPLE) {
            if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
            resources->hostRecvMem->tail = args->head;
          }
@@ -553,7 +397,6 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
 struct ncclTransport netTransport = {
  "NET",
  netCanConnect,
-  netGetRings,
  { netSendSetup, netSendConnect, netSendFree, netSendProxy },
  { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
 };
@@ -8,7 +8,7 @@
 #include "core.h"
 #include "socket.h"
 #include "net.h"
-#include "topo.h"
+#include "graph.h"
 #include "utils.h"
 #include "param.h"

@@ -107,7 +107,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
      char* userIbEnv = getenv("NCCL_IB_HCA");
      struct netIf userIfs[MAX_IB_DEVS];
      bool searchNot = userIbEnv && userIbEnv[0] == '^';
+      if (searchNot) userIbEnv++;
      bool searchExact = userIbEnv && userIbEnv[0] == '=';
+      if (searchExact) userIbEnv++;
      int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);

      if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
@@ -199,32 +201,14 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
    moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
  }
  if (moduleLoaded == 0) return ncclSystemError;
-  ncclResult_t ret = ncclSystemError;
-  void* ptr;
-  if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
-    struct ibv_mr* mr;
-    struct ibv_pd* pd;
-    if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
-      if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) {
-        ret = ncclSuccess;
-        wrap_ibv_dereg_mr(mr);
-      }
-      wrap_ibv_dealloc_pd(pd);
-    }
-    cudaFree(ptr);
-  }
-  return ret;
+  return ncclSuccess;
 }

 ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
  *supportedTypes = NCCL_PTR_HOST;

-  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-
  if (ncclIbGdrSupport(dev) != ncclSuccess) {
-    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
+    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
    return ncclSuccess;
  }
  *supportedTypes |= NCCL_PTR_CUDA;
@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "nccl.h"
+#include "comm.h"
 #include "core.h"
 #include "socket.h"
 #include "net.h"
@@ -108,6 +108,7 @@ struct ncclSocketRequest {
  void* data;
  int size;
  int ctrlFd;
+  int offset;
  int used;
  struct ncclSocketComm* comm;
  struct ncclSocketTask* tasks[MAX_SOCKETS];
@@ -193,7 +194,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
  }
  if (nThreads == -2 || nSocksPerThread == -2) {
    // Auto-detection
-    int autoNt=1, autoNs=1;
+    int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
    char vendorPath[PATH_MAX];
    snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
    char* rPath = realpath(vendorPath, NULL);
@@ -213,6 +214,9 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
    if (strcmp(vendor, "0x1d0f") == 0) { // AWS
      autoNt = 2;
      autoNs = 8;
+    } else if (strcmp(vendor, "0x1ae0") == 0) { // GCP
+      autoNt = 4;
+      autoNs = 1;
    }
 end:
    if (nThreads == -2) nThreads = autoNt;
@@ -226,7 +230,7 @@ end:
  }
  *ns = nSocks;
  *nt = nThreads;
-  INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
+  if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
  return ncclSuccess;
 }

@@ -379,31 +383,45 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
      return ncclInternalError;
    }
    r->size = data;
+    r->offset = 0;
    r->used = 2; // done exchanging size
    // divide into subtasks
-    int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
    int chunkOffset = 0, i = 0;
-    while (chunkOffset < r->size) {
-      int chunkSize = std::min(taskSize, r->size-chunkOffset);
-      NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
-      chunkOffset += chunkSize;
+    if (r->comm->nSocks > 0) {
+      int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
+      while (chunkOffset < r->size) {
+        int chunkSize = std::min(taskSize, r->size-chunkOffset);
+        NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+        chunkOffset += chunkSize;
+      }
    }
    r->nSubs = i;
  }
  if (r->used == 2) { // already exchanged size
-    int nCompleted = 0;
-    for (int i=0; i<r->nSubs; i++) {
-      struct ncclSocketTask* sub = r->tasks[i];
-      if (sub->result != ncclSuccess) return sub->result;
-      if (sub->offset == sub->size) nCompleted++;
-    }
-    if (nCompleted == r->nSubs) {
-      if (size) *size = r->size;
-      *done = 1;
-      r->used = 0;
+    if (r->nSubs > 0) {
+      int nCompleted = 0;
      for (int i=0; i<r->nSubs; i++) {
        struct ncclSocketTask* sub = r->tasks[i];
-        sub->used = 0;
+        if (sub->result != ncclSuccess) return sub->result;
+        if (sub->offset == sub->size) nCompleted++;
+      }
+      if (nCompleted == r->nSubs) {
+        if (size) *size = r->size;
+        *done = 1;
+        r->used = 0;
+        for (int i=0; i<r->nSubs; i++) {
+          struct ncclSocketTask* sub = r->tasks[i];
+          sub->used = 0;
+        }
+      }
+    } else { // progress request using main thread
+      if (r->offset < r->size) {
+        NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->data, r->size, &r->offset));
+      }
+      if (r->offset == r->size) {
+        if (size) *size = r->size;
+        *done = 1;
+        r->used = 0;
      }
    }
  }
@@ -4,15 +4,9 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "comm.h"
+#include "graph.h"
 #include "utils.h"
-#include "topo.h"
-#include "transport.h"
-#include "param.h"
-#include <unistd.h>
-#include <cuda_runtime.h>
-#include <ctype.h>
-#include "nvlink.h"

 struct p2pConnectInfo {
  int direct;
@@ -38,419 +32,91 @@ NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
 NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);

 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
-static int busIdToCudaDev(const char* busId) {
+static int busIdToCudaDev(int64_t busId) {
  int ndev;
  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
    return -1;
  for (int i = 0; i < ndev; i++) {
-    char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-    if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
+    char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
      return -1;
-    if (strcmp(busId, devBusId) == 0) {
-      return i;
-    }
+    int64_t devBusId; // A conversion failure must yield -1, not an ncclResult_t code that would be read as a device index
+    if (busIdToInt64(devBusIdStr, &devBusId) != ncclSuccess) return -1;
+    if (busId == devBusId) return i;
  }
  // BusId was not found in our locally visible CUDA devices
  return -1;
 }

-/* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
-  // Do not use P2P across root complexes by default (provided CUDA permits it)
-  int p2pLevel = PATH_NODE;
+/* Determine if two peers can communicate through p2p */
+ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  int cpuCount;
+  NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
+  // Do not use P2P across sockets by default (provided CUDA permits it).
+  // When we are on a single socket, don't even use P2P through the CPU as
+  // it should be able to sustain two flows to sysmem faster than PCI P2P.
+  int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
  if (ncclParamP2pDisable() == 1) p2pLevel = 0;
  if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();

+  // Default to P2P disabled; set to 1 below only when a check allows it
  *ret = 0;

  if (p2pLevel == 0) return ncclSuccess;

  // Rule out different nodes
-  if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
+  if (info1->hostHash != info2->hostHash) return ncclSuccess;

  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-  int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-  if (peerCudaDev == -1) {
+  int cudaDev1 = busIdToCudaDev(info1->busId);
+  int cudaDev2 = busIdToCudaDev(info2->busId);
+  if (cudaDev1 == -1 || cudaDev2 == -1) {
    // Peer's CUDA device is not visible in this process
 #if CUDART_VERSION >= 10010
    // But in CUDA 10.1 we can still communicate with 'invisible' devices
-    TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
+    TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
    // Check for NVLink/NVswitch including P2P access
-    int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
-    if (nvlinkp2p > 0) {
-      *ret = nvlinkp2p;
+    int nvlink;
+    NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
+    if (nvlink > 0) {
+      *ret = 1;
      return ncclSuccess;
    }
 #endif
    return ncclSuccess;
  }

-  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
+  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);

  // Do not detect topology if we're on the same GPU. Note this is not really supported.
-  if (myInfo->cudaDev == peerCudaDev) {
-    *ret = 1 + PATH_SYS;
+  if (cudaDev1 == cudaDev2) {
+    *ret = 1;
    return ncclSuccess;
  }

  // See if CUDA can do P2P
  int p2p;
-  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
-    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
-         myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
+  if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) {
+    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
+         cudaDev1, info1->busId, cudaDev2, info2->busId);
    return ncclSuccess;
  }
  if (p2p == 0) return ncclSuccess;

  // Check for NVLink/NVswitch
-  int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
-  if (nvlinkp2p > 0) {
-    *ret = nvlinkp2p;
+  int nvlink;
+  NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
+  if (nvlink > 0) {
+    *ret = 1;
    return ncclSuccess;
  }

  // Finally compute the PCI distance and compare with the p2pLevel.
-  char* myPath;
-  char* peerPath;
-  ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
-  ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
-  if (err1 == ncclSuccess && err2 == ncclSuccess) {
-    int distance = pciDistance(myPath, peerPath);
-    if (distance < p2pLevel) {
-      *ret = 1 + PATH_SYS - distance;
-    }
+  int distance;
+  NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
+  if (distance < p2pLevel) {
+    *ret = 1;
  }
-  if (err1 == ncclSuccess) free(myPath);
-  if (err2 == ncclSuccess) free(peerPath);
-  return ncclSuccess;
-}
-
-#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
-#define MAXGPUS_PCI 64
-
-static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
-  int nrings = 0;
-  ncclTvalue_t* line = matrix+current*n;
-  inTheRing[current] = 1;
-  int currentStep = (currentRing+1)*n-remaining;
-  rings[currentStep-1] = current;
-  if (remaining == 0) {
-    int looprank = rings[currentRing*n];
-    if (line[looprank] > 0) {
-      if (currentRing+1 == nRingsMax) {
-        nrings = 1;
-      } else {
-        line[looprank]--;
-        for (int i=0; i<n; i++) inTheRing[i] = 0;
-        if (connect) {
-          // First two slots are already set and we need to respect those constraints
-          inTheRing[rings[currentStep]] = 1;
-          nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect);
-        } else {
-          rings[(currentRing+1)*n] = 0;
-          nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect);
-        }
-        line[looprank]++;
-        for (int i=0; i<n; i++) inTheRing[i] = 1;
-      }
-    }
-  } else {
-    int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
-    int maxStep = 0;
-    for (int i=0; i<n; i++) {
-      if (inTheRing[i] == 0 && line[i] > 0) {
-        line[i]--;
-        int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect);
-        if (nr > nrings) {
-          nrings = nr;
-          maxStep = (nr+currentRing)*n;
-          ringsSave[currentStep] = i;
-          // Save the rest of the rings
-          for (int r=currentStep+1; r<maxStep; r++) {
-            ringsSave[r] = rings[r];
-          }
-          if (nrings + currentRing == nRingsMax) {
-            // We found an optimal solution. Let's stop there.
-            break;
-          }
-        }
-        line[i]++;
-      }
-    }
-    for (int r=currentStep; r<maxStep; r++) {
-      rings[r] = ringsSave[r];
-    }
-  }
-  inTheRing[current] = 0;
-  return nrings;
-}
-
-static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
-  if (nrings == 0) return 0;
-  // Copy rings by dup times
-  if (newNrings > MAXCHANNELS) {
-    newNrings = MAXCHANNELS;
-  }
-  for (int r=nrings; r<newNrings; r++) {
-    for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
-  }
-  return newNrings;
-}
-
-int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) {
-  int* inTheRing = (int*)malloc(sizeof(int)*nranks);
-  if (inTheRing == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*nranks); return 0; }
-  for (int i=0; i<nranks; i++) inTheRing[i] = 0;
-  int nrings;
-  if (connect) {
-    inTheRing[rings[0]] = 1;
-    nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
-  } else {
-    rings[0] = 0;
-    nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
-  }
-  free(inTheRing);
-  return nrings;
-}
-
-static inline int findConnect(int nranks, int* ranks) {
-  for (int i = 0; i<nranks; i++) {
-    if (ranks[i] != -1) return i;
-  }
-  return -1;
-}
-
-int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
-  if (nrings == 0) return 0;
-  if (nrings > MAXCHANNELS) {
-    WARN("Max rings reached, limiting to %d", MAXCHANNELS);
-    nrings = MAXCHANNELS;
-  }
-  // Find existing constraints / connections
-  int connect = 0;
-  for (int r=0; r<nrings; r++) {
-    int start = findConnect(nranks, prev+r*nranks);
-    int end = findConnect(nranks, next+r*nranks);
-    if (start != -1 && end != -1) {
-      rings[r*nranks] = end;
-      rings[r*nranks+1] = start;
-      connect = 1;
-    }
-  }
-
-  // Compute rings
-  ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks);
-  if (matrix == NULL) { WARN("malloc of %ld bytes failed", sizeof(ncclTvalue_t)*nranks*nranks); return 0; }
-  for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++)
-      matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ;
-
-  int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect);
-
-  free(matrix);
-
-  if (oversubscribe || connect) return compNrings;
-
-  if (compNrings && compNrings < nrings && nranks <= 4) {
-    // Try to oversubscribe to get a better result
-    int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
-    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
-    for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
-    int nThreads = *nthreads;
-    int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
-    if (compNrings2 > compNrings*2) {
-      // Oversubscription worked.
-      for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i];
-      compNrings = compNrings2;
-    }
-    free(rings2);
-  }
-
-  // Duplicate the rings for direct NVLink
-  compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
-
-  return compNrings;
-}
-
-int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
-  int nrings = nringsStart;
-  int connect = 0;
-  for (int r=0; r<nrings; r++) {
-    int start = findConnect(nranks, prev+r*nranks);
-    int end = findConnect(nranks, next+r*nranks);
-    if (start != -1 && end != -1) {
-      rings[r*nranks] = end;
-      rings[r*nranks+1] = start;
-      int cur = start;
-      for (int i=2; i<nranks; i++) {
-        int next = (cur+1) % nranks;
-        while (next == end || next == start) next = (next+1) % nranks;
-        if (values[cur*nranks+next] < minScore) {
-          return 0;
-        }
-        rings[r*nranks+i] = next;
-        cur = next;
-      }
-      connect = 1;
-    } else {
-      if (connect == 1 && r > 0) {
-        WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", r);
-        return r;
-      } else {
-        return 0;
-      }
-    }
-  }
-  return nrings;
-}
-
-int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
-  for (int r=0; r<nringsStart; r++) {
-    for (int i=0; i<nranks; i++) {
-      rings[r*nranks+i] = i;
-    }
-  }
-  return nringsStart;
-}
-
-static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
-  for (int score = PATH_SYS+1; score >= minScore; score--) {
-    int best = -1;
-    int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
-    for (int n = 0; n < nranks; n++) {
-      if (inRing[n]) continue;
-      if (values[rank*nranks+n] == score) {
-        if (end == -1) return n;
-        if (values[end*nranks+n] < worst_end_score) {
-          best = n;
-          worst_end_score = values[end*nranks+n];
-        }
-      }
-    }
-    if (best != -1) return best;
-  }
-  return -1;
-}
-
-int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) {
-  int connect = 0;
-  for (int r=0; r<nrings; r++) {
-    int start = findConnect(nranks, prev+r*nranks);
-    int end = findConnect(nranks, next+r*nranks);
-
-    int inRing[MAXGPUS_PCI];
-    for (int i=0; i<nranks; i++) inRing[i] = 0;
-
-    if (start == -1 && end == -1) {
-      if (connect == 1 && r > 0) {
-        WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r);
-        return r;
-      }
-      end = 0;
-      inRing[end] = 1;
-      start = findClosestPci(values, inRing, end, -1, nranks, minScore);
-      if (start == -1) return r;
-    } else if (start == -1 || end == -1) {
-      WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", r);
-      return r;
-    } else {
-      connect = 1;
-    }
-    rings[r*nranks] = end;
-    rings[r*nranks+1] = start;
-    inRing[start] = inRing[end] = 1;
-    int cur = start;
-    for (int i=2; i<nranks; i++) {
-      int next = findClosestPci(values, inRing, cur, end, nranks, minScore);
-      if (next == -1) return r;
-
-      inRing[next] = 1;
-      rings[r*nranks+i] = next;
-      cur = next;
-    }
-    // Check the loop is closing
-    inRing[end] = 0;
-    if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r;
-
-    if (connect == 0) return 1;
-  }
-  return nrings;
-}
-
-ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  if (*nringsRet == 0) return ncclSuccess;
-  int *rings;
-  NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
-  for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
-  int nrings = *nringsRet;
-
-  // NVswitch
-  int nvswitchLinks = 0;
-  int directLinks = 0;
-  for (int rank=0; rank<nranks; rank++) {
-    for (int j=1; j<nranks; j++) {
-      int i = (rank + j) % nranks;
-      ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH;
-      if (j>1 && links != nvswitchLinks) {
-        WARN("Internal error : NVswitch links mismatch");
-        return ncclInternalError;
-      }
-      nvswitchLinks = links;
-    }
-  }
-  if (nvswitchLinks) {
-    // NVSwitch : Connect existing rings
-    int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads);
-    if (nringsConnected > 0) {
-      nrings = nringsConnected;
-    } else {
-      nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks
-      // Or create new ones
-      nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads);
-      // And duplicate them
-      nrings = copyRings(nranks, rings, nrings, nrings*2);
-    }
-    goto end;
-  }
-
-  // point-to-point NVLink
-  for (int rank=0; rank<nranks; rank++) {
-    int links = 0;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t val = values[rank*nranks+i];
-      if (val >= CONNECT_NVSWITCH) continue;
-      links += val/CONNECT_NVLINK;
-    }
-    if (rank == 0) directLinks = links;
-    else directLinks = std::min(directLinks, links);
-  }
-  if (directLinks > 0) {
-    // NVLink : Connect rings or create new ones
-    if (nranks > MAXGPUS_NVLINKP2P) {
-      WARN("Recursive P2P computation cannot work for >8 GPUs");
-      return ncclInternalError;
-    }
-    nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
-    goto end;
-  }
-
-  // PCIe or QPI : Connect rings or create new ones
-  nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore);
-
-end:
-  *nringsRet = nrings;
-  for (int ring = 0; ring<nrings; ring++) {
-    for (int index=0; index<nranks; index++) {
-      int prevIndex = (index - 1 + nranks) % nranks;
-      int nextIndex = (index + 1) % nranks;
-      int curRank = rings[ring*nranks+index];
-      int prevRank = rings[ring*nranks+prevIndex];
-      int nextRank = rings[ring*nranks+nextIndex];
-      if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank;
-      if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank;
-    }
-  }
-
-  free(rings);
  return ncclSuccess;
 }

@@ -462,7 +128,7 @@ end:
  } while (0)

 /* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {

  struct p2pSendResources* resources;
@@ -477,19 +143,20 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
    info.direct = 1;
    info.directPtr = resources->devMem;
    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      return ncclInternalError;
    } else {
      // Enable P2P access
      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        cudaGetLastError();
      } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d(=%d): %d %s",
-             peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+        WARN("failed to peer with device %d(=%lx): %d %s",
+             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
        return ncclInternalError;
      }
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
-          channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
+          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
    }
  } else {
    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -498,12 +165,12 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
    // Map IPC and enable P2P access
    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
    if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
+           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
      return ncclInternalError;
    }
-    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
-        channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
+        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
    //TRACE_DUMP_IPC(&info.devIpc);
  }
  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -512,7 +179,7 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
 }

 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {

  struct p2pRecvResources* resources;
@@ -534,11 +201,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        cudaGetLastError();
      } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d(=%d): %d %s",
-             peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+        WARN("failed to peer with device %d(=%lx): %d %s",
+             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
        return ncclInternalError;
      }
-      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
+      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
    }
  } else {
    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -547,11 +214,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
    // Map IPC and enable P2P access
    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
    if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
+           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
      return ncclInternalError;
    }
-    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
    //TRACE_DUMP_IPC(&info.devIpc);
  }
  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -580,6 +247,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC

  send->conn.buff = remDevMem->buff;
  send->conn.llBuff = remDevMem->llBuff;
+  send->conn.ll128Buff = remDevMem->ll128Buff;
  send->conn.tail = &remDevMem->tail;
  send->conn.opCountRem = &remDevMem->opCount;
  send->conn.head = &resources->devMem->head;
@@ -610,6 +278,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto

  recv->conn.buff = resources->devMem->buff;
  recv->conn.llBuff = resources->devMem->llBuff;
+  recv->conn.ll128Buff = resources->devMem->ll128Buff;
  recv->conn.tail = &resources->devMem->tail;
  recv->conn.opCountLoc = &resources->devMem->opCount;
  recv->conn.head = &remDevMem->head;
@@ -638,7 +307,6 @@ ncclResult_t p2pRecvFree(void* resources) {
 struct ncclTransport p2pTransport = {
  "P2P",
  p2pCanConnect,
-  p2pGetRings,
  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
 };
@@ -4,13 +4,8 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
-#include "utils.h"
-#include "transport.h"
-#include "param.h"
+#include "comm.h"
 #include "shm.h"
-#include <unistd.h>
-#include <cuda_runtime.h>

 struct shmConnectInfo {
  uint64_t pidHash;
@@ -40,98 +35,29 @@ struct shmRecvResources {

 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);

-/* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
-  *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
-  return ncclSuccess;
-}
+/* Determine if two peers can communicate with SHM */
+ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  *ret = 0;

-static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) {
-  for (int rank = 0; rank<nranks; rank++) {
-    if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
-  }
-  return -1;
-}
+  if (ncclParamShmDisable() == 1) return ncclSuccess;

-static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) {
-  for (int rank = nranks-1; rank>=0; rank--) {
-    if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
-  }
-  return -1;
-}
+  // Same host?
+  TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
+  if (info1->hostHash != info2->hostHash) return ncclSuccess;

-#define MAXGROUPS 16
+  // Common /dev/shm (between containers) ?
+  TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev);
+  if (info1->shmDev != info2->shmDev) return ncclSuccess;
+
+  *ret = 1;

-ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
-  int nGroups = groups[nranks-1] + 1;
-  int starts[MAXGROUPS];
-  int ends[MAXGROUPS];
-  for (int ring = 0; ring<*nringsRet; ring++) {
-    int startGroup = -1, endGroup = -1;
-    for (int group = 0; group<nGroups; group++) {
-      int start = -1;
-      int end = -1;
-      int nranksInGroup = 0;
-      for (int rank=0; rank<nranks; rank++) {
-        if (groups[rank] != group) continue;
-        nranksInGroup++;
-        if (prev[ring*nranks+rank] != -1) {
-          if (start != -1) {
-            WARN("Multiple starts found in group");
-          }
-          start = rank;
-          startGroup = group;
-        }
-        if (next[ring*nranks+rank] != -1) {
-          if (end != -1) {
-            WARN("Multiple ends found in group");
-          }
-          end = rank;
-          endGroup = group;
-        }
-      }
-      if (nranksInGroup == 1) {
-        start = end = groupFirst(nranks, groups, group, -1);
-      } else {
-        if (start == -1)
-          start = groupFirst(nranks, groups, group, end);
-        if (end == -1)
-          end = groupLast(nranks, groups, group, start);
-      }
-      if (start == -1 || end == -1) {
-        *nringsRet = ring;
-        return ncclSuccess;
-      }
-      starts[group] = start;
-      ends[group] = end;
-    }
-    if (endGroup == -1 || startGroup == -1) {
-      startGroup = 0;
-      endGroup = nGroups-1;
-      // Close the loop
-      next[ring*nranks+ends[endGroup]] = starts[startGroup];
-      prev[ring*nranks+starts[startGroup]] = ends[endGroup];
-    }
-    int group = startGroup;
-    for (int i=0; i<nGroups-2; i++) {
-      int nextGroup = (group+1)%nGroups;
-      if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups;
-      next[ring*nranks+ends[group]] = starts[nextGroup];
-      prev[ring*nranks+starts[nextGroup]] = ends[group];
-      group = nextGroup;
-    }
-    // Connect with the last
-    next[ring*nranks+ends[group]] = starts[endGroup];
-    prev[ring*nranks+starts[endGroup]] = ends[group];
-  }
  return ncclSuccess;
 }

 #define MAX_SHM_NAME_LEN 1024

 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {

  struct shmSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
@@ -149,13 +75,13 @@ ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));

-  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
  memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
  return ncclSuccess;
 }

-ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
  struct shmRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;
@@ -194,6 +120,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
  send->transportResources = resources;
  send->conn.buff = resources->devRemHostMem->buff;
  send->conn.llBuff = resources->devRemHostMem->llBuff;
+  send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
  send->conn.tail = &resources->devRemHostMem->tail;
  send->conn.opCountRem = &resources->devRemHostMem->opCount;

@@ -218,6 +145,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto

  recv->conn.buff = resources->devHostMem->buff;
  recv->conn.llBuff = resources->devHostMem->llBuff;
+  recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
  recv->conn.tail = &resources->devHostMem->tail;
  recv->conn.opCountLoc = &resources->devHostMem->opCount;
  return ncclSuccess;
@@ -242,7 +170,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
 struct ncclTransport shmTransport = {
  "SHM",
  shmCanConnect,
-  shmGetRings,
  { shmSendSetup, shmSendConnect, shmSendFree, NULL },
  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
 };