Merge pull request #172 from wenkaidu/topo_expl

Add topology explorer [ROCm/rccl commit: 5b3856f2ed]
2020-02-20 15:16:55 -08:00
@@ -30,9 +30,15 @@ const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "QPI", "NET" };
 const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
 #endif

+#ifdef TOPO_EXPL
+#include "model.h"
+extern NodeModel *node_model;
+#endif
+
 /******************************************************************/
 /******************* Graph Creation Functions *********************/
 /******************************************************************/
+#ifndef TOPO_EXPL
 static int getNumaId(char *path) {
  char npath[PATH_MAX];
  snprintf(npath, PATH_MAX, "%s/numa_node", path);
@@ -59,6 +65,15 @@ static ncclResult_t getPciPath(char* busId, char** path) {
  }
  return ncclSuccess;
 }
+#else
+// Topology-explorer build: the NUMA node of a PCI path is answered by the
+// simulated node model instead of being read from "<path>/numa_node".
+static int getNumaId(char *path) {
+  const int numaId = node_model->getNumaId(path);
+  return numaId;
+}
+
+// Topology-explorer build: the PCI path for a GPU bus id comes from the
+// simulated node model; the model's status code is handed back unchanged.
+static ncclResult_t getPciPath(char* busId, char** path) {
+  const ncclResult_t status = node_model->getGpuPciPath(busId, path);
+  return status;
+}
+#endif

 // Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
 ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
@@ -102,6 +117,7 @@ int interCpuWidth = 0;
 int cpuPciWidth = 0;
 int p2pPciWidth = 0;

+#ifndef TOPO_EXPL
 static ncclResult_t getCpuWidths() {
  // Check if already detected
  if (interCpuWidth + cpuPciWidth + p2pPciWidth) return ncclSuccess;
@@ -182,6 +198,14 @@ static ncclResult_t getCpuWidths() {
  INFO(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth);
  return ncclSuccess;
 }
+#else
+// Topology-explorer build: load the three global width variables
+// (interCpuWidth, cpuPciWidth, p2pPciWidth) from the simulated node model.
+// Returns the model's error code if the lookup fails, ncclSuccess otherwise.
+static ncclResult_t getCpuWidths() {
+  // Start with an empty name so the trace below never prints indeterminate bytes.
+  char cpu[256] = "";
+  // Propagate a model failure instead of silently continuing with stale widths.
+  NCCLCHECK(node_model->getCpuWidths(cpu, &interCpuWidth, &cpuPciWidth, &p2pPciWidth));
+  TRACE(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth);
+  return ncclSuccess;
+}
+#endif

 static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
  NCCLCHECK(getCpuWidths());
@@ -272,7 +296,11 @@ ncclResult_t ncclTopoConnectXGMI(struct ncclComm* comm, struct ncclTopoSystem* s
      uint32_t link_type, hops;
      int cudaDev1 = busIdToCudaDev(comm->peerInfo[gpu1->rank].busId);
      int cudaDev2 = busIdToCudaDev(comm->peerInfo[gpu2->rank].busId);
+#ifndef TOPO_EXPL
      if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) {
+#else
+      if (node_model->getLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) {
+#endif
        if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) {
          NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu2, LINK_NVL, minWidth));
        }
@@ -424,6 +452,7 @@ ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTop
 // Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
 #include <glob.h>
 #define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
+#ifndef TOPO_EXPL
 uint64_t getIbGuid(char* path) {
  uint64_t guid = 0ULL;
  char guidPath[PATH_MAX];
@@ -446,6 +475,11 @@ uint64_t getIbGuid(char* path) {
  }
  return guid;
 }
+#else
+// Topology-explorer build: the GUID for a NIC PCI path is looked up in the
+// simulated node model.
+uint64_t getIbGuid(char* path) {
+  const uint64_t guid = node_model->getIbGuid(path);
+  return guid;
+}
+#endif

 struct netInfo {
  char* path;
@@ -0,0 +1,19 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Locate HIP: use /opt/rocm/hip when it exists, otherwise fall back to a
+# path relative to this directory.
+HIP_PATH ?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+HIP_PATH = ../../..
+endif
+HIPCC = $(HIP_PATH)/bin/hipcc
+
+EXE = topo_expl
+# TOPO_EXPL switches the shared graph sources over to the simulated node model.
+CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE
+
+files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/graph/search.cc ../../src/graph/connect.cc
+
+# "all" and "clean" name actions, not files; declaring them phony keeps make
+# from skipping them when a file with the same name happens to exist.
+.PHONY: all clean
+
+all: $(EXE)
+
+# Single-step build: every source is compiled and linked in one hipcc call.
+$(EXE): $(files)
+	$(HIPCC) $(CXXFLAGS) $^ -o $@
+
+clean:
+	rm -f *.o $(EXE)
@@ -0,0 +1,419 @@
+/*
+Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef MODEL_H_
+#define MODEL_H_
+
+/*
+ * Static description of a node's CPU: a display name plus the three link
+ * widths handed out by getCpuWidths().
+ * The name pointer is borrowed, not copied; it must outlive this object.
+ */
+class CpuDevices {
+private:
+  const char *cpuName;   // borrowed; null when default-constructed
+  int interCpuWidth;
+  int cpuPciWidth;
+  int p2pPciWidth;
+
+public:
+  CpuDevices(const char *cpuname, const int intercpuwidth, const int cpupciwidth, const int p2ppciwidth) :
+    cpuName(cpuname), interCpuWidth(intercpuwidth), cpuPciWidth(cpupciwidth), p2pPciWidth(p2ppciwidth) {}
+
+  // Placeholder with no name and all widths zero.
+  CpuDevices() : cpuName(0), interCpuWidth(0), cpuPciWidth(0), p2pPciWidth(0) {}
+
+  /*
+   * Writes the stored widths to interCpu / cpuPci / p2pPci and copies the CPU
+   * name into `name`. `name` must be large enough for the stored name; no
+   * size is passed in, so the copy is unbounded (the topo.cc caller supplies
+   * a 256-byte buffer).
+   *
+   * Returns ncclSuccess, or ncclInternalError (with `name` set to "") when no
+   * name was configured, e.g. on a default-constructed object. The widths
+   * are written in both cases.
+   */
+  ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
+    *interCpu = interCpuWidth;
+    *cpuPci = cpuPciWidth;
+    *p2pPci = p2pPciWidth;
+    if (cpuName == 0) {
+      // strcpy from a null source is undefined behaviour; report it instead.
+      name[0] = '\0';
+      WARN("CPU model has no name (default-constructed?)");
+      return ncclInternalError;
+    }
+    strcpy(name, cpuName);
+    return ncclSuccess;
+  }
+};
+
+/*
+ * Static description of the GPUs of one simulated node.
+ * All tables are borrowed (not copied) and are indexed by device number:
+ * bus id, sysfs-style PCI path and NUMA id per GPU, plus a row-major
+ * distance matrix with a fixed row length of 8.
+ */
+class GpuDevices {
+private:
+  // The connection matrix always has 8 entries per row, whatever nGpus is.
+  enum { kConnMatrixDim = 8, kNoConnection = 255 };
+
+  int nGpus;
+  const uint64_t *busIds;
+  const char **gpuPciPaths;
+  const int *gpuNumaIds;
+  const int *connMatrix;
+
+  // True when `dev` can be used as a row/column index of connMatrix.
+  bool inMatrix(int dev) const {
+    return connMatrix != 0 && dev >= 0 && dev < kConnMatrixDim;
+  }
+
+public:
+  GpuDevices(const int ngpus, const uint64_t *busids, const char **gpupcipaths, const int *gpunumaids, const int *connmatrix) :
+    nGpus(ngpus), busIds(busids), gpuPciPaths(gpupcipaths), gpuNumaIds(gpunumaids), connMatrix(connmatrix) {}
+
+  // Placeholder describing zero GPUs.
+  GpuDevices () : nGpus(0), busIds(0), gpuPciPaths(0), gpuNumaIds(0), connMatrix(0) {}
+
+  int getnDevs() { return nGpus; }
+
+  // No range check: `dev` must be a valid index into the bus id table.
+  uint64_t getBusId(int dev) { return busIds[dev]; }
+
+  /*
+   * Finds the GPU whose bus id formats to `busId` and returns a copy of its
+   * PCI path in a freshly malloc'ed PATH_MAX buffer owned by the caller.
+   * On any failure *path is set to null, nothing is allocated, and an error
+   * code is returned.
+   */
+  ncclResult_t getPciPath(char* busId, char** path) {
+    *path = 0;
+    char tempBusId[] = "0000:00:00.0";
+    int found = -1;
+    // Look the device up before allocating so a miss cannot leak the buffer.
+    for (int i = 0; i < nGpus; i++) {
+      NCCLCHECK(int64ToBusId(busIds[i], tempBusId));
+      if (strcmp(busId, tempBusId) == 0) {
+        found = i;
+        break;
+      }
+    }
+    if (found < 0) {
+      WARN("Could not find real path of %s", busId);
+      return ncclSystemError;
+    }
+    char *buf = (char *)malloc(PATH_MAX);
+    if (buf == 0) {
+      WARN("Could not allocate path buffer for %s", busId);
+      return ncclSystemError;
+    }
+    // strncpy zero-fills the tail; terminate explicitly in case of truncation.
+    strncpy(buf, gpuPciPaths[found], PATH_MAX - 1);
+    buf[PATH_MAX - 1] = '\0';
+    *path = buf;
+    return ncclSuccess;
+  }
+
+  // Returns 0 when the matrix entry is 255 or an index is outside the matrix,
+  // 1 otherwise.
+  int p2pCanConnect(int device1, int device2) {
+    if (!inMatrix(device1) || !inMatrix(device2))
+      return 0;
+    int dist = connMatrix[device1*kConnMatrixDim+device2];
+    return dist == kNoConnection ? 0 : 1;
+  }
+
+  /*
+   * Decodes the matrix entry for (device1, device2):
+   *   multiple of 15 -> link type 4, hops = dist/15
+   *   multiple of 20 -> link type 2, hops = dist/20
+   *   multiple of 36 -> link type 1, hops = dist/36
+   * checked in that order. The topo.cc caller compares the link type with
+   * HSA_AMD_LINK_INFO_TYPE_XGMI; types 2 and 1 are presumably the HSA PCIe
+   * and QPI values -- TODO confirm against hsa_ext_amd.h.
+   * NOTE(review): 255 (the "no connection" value of p2pCanConnect) is a
+   * multiple of 15 and therefore decodes as link type 4 with 17 hops.
+   *
+   * Returns hipSuccess on a recognised distance. Returns
+   * hipErrorInvalidDevice for an index outside the matrix and
+   * hipErrorInvalidValue for a distance matching no pattern; the outputs are
+   * zeroed in those cases instead of being left indeterminate.
+   */
+  hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
+    *linktype = 0;
+    *hopcount = 0;
+    if (!inMatrix(device1) || !inMatrix(device2))
+      return hipErrorInvalidDevice;
+
+    int dist = connMatrix[device1*kConnMatrixDim+device2];
+
+    if (dist%15 == 0) {
+      *linktype = 4;
+      *hopcount = dist/15;
+    }
+    else if (dist%20 == 0) {
+      *linktype = 2;
+      *hopcount = dist/20;
+    }
+    else if (dist%36 == 0) {
+      *linktype = 1;
+      *hopcount = dist/36;
+    }
+    else {
+      return hipErrorInvalidValue;
+    }
+    return hipSuccess;
+  }
+
+  // Returns the NUMA id of the GPU whose PCI path equals `path`, or -1 when
+  // no GPU has that path.
+  virtual int getNumaId(char *path) {
+    for (int n = 0; n < nGpus; n++) {
+      if (strcmp(path, gpuPciPaths[n]) == 0)
+        return gpuNumaIds[n];
+    }
+    return -1;
+  }
+};
+
+/*
+ * Static description of the network devices of one simulated node.
+ * All tables are borrowed (not copied) and indexed by device number.
+ */
+class NetDevices {
+private:
+  int nNetDevs;
+  const char **netPciPaths;
+  const uint64_t *netGuids;    // IB ports on same card share the same GUID
+  const int *netNumaIds;
+
+public:
+  NetDevices(const int nnetdevs, const char **netpcipaths, const uint64_t *netguids, const int *netnumaids) :
+    nNetDevs(nnetdevs), netPciPaths(netpcipaths), netGuids(netguids), netNumaIds(netnumaids) {}
+
+  // Placeholder describing zero network devices.
+  NetDevices() : nNetDevs(0), netPciPaths(0), netGuids(0), netNumaIds(0) {}
+
+  int getnDevs() { return nNetDevs; }
+
+  /*
+   * Returns a copy of the PCI path of device `dev` in a freshly malloc'ed
+   * PATH_MAX buffer owned by the caller.
+   * On any failure (index out of range, allocation failure) *path is set to
+   * null, nothing is allocated, and ncclSystemError is returned.
+   */
+  ncclResult_t getPciPath(int dev, char** path) {
+    *path = 0;
+    // Validate before allocating; negative indices are rejected as well.
+    if (dev < 0 || dev >= nNetDevs) {
+      WARN("Could not find real path of %d", dev);
+      return ncclSystemError;
+    }
+    char *buf = (char *)malloc(PATH_MAX);
+    if (buf == 0) {
+      WARN("Could not allocate path buffer for %d", dev);
+      return ncclSystemError;
+    }
+    // strncpy zero-fills the tail; terminate explicitly in case of truncation.
+    strncpy(buf, netPciPaths[dev], PATH_MAX - 1);
+    buf[PATH_MAX - 1] = '\0';
+    *path = buf;
+    return ncclSuccess;
+  }
+
+  // Returns the NUMA id of the NIC whose PCI path equals `path`, or -1 when
+  // no NIC has that path.
+  virtual int getNumaId(char *path) {
+    for (int n = 0; n < nNetDevs; n++) {
+      if (strcmp(path, netPciPaths[n]) == 0)
+        return netNumaIds[n];
+    }
+    return -1;
+  }
+
+  // Returns the GUID of the NIC whose PCI path equals `path`; warns and
+  // returns 0 when no NIC has that path.
+  uint64_t getIbGuid(char* path) {
+    for (int n = 0; n < nNetDevs; n++) {
+      if (strcmp(path, netPciPaths[n]) == 0)
+        return netGuids[n];
+    }
+    WARN("Invalid IB path %s", path);
+    return 0;
+  }
+};
+
+/*
+ * One simulated node: its CPU, GPU and network-device descriptions plus the
+ * per-node bookkeeping that NetworkModel fills in (node id, rank range,
+ * hashes). Most methods simply forward to the matching device table.
+ */
+class NodeModel {
+private:
+  CpuDevices cpus;
+  GpuDevices gpus;
+  NetDevices netdevs;
+
+public:
+  int nodeId;
+  int currRank;
+  int firstRank;
+  uint64_t hostHash;  // auto-generated
+  uint64_t pidHash;   // auto-generated
+  char description[256];
+
+  // Device index of `rank` within this node (rank relative to firstRank).
+  int rankToCudaDev(int rank) { return rank - firstRank; }
+
+  int getnGpus() { return gpus.getnDevs(); }
+
+  int getnNetDevs() { return netdevs.getnDevs(); }
+
+  ncclResult_t getGpuPciPath(char* busId, char** path) {
+    return gpus.getPciPath(busId, path);
+  }
+
+  ncclResult_t getNetPciPath(int dev, char** path) {
+    return netdevs.getPciPath(dev, path);
+  }
+
+  uint64_t getGpuBusId(int dev) {
+    return gpus.getBusId(dev);
+  }
+
+  int p2pCanConnect(int device1, int device2) { return gpus.p2pCanConnect(device1, device2); }
+
+  hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
+    return gpus.getLinkTypeAndHopCount(device1, device2, linktype, hopcount);
+  }
+
+  uint64_t getIbGuid(char* path) {
+    return netdevs.getIbGuid(path);
+  }
+
+  // The model always allows these two connection kinds.
+  int shmCanConnect(int device1, int device2) { return 1; }
+  int netCanConnect(int device1, int device2) { return 1; }
+
+  // Looks `path` up among the GPUs first, then among the network devices.
+  // Warns and returns 0 when neither table knows the path.
+  virtual int getNumaId(char *path) {
+    int numa = gpus.getNumaId(path);
+    if (numa != -1) return numa;
+    numa = netdevs.getNumaId(path);
+    if (numa != -1) return numa;
+    WARN("Invalid path %s for getNumaId", path);
+    return 0;
+  }
+
+  virtual ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
+    return cpus.getCpuWidths(name, interCpu, cpuPci, p2pPci);
+  }
+
+  // Builds a node from its three device tables. `desc` is copied, truncated
+  // to fit `description`, and always NUL-terminated; a null `desc` yields an
+  // empty description. Bookkeeping fields start at zero.
+  NodeModel(CpuDevices cpu, GpuDevices gpu, NetDevices net, const char *desc) :
+    cpus(cpu), gpus(gpu), netdevs(net),
+    nodeId(0), currRank(0), firstRank(0), hostHash(0), pidHash(0) {
+      // strncpy alone does not terminate when desc fills the whole buffer.
+      strncpy(description, desc ? desc : "", sizeof(description) - 1);
+      description[sizeof(description) - 1] = '\0';
+  }
+
+  // Empty node: no devices, zeroed bookkeeping, empty description.
+  NodeModel() : nodeId(0), currRank(0), firstRank(0), hostHash(0), pidHash(0) {
+    description[0] = '\0';
+  }
+
+  // Virtual because the class has virtual methods and may be derived from.
+  virtual ~NodeModel() {}
+};
+
+/*
+ * A collection of up to NCCL_TOPO_MAX_NODES simulated nodes. Ranks are
+ * assigned contiguously in insertion order, one rank per GPU.
+ */
+class NetworkModel {
+private:
+  int nNodes;
+  int nRanks;
+  NodeModel nodes[NCCL_TOPO_MAX_NODES];
+
+public:
+  // Appends a copy of `node`, assigning its node id, its first rank and
+  // random host/pid hashes. When the model is already full the node is
+  // dropped with a warning instead of being written past the array.
+  void AddNode(NodeModel node) {
+    if (nNodes >= NCCL_TOPO_MAX_NODES) {
+      WARN("Cannot add node: model already holds %d nodes", nNodes);
+      return;
+    }
+    NodeModel& slot = nodes[nNodes];
+    slot = node;
+    slot.nodeId = nNodes;
+    slot.firstRank = nRanks;
+    slot.hostHash = ((uint64_t)rand() << 32) | rand();
+    slot.pidHash = ((uint64_t)rand() << 32) | rand();
+    nNodes++;
+    nRanks += node.getnGpus();
+  }
+
+  int GetNNodes() { return nNodes; }
+
+  int GetNRanks() { return nRanks; }
+
+  // Returns the node owning `rank` (the last node whose firstRank is <= rank)
+  // and records `rank` as that node's current rank. Returns null when `rank`
+  // is outside [0, nRanks).
+  NodeModel* GetNode(int rank) {
+    if (rank < 0 || rank >= nRanks)
+      return 0;
+
+    // Scan from the last node down so the highest matching firstRank wins.
+    for (int node_id = nNodes-1; node_id >= 0; node_id--) {
+      if (rank >= nodes[node_id].firstRank) {
+        nodes[node_id].currRank = rank;
+        return nodes+node_id;
+      }
+    }
+    return 0;
+  }
+
+  NetworkModel() : nNodes(0), nRanks(0) {}
+};
+
+/* Model data for an 8-GPU node whose GPU paths sit under two PCI roots
+ * (pci0000:17 for the first four, pci0000:3a for the last four), followed by
+ * one-NIC and two-NIC variants of the network tables.
+ * GpuDevices and NetDevices index their tables by device number, so entry i
+ * of each GPU table (or each NIC table) presumably describes the same
+ * device -- confirm where the tables are passed to the constructors. */
+const static uint64_t busIds_8[] = { 0x1d000, 0x20000, 0x23000, 0x26000, 0x3f000, 0x43000, 0x46000, 0x49000 };
+
+const static char* gpuPciPaths_8[] = {
+  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:08.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0",
+  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:0c.0/0000:1e:00.0/0000:1f:00.0/0000:20:00.0",
+  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:10.0/0000:21:00.0/0000:22:00.0/0000:23:00.0",
+  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:14.0/0000:24:00.0/0000:25:00.0/0000:26:00.0",
+  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:04.0/0000:3d:00.0/0000:3e:00.0/0000:3f:00.0",
+  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:0c.0/0000:41:00.0/0000:42:00.0/0000:43:00.0",
+  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:10.0/0000:44:00.0/0000:45:00.0/0000:46:00.0",
+  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:14.0/0000:47:00.0/0000:48:00.0/0000:49:00.0",
+};
+
+const static int gpuPciNumaIds_8[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; // every GPU reports NUMA id 0
+
+const static char* netPciPaths_1[] = { // single NIC under pci0000:17
+  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
+};
+
+const static char* netPciPaths_1_1[] = { // single NIC under pci0000:3a
+  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
+};
+
+const static uint64_t netGuids_1[] = {
+  0xb8599f030007053aL,
+};
+
+const static int netPciNumaIds_1[] = { 0 };
+
+const static char* netPciPaths_2[] = { // the two single-NIC paths above, combined
+  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
+  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
+};
+
+const static uint64_t netGuids_2[] = {
+  0xb8599f030007053aL,
+  0x506b4b030027bbf2L,
+};
+
+const static int netPciNumaIds_2[] = { 0, 0 };
+/* Second 8-GPU data set ("rome" prefix): each GPU path is rooted at one of
+ * six PCI roots (00, 20, 60, a0, c0, e0), and the NUMA table gives id 0 to
+ * the first four GPUs and id 4 to the last four. One-NIC and two-NIC network
+ * tables follow.
+ * NOTE(review): the two NIC NUMA tables are spelled "rom_" rather than
+ * "rome_"; renaming them would require updating every user of the names. */
+const static uint64_t rome_busIds_8[] = { 0x63000, 0x23000, 0x26000, 0x03000, 0xe3000, 0xc3000, 0xc6000, 0xa3000 };
+
+const static char* rome_gpuPciPaths_8[] = {
+  "/sys/devices/pci0000:60/0000:60:03.1/0000:61:00.0/0000:62:00.0/0000:63:00.0",
+  "/sys/devices/pci0000:20/0000:20:01.1/0000:21:00.0/0000:22:00.0/0000:23:00.0",
+  "/sys/devices/pci0000:20/0000:20:03.1/0000:24:00.0/0000:25:00.0/0000:26:00.0",
+  "/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0",
+  "/sys/devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/0000:e2:00.0/0000:e3:00.0",
+  "/sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0/0000:c2:00.0/0000:c3:00.0",
+  "/sys/devices/pci0000:c0/0000:c0:03.1/0000:c4:00.0/0000:c5:00.0/0000:c6:00.0",
+  "/sys/devices/pci0000:a0/0000:a0:03.1/0000:a1:00.0/0000:a2:00.0/0000:a3:00.0",
+};
+
+const static int rome_gpuPciNumaIds_8[] = { 0, 0, 0, 0, 4, 4, 4, 4 };
+
+const static char* rome_netPciPaths_1[] = {
+  "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
+};
+
+const static uint64_t rome_netGuids_1[] = {
+  0xb8599f030007053aL,
+};
+
+const static int rom_netPciNumaIds_1[] = { 0 };
+
+const static char* rome_netPciPaths_2[] = {
+  "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
+  "/sys/devices/pci0000:80/0000:80:01.1/0000:81:00.0",
+};
+
+const static uint64_t rome_netGuids_2[] = {
+  0xb8599f030007053aL,
+  0x506b4b030027bbf2L,
+};
+
+const static int rom_netPciNumaIds_2[] = { 0, 4 };
+/* GPU-to-GPU distance matrices: 64 ints each, read by GpuDevices as
+ * connMatrix[device1*8+device2] (row-major, 8 entries per row).
+ * GpuDevices::getLinkTypeAndHopCount decodes an entry as
+ *   multiple of 15 -> link type 4, hops = entry/15
+ *   multiple of 20 -> link type 2, hops = entry/20
+ *   multiple of 36 -> link type 1, hops = entry/36
+ * (checked in that order), and GpuDevices::p2pCanConnect treats 255 as
+ * "cannot connect". Every diagonal entry here is 0. */
+const int conn_mat_pcie[64] = { // every off-diagonal entry is 40
+  0 , 40, 40, 40, 40, 40, 40, 40,
+  40, 0 , 40, 40, 40, 40, 40, 40,
+  40, 40, 0 , 40, 40, 40, 40, 40,
+  40, 40, 40, 0 , 40, 40, 40, 40,
+  40, 40, 40, 40, 0 , 40, 40, 40,
+  40, 40, 40, 40, 40, 0 , 40, 40,
+  40, 40, 40, 40, 40, 40, 0 , 40,
+  40, 40, 40, 40, 40, 40, 40, 0 ,
+};
+
+const int conn_mat_4p2h[64] = { // two groups of four: 15/30 inside a group, 40 across groups
+  0 , 15, 15, 30, 40, 40, 40, 40,
+  15, 0 , 30, 15, 40, 40, 40, 40,
+  15, 30, 0 , 15, 40, 40, 40, 40,
+  30, 15, 15, 0 , 40, 40, 40, 40,
+  40, 40, 40, 40, 0 , 15, 15, 30,
+  40, 40, 40, 40, 15, 0 , 30, 15,
+  40, 40, 40, 40, 15, 30, 0 , 15,
+  40, 40, 40, 40, 30, 15, 15, 0 ,
+};
+
+const int conn_mat_8p6l[64] = { // each row: six entries of 15 and one of 30
+  0 , 15, 15, 15, 15, 30, 15, 15,
+  15, 0 , 15, 15, 30, 15, 15, 15,
+  15, 15, 0 , 15, 15, 15, 15, 30,
+  15, 15, 15, 0 , 15, 15, 30, 15,
+  15, 30, 15, 15, 0 , 15, 15, 15,
+  30, 15, 15, 15, 15, 0 , 15, 15,
+  15, 15, 15, 30, 15, 15, 0 , 15,
+  15, 15, 30, 15, 15, 15, 15, 0 ,
+};
+
+const int conn_mat_8p6l_1[64] = { // same counts per row as conn_mat_8p6l, with the 30s placed differently
+ 0 , 15, 15, 30, 15, 15, 15, 15,
+ 15, 0 , 30, 15, 15, 15, 15, 15,
+ 15, 30, 0 , 15, 15, 15, 15, 15,
+ 30, 15, 15, 0 , 15, 15, 15, 15,
+ 15, 15, 15, 15, 0 , 15, 15, 30,
+ 15, 15, 15, 15, 15, 0 , 30, 15,
+ 15, 15, 15, 15, 15, 30, 0 , 15,
+ 15, 15, 15, 15, 30, 15, 15, 0 ,
+};
+
+const int conn_mat_rome[64] = { // two groups of four: 40 inside a group, 72 across groups
+  0 , 40, 40, 40, 72, 72, 72, 72,
+  40, 0 , 40, 40, 72, 72, 72, 72,
+  40, 40, 0 , 40, 72, 72, 72, 72,
+  40, 40, 40, 0 , 72, 72, 72, 72,
+  72, 72, 72, 72, 0 , 40, 40, 40,
+  72, 72, 72, 72, 40, 0 , 40, 40,
+  72, 72, 72, 72, 40, 40, 0 , 40,
+  72, 72, 72, 72, 40, 40, 40, 0 ,
+};
+
+#endif
@@ -0,0 +1,267 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_fp16.h>
+
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 5
+#define NCCL_PATCH 6
+#define NCCL_SUFFIX ""
+
+#define NCCL_VERSION_CODE 2506
+#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
+
+#define RCCL_BFLOAT16 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclNumResults              =  6 } ncclResult_t;
+
+/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library
+ */
+ncclResult_t  ncclGetVersion(int *version);
+ncclResult_t pncclGetVersion(int *version);
+
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated to a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
+ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+
+/* Frees resources associated with communicator object, but waits for any operations
+ * that might still be running on the device. */
+ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t  ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
+/* Returns a human-readable error message. */
+const char*  ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
+
+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
+/* Gets the number of ranks in the communicator clique. */
+ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+/* Returns the cuda device number associated with the communicator. */
+ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+/* Returns the user-ordered "rank" associated with the communicator. */
+ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+
+/* Reduction operation selector */
+typedef enum { ncclSum        = 0,
+               ncclProd       = 1,
+               ncclMax        = 2,
+               ncclMin        = 3,
+               ncclNumOps     = 4 } ncclRedOp_t;
+
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+               ncclNumTypes   = 10 } ncclDataType_t;
+
+/*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or need to use Group Semantics (see
+ * below).
+ */
+
+/*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except for root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+
+/*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitly in place.
+ */
+ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+
+/*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+
+/*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
+
+/*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    hipStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    hipStream_t stream);
+
+/*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
+
+/*
+ * Group semantics
+ *
+ * When managing multiple GPUs from a single thread, and since NCCL collective
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
+ * different ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * with ncclGroupStart/ncclGroupEnd.
+ */
+
+/*
+ * Group Start
+ *
+ * Start a group call. All subsequent calls to NCCL may not block due to
+ * inter-CPU synchronization.
+ */
+ncclResult_t  ncclGroupStart();
+ncclResult_t pncclGroupStart();
+
+/*
+ * Group End
+ *
+ * End a group call. Wait for all calls since ncclGroupStart to complete
+ * before returning.
+ */
+ncclResult_t  ncclGroupEnd();
+ncclResult_t pncclGroupEnd();
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end include guard
@@ -0,0 +1,44 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef UTILS_H_
+#define UTILS_H_
+/* Per-rank record holding a peer-info entry together with a communicator
+ * pointer.
+ * NOTE(review): presumably the element that bootstrapAllGather (declared
+ * further down in this header) exchanges between ranks -- confirm. */
+struct allGather1Data_t {
+  struct ncclPeerInfo peerInfo;
+  struct ncclComm* comm;
+};
+/* Per-rank summary: compute-capability fields, an nvlink flag, a channel
+ * count, one sub-record each for the tree and ring graphs (sameChannels,
+ * speedIntra, speedInter, nvlink) and the rank's topoRanks.
+ * NOTE(review): passed to initTransportsRank_1 / initTransportsRank_3
+ * (declared further down in this header); which side fills each field is
+ * not visible here -- confirm against the implementation. */
+struct allGather3Data_t {
+  int cudaCompCap;
+  int fullCudaCompCap;
+  int nvlink;
+  int nChannels;
+  struct {
+    int sameChannels;
+    int speedIntra;
+    int speedInter;
+    int nvlink;
+  } tree;
+  struct {
+    int sameChannels;
+    int speedIntra;
+    int speedInter;
+    int nvlink;
+  } ring;
+  struct ncclTopoRanks topoRanks;
+};
+
+ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data);
+
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
+  struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
+
+ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
+  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
+
+#endif
@@ -0,0 +1,216 @@
+/*
+Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "nccl.h"
+#include "channel.h"
+#include "nvmlwrap.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "group.h"
+#include "net.h"
+#include "graph.h"
+#include "argcheck.h"
+#include <sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <hip/hip_runtime.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "model.h"
+
+extern NodeModel *node_model;
+
+/* Report the number of network devices of the active node model. */
+static ncclResult_t dummyNetDevices(int* ndev) {
+  const int modelNetDevs = node_model->getnNetDevs();
+  *ndev = modelNetDevs;
+  return ncclSuccess;
+}
+
+/* Ask the active node model for the PCI path of network device `dev`.
+ * Whatever the model call yields is discarded; success is always reported. */
+static ncclResult_t dummyNetPciPath(int dev, char** path) {
+  (void)node_model->getNetPciPath(dev, path);
+  return ncclSuccess;
+}
+
+/* Stand-in network plugin for the explorer. The initializer is positional:
+ * the name is "IB", only the device-count and PCI-path callbacks are
+ * provided, and every other entry is left as 0.
+ * NOTE(review): slot meanings follow the ncclNet_t declaration, which is not
+ * visible here - confirm against net.h before reordering. */
+ncclNet_t ncclNetDummy = {
+  "IB",
+  0,
+  dummyNetDevices,
+  dummyNetPciPath,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0,
+  0
+};
+
+/* Active network plugin: always the dummy above. */
+ncclNet_t* ncclNet = &ncclNetDummy;
+
+/* Stub: performs no lookup, leaves *device untouched and reports success. */
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+  return ncclSuccess;
+}
+
+/* Map a GPU bus id to its device index within the active node model.
+ * Emits a WARN and yields 0 when no GPU of the model carries that bus id. */
+int busIdToCudaDev(int64_t busId) {
+  int dev = 0;
+  while (dev < node_model->getnGpus() && node_model->getGpuBusId(dev) != busId)
+    dev++;
+
+  if (dev < node_model->getnGpus())
+    return dev;
+
+  WARN("Invalid busId %lx", busId);
+  return 0;
+}
+
+/* P2P reachability check: *ret is 0 for peers on different hosts, otherwise
+ * whatever the node model reports for the two device indices. */
+ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  *ret = 0;
+  const bool sameHost = (info1->hostHash == info2->hostHash);
+  if (sameHost) {
+    const int firstDev = busIdToCudaDev(info1->busId);
+    const int secondDev = busIdToCudaDev(info2->busId);
+    *ret = node_model->p2pCanConnect(firstDev, secondDev);
+  }
+  return ncclSuccess;
+}
+
+/* Send-side P2P setup for the explorer: only logs which P2P flavour would be
+ * used. Two peers of the same process on the same device are rejected with
+ * ncclInternalError. */
+ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+  // Different processes: IPC path.
+  if (myInfo->pidHash != peerInfo->pidHash) {
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
+        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+    return ncclSuccess;
+  }
+
+  // Same process and same device: reported, then refused.
+  if (myInfo->cudaDev == peerInfo->cudaDev) {
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    return ncclInternalError;
+  }
+
+  // Same process, different devices: direct pointer access.
+  INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
+      channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+  return ncclSuccess;
+}
+
+/* Receive-side P2P setup: a no-op in the explorer, always reports success. */
+ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+  return ncclSuccess;
+}
+
+/* P2P transport descriptor: name, connectivity check, then the send and recv
+ * callback groups. Only the setup callback of each group is provided. */
+struct ncclTransport p2pTransport = {
+  "P2P",
+  p2pCanConnect,
+  { p2pSendSetup, NULL, NULL, NULL },
+  { p2pRecvSetup, NULL, NULL, NULL }
+};
+
+/* SHM reachability check: *ret is 0 for peers on different hosts, otherwise
+ * whatever the node model reports for the two device indices. */
+ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  *ret = 0;
+  const bool sameHost = (info1->hostHash == info2->hostHash);
+  if (sameHost) {
+    const int firstDev = busIdToCudaDev(info1->busId);
+    const int secondDev = busIdToCudaDev(info2->busId);
+    *ret = node_model->shmCanConnect(firstDev, secondDev);
+  }
+  return ncclSuccess;
+}
+
+/* Send-side SHM setup for the explorer: logs the connection and succeeds. */
+ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph,
+    struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector* send,
+    int buffSize, int channelId) {
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory",
+      channelId,
+      myInfo->rank, myInfo->busId,
+      peerInfo->rank, peerInfo->busId);
+  return ncclSuccess;
+}
+
+/* Receive-side SHM setup: a no-op in the explorer, always reports success. */
+ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+  return ncclSuccess;
+}
+
+/* SHM transport descriptor: name, connectivity check, then the send and recv
+ * callback groups. Only the setup callback of each group is provided. */
+struct ncclTransport shmTransport = {
+  "SHM",
+  shmCanConnect,
+  { shmSendSetup, NULL, NULL, NULL },
+  { shmRecvSetup, NULL, NULL, NULL }
+};
+
+/* NET reachability check: defers entirely to the node model, keyed by rank. */
+ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  const int reachable = node_model->netCanConnect(info1->rank, info2->rank);
+  *ret = reachable;
+  return ncclSuccess;
+}
+
+/* Send-side NET setup for the explorer: looks up the network device chosen by
+ * the graph for this channel and logs the connection. GDR is never enabled on
+ * the send side here. */
+ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+  const int useGdr = 0;
+  int netDev;
+  NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &netDev));
+
+  const char* gdrSuffix = useGdr ? "/GDRDMA" : "";
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev,
+      gdrSuffix);
+  return ncclSuccess;
+}
+
+NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
+
+/* Receive-side NET setup for the explorer: looks up the network device chosen
+ * by the graph, decides whether GPU Direct RDMA would be used based on the
+ * GPU-to-NIC distance versus NET_GDR_LEVEL, and logs the connection. */
+ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+  int netDev;
+  NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &netDev));
+
+  // GDR is only worthwhile when the NIC is closer than the configured level.
+  const int gdrThreshold = ncclParamNetGdrLevel();
+  int hops;
+  NCCLCHECK(ncclTopoNetDistance(topo, myInfo->busId, netDev, &hops));
+
+  int useGdr = 1;
+  if (hops >= gdrThreshold) {
+    useGdr = 0;
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), myInfo->busId, netDev, hops, gdrThreshold);
+  }
+
+  const char* gdrSuffix = useGdr ? "/GDRDMA" : "";
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev,
+      gdrSuffix);
+  return ncclSuccess;
+}
+
+/* NET transport descriptor: name, connectivity check, then the send and recv
+ * callback groups. Only the setup callback of each group is provided. */
+struct ncclTransport netTransport = {
+  "NET",
+  netCanConnect,
+  { netSendSetup, NULL, NULL, NULL },
+  { netRecvSetup, NULL, NULL, NULL }
+};
+
+/* Transport table, holding copies of the descriptors above in the order
+ * P2P, SHM, NET. Code that walks it by increasing index tries P2P first. */
+struct ncclTransport ncclTransports[NTRANSPORTS] = {
+  p2pTransport,
+  shmTransport,
+  netTransport,
+};
@@ -0,0 +1,231 @@
+/*
+Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "nccl.h"
+#include "channel.h"
+#include "nvmlwrap.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "group.h"
+#include "net.h"
+#include "graph.h"
+#include "argcheck.h"
+#include "cpuset.h"
+#include <sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <hip/hip_runtime.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <cstdio>
+#include <iostream>
+#include <cstring>
+#include "model.h"
+#include "utils.h"
+
+NodeModel *node_model;
+
+/* Return the argument that follows `option` in [begin, end), or 0 when the
+ * option is absent or is the last argument. */
+char* getCmdOption(char ** begin, char ** end, const std::string & option) {
+    char ** const hit = std::find(begin, end, option);
+    if (hit == end) return 0;
+
+    char ** const value = hit + 1;
+    if (value == end) return 0;
+    return *value;
+}
+
+/* Tell whether any argument in [begin, end) equals `option`. */
+bool cmdOptionExists(char** begin, char** end, const std::string& option) {
+    for (char** arg = begin; arg != end; ++arg) {
+        if (*arg == option) return true;
+    }
+    return false;
+}
+
+/* Human-readable description of each model, indexed by model_id. The order
+ * must match the `switch(model_id)` cases in main(); the trailing NULL
+ * terminates the list for the usage printout. */
+const char *model_descriptions[] = {
+  "4 nodes with 8 GPUs PCIe 1 NIC",
+  "4 nodes with 8 GPUs PCIe 2 NIC",
+  "2 nodes VEGA20 4P1H",
+  "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC",
+  "single node gfx908 4P3L",
+  "single node gfx908 8P6L",
+  "single node gfx908 8P6L Alt. Connection",
+  "single node 8 GPUs PCIe on Rome",
+  "4 nodes 8 GPUs PCIe 2 NICs on Rome",
+  "3 nodes 8 GPUs PCIe + 1 Rome 8 GPUs PCIe + 2 nodes gfx908 4P3L",
+  NULL,
+};
+
+/*
+ * Topology explorer entry point.
+ *
+ * Usage: ./topo_expl -m model_id
+ *
+ * Builds the simulated network selected by model_id, then runs the topology
+ * detection and graph search for every rank in turn, switching the global
+ * node_model to the node that owns the rank before each step. Without -m the
+ * list of models is printed. A model_id that is not a number, or that names no
+ * model, is reported and the program exits. Any stage that reports an error
+ * ends the run with that error code.
+ */
+int main(int argc,char* argv[])
+{
+  struct ncclComm *comm;
+
+  if (!cmdOptionExists(argv, argv + argc, "-m")) {
+    printf("Usage: ./topo_expl -m model_id\n");
+    printf("List of model_id:\n");
+    for (int i = 0; model_descriptions[i] != NULL; i++)
+      printf("  %d: %s\n", i, model_descriptions[i]);
+    exit(0);
+  }
+
+  int model_id = 0;
+  char *mi = getCmdOption(argv, argv + argc, "-m");
+  if (mi) {
+    // Parse strictly: atol() would silently map "abc" to model 0.
+    char *end = NULL;
+    errno = 0;
+    long parsed = strtol(mi, &end, 10);
+    if (end == mi || *end != '\0' || errno == ERANGE || (long)(int)parsed != parsed) {
+      printf("Invalid model_id %s\n", mi);
+      exit(0);
+    }
+    model_id = (int)parsed;
+  }
+
+  // CPU, GPU and NIC devices on Skylake
+  CpuDevices skylake("Skylake", 12, 12, 12);
+  GpuDevices vg20_pcie(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_pcie);
+  GpuDevices vg20_4p1h(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
+  GpuDevices vg20_4p2h(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
+  GpuDevices gfx908_4p3l(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
+  GpuDevices gfx908_8p6l(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
+  GpuDevices gfx908_8p6l_1(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l_1);
+  NetDevices nic_1(1, netPciPaths_1, netGuids_1, netPciNumaIds_1);
+  NetDevices nic_1_1(1, netPciPaths_1_1, netGuids_1, netPciNumaIds_1);
+  NetDevices nic_2(2, netPciPaths_2, netGuids_2, netPciNumaIds_2);
+
+  // CPU, GPU and NIC devices on Rome
+  CpuDevices rome("Rome", 24, 24, 24);
+  GpuDevices vg20_pcie_rome(8, rome_busIds_8, rome_gpuPciPaths_8, rome_gpuPciNumaIds_8, conn_mat_rome);
+  NetDevices nic_1_rome(1, rome_netPciPaths_1, rome_netGuids_1, rom_netPciNumaIds_1);
+  NetDevices nic_2_rome(2, rome_netPciPaths_2, rome_netGuids_2, rom_netPciNumaIds_2);
+
+  // 8 GPUs PCIe 1 NIC
+  NodeModel model_8pcie_1nic(skylake, vg20_pcie, nic_1, "Skylake 8 GPUs PCIe");
+
+  // 8 GPUs PCIe 2 NIC
+  NodeModel model_8pcie_2nic(skylake, vg20_pcie, nic_2, "Skylake 8 GPUs PCIe 2 NIC");
+
+  // VEGA20 4P1H, use VEGA20 4P2H model
+  NodeModel model_vg20_4p1h_1nic(skylake, vg20_4p1h, nic_1, "Skylake VEGA20 4P1H");
+
+  // VEGA20 GPUs XGMI 4P2H 1 NIC
+  NodeModel model_vg20_4p2h_1nic(skylake, vg20_4p2h, nic_1_1, "Skylake VEGA20 4P2H");
+
+  // gfx908 4P3L
+  NodeModel model_gfx908_4p_1nic(skylake, gfx908_4p3l, nic_1, "Skylake gfx908 4P3L");
+
+  // gfx908 8P6L
+  NodeModel model_gfx908_8p_1nic(skylake, gfx908_8p6l, nic_1, "Skylake gfx908 8P6L");
+
+  // gfx908 8P6L alternative connection
+  NodeModel model_gfx908_8p_1nic_1(skylake, gfx908_8p6l_1, nic_1, "Skylake gfx908 8P6L Alt. Connection");
+
+  // 8 GPUs PCIe on Rome
+  NodeModel model_8pcie_1nic_rome(rome, vg20_pcie_rome, nic_1_rome, "Rome 8 GPUs PCIe");
+
+  // 8 GPUs PCIe 2 NICs on Rome
+  NodeModel model_8pcie_2nic_rome(rome, vg20_pcie_rome, nic_2_rome, "Rome 8 GPUs PCIe 2 NICs");
+
+  NetworkModel network;
+
+  // Case numbers must stay in sync with model_descriptions[].
+  switch(model_id) {
+    case 0:
+      for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_1nic);
+      break;
+    case 1:
+      for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic);
+      break;
+    case 2:
+      for (int i = 0; i < 2; i ++) network.AddNode(model_vg20_4p1h_1nic);
+      break;
+    case 3:
+      for (int i = 0; i < 4; i ++) network.AddNode(model_vg20_4p2h_1nic);
+      break;
+    case 4:
+      network.AddNode(model_gfx908_4p_1nic);
+      break;
+    case 5:
+      network.AddNode(model_gfx908_8p_1nic);
+      break;
+    case 6:
+      network.AddNode(model_gfx908_8p_1nic_1);
+      break;
+    case 7:
+      network.AddNode(model_8pcie_1nic_rome);
+      break;
+    case 8:
+      for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic_rome);
+      break;
+    case 9:
+      for (int i = 0; i < 3; i ++) network.AddNode(model_8pcie_1nic);
+      network.AddNode(model_8pcie_1nic_rome);
+      for (int i = 0; i < 2; i ++) network.AddNode(model_gfx908_4p_1nic);
+      break;
+    default:
+      printf("Invalid model_id %d\n", model_id);
+      exit(0);
+  }
+
+  printf("Generating topology using %d: %s\n", model_id, model_descriptions[model_id]);
+
+  int nranks = network.GetNRanks();
+  int nnodes = network.GetNNodes();
+
+  printf("nnodes = %d, nranks = %d\n", nnodes, nranks);
+  for (int i = 0; i < nranks; i++) {
+    node_model = network.GetNode(i);
+    assert(node_model!=0);
+    printf("Rank %d: node %d (%s) GPU busId %lx\n", i, node_model->nodeId,
+      node_model->description, node_model->getGpuBusId(node_model->rankToCudaDev(i)));
+  }
+
+  NCCLCHECK(ncclCalloc(&comm, nranks));
+
+  struct allGather1Data_t *allGather1Data;
+  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+
+  struct allGather3Data_t *allGather3Data;
+  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+
+  // Stage 1: every rank publishes its peer info.
+  for (int i = 0; i < nranks; i++) {
+    comm[i].rank = i;
+    comm[i].nRanks = nranks;
+    node_model = network.GetNode(i);
+    assert(node_model!=0);
+    NCCLCHECK(bootstrapAllGather(&comm[i], allGather1Data));
+  }
+
+  // The two graphs are shared by all ranks and updated in place by each stage.
+  struct ncclTopoGraph treeGraph, ringGraph;
+
+  // Stage 2: per-rank topology detection and graph search.
+  for (int i = 0; i < nranks; i++) {
+    node_model = network.GetNode(i);
+    assert(node_model!=0);
+    NCCLCHECK(initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph));
+  }
+
+  // Stage 3: per-rank processing of the gathered graph results.
+  for (int i = 0; i < nranks; i++) {
+    node_model = network.GetNode(i);
+    assert(node_model!=0);
+    NCCLCHECK(initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph));
+  }
+
+  free(allGather3Data);
+  free(allGather1Data);
+
+  free(comm);
+  printf("Done generating topology using %d: %s\n", model_id, model_descriptions[model_id]);
+
+  return 0;
+}
@@ -0,0 +1,420 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "channel.h"
+#include "nvmlwrap.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "group.h"
+#include "net.h"
+#include "graph.h"
+#include "argcheck.h"
+#include <sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <hip/hip_runtime.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "model.h"
+#include "utils.h"
+
+extern NodeModel *node_model;
+
+NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+
+// Compute capability reported by the explorer: fixed at 1.0, encoded as
+// major*10 + minor.
+int ncclCudaCompCap() {
+  const int capMajor = 1;
+  const int capMinor = 0;
+  return 10 * capMajor + capMinor;
+}
+
+/* Format a packed bus id as "dddd:bb:dd.f" into busId. The caller supplies a
+ * buffer large enough for the text; no size is checked here. */
+ncclResult_t int64ToBusId(int64_t id, char* busId) {
+  const int64_t domainBits = id >> 20;
+  const int64_t busBits = (id & 0xff000) >> 12;
+  const int64_t deviceBits = (id & 0xff0) >> 4;
+  const int64_t functionBits = id & 0xf;
+  sprintf(busId, "%04lx:%02lx:%02lx.%01lx", domainBits, busBits, deviceBits, functionBits);
+  return ncclSuccess;
+}
+
+/* Parse a textual bus id into a packed integer.
+ *
+ * The ':' and '.' separators are skipped, hex digits are collected up to the
+ * first other character, and the collected digits are converted as one base-16
+ * number into *id. An input with no hex digits yields 0.
+ * Returns ncclSuccess, or the allocation error from ncclCalloc. */
+ncclResult_t busIdToInt64(char* busId, int64_t* id) {
+  const int size = strlen(busId);
+  char* hexStr;
+  // +1 for the terminator: an input without separators (or an empty one)
+  // fills all `size` bytes, so the NUL would otherwise land past the buffer.
+  NCCLCHECK(ncclCalloc(&hexStr, size+1));
+  int hexOffset = 0;
+  for (int i=0; i<size; i++) {
+    char c = busId[i];
+    if (c == '.' || c == ':') continue;
+    if ((c >= '0' && c <= '9') ||
+        (c >= 'A' && c <= 'F') ||
+        (c >= 'a' && c <= 'f')) {
+      hexStr[hexOffset++] = busId[i];
+    } else break;
+  }
+  hexStr[hexOffset] = '\0';
+  *id = strtol(hexStr, NULL, 16);
+  free(hexStr);
+  return ncclSuccess;
+}
+
+int ncclDebugLevel = -1;
+
+/* Initialise ncclDebugLevel from the NCCL_DEBUG environment variable.
+ * Does nothing once a level has been set. An unset variable selects
+ * NCCL_LOG_NONE; an unrecognised value leaves the level at -1. Names are
+ * matched case-insensitively. */
+void ncclDebugInit() {
+  if (ncclDebugLevel != -1) return;
+
+  const char* requested = getenv("NCCL_DEBUG");
+  if (requested == NULL) {
+    ncclDebugLevel = NCCL_LOG_NONE;
+    return;
+  }
+
+  static const struct { const char* name; int level; } knownLevels[] = {
+    { "VERSION", NCCL_LOG_VERSION },
+    { "WARN",    NCCL_LOG_WARN },
+    { "INFO",    NCCL_LOG_INFO },
+    { "ABORT",   NCCL_LOG_ABORT },
+    { "TRACE",   NCCL_LOG_TRACE },
+  };
+  const size_t nKnown = sizeof(knownLevels) / sizeof(knownLevels[0]);
+  for (size_t k = 0; k < nKnown; k++) {
+    if (strcasecmp(requested, knownLevels[k].name) == 0) {
+      ncclDebugLevel = knownLevels[k].level;
+      return;
+    }
+  }
+}
+
+/* Explorer replacement for the NCCL logger.
+ *
+ * Prints "[nodeId:currRank] <message>" to stdout, taking both ids from the
+ * active node_model. TRACE messages are printed only when NCCL_DEBUG selects
+ * TRACE; every other level is always printed. A WARN additionally reports the
+ * call site on stderr and aborts the process. Messages longer than the
+ * internal buffer are truncated. `flags` is not used. */
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
+  if (ncclDebugLevel == -1) ncclDebugInit();
+  if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return;
+  char buffer[1024];
+  int prefixLen = snprintf(buffer, sizeof(buffer),
+                   "[%d:%d] ", node_model->nodeId, node_model->currRank);
+  // Keep the write position inside the buffer whatever snprintf reported.
+  if (prefixLen < 0) { prefixLen = 0; buffer[0] = '\0'; }
+  size_t len = (size_t)prefixLen;
+  if (len >= sizeof(buffer)) len = sizeof(buffer) - 1;
+  va_list args;
+  va_start(args, fmt);
+  // Bounded: an unbounded vsprintf could run past the 1024-byte buffer.
+  vsnprintf(buffer+len, sizeof(buffer)-len, fmt, args);
+  va_end(args);
+  printf("%s\n", buffer);
+  if (level == NCCL_LOG_WARN) {
+    fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n",
+            node_model->nodeId, node_model->currRank, filefunc, line);
+    abort();
+  }
+}
+
+/* Explorer stand-in for the first all-gather: fills this rank's peerInfo slot
+ * of allGather1Data from the active node model. gdrSupport is fixed to 1 and
+ * shmDev to 0x19. */
+ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) {
+  struct ncclPeerInfo* mine = &allGather1Data[comm->rank].peerInfo;
+
+  mine->rank = comm->rank;
+  mine->cudaDev = node_model->rankToCudaDev(comm->rank);
+  mine->gdrSupport = 1;
+  mine->hostHash = node_model->hostHash;
+  mine->pidHash = node_model->pidHash;
+  mine->shmDev = 0x19;
+  mine->busId = node_model->getGpuBusId(node_model->rankToCudaDev(comm->rank));
+  return ncclSuccess;
+}
+
+/* First stage of transport initialisation for one rank.
+ *
+ * Copies every rank's peer info into comm->peerInfo (rejecting two ranks that
+ * share host and bus id), builds, trims and prints the topology, computes the
+ * tree and ring graphs, and publishes the results in allGather3Data[rank].
+ *
+ * comm            communicator of the rank being processed
+ * allGather1Data  peer info of all ranks, already filled
+ * allGather3Data  output; only the entry of this rank is written
+ * treeGraph       written in place with the tree search result
+ * ringGraph       written in place with the ring search result
+ *
+ * Returns ncclSuccess, ncclInvalidUsage for a duplicate GPU, or the error of
+ * the first failing topology call. */
+ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
+  struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph) {
+  // We use 3 AllGathers
+  // 1. { peerInfo, comm }
+  // 2. ConnectTransport[nranks], ConnectValue[nranks]
+  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
+  //NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+
+  // AllGather1 - begin
+  //struct allGather1Data_t *allGather1Data;
+  //NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+  //allGather1Data[rank].comm = comm;
+  struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
+  //NCCLCHECK(fillInfo(comm, myInfo, commHash));
+  //NCCLCHECK(bootstrapAllGather(comm, allGather1Data));
+
+  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
+      // busId is 64-bit: print it with %lx like every other log site.
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
+      return ncclInvalidUsage;
+    }
+  }
+  // AllGather1 data is used again below
+  // AllGather1 - end
+
+  // Topo detection / System graph creation
+  NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
+  // Compute paths between GPUs and NICs
+  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+  // Remove inaccessible GPUs and unused NICs
+  NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
+  // Recompute paths after trimming
+  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+  // Compute max speed to accelerate search
+  NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
+  // Print final topology
+  NCCLCHECK(ncclTopoPrint(comm->topo));
+
+  // Get rings and trees
+  //struct ncclTopoGraph treeGraph;
+  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+  treeGraph.crossNic = ncclParamCrossNic();
+  // We communicate only half the data between node with trees on 2 nodes.
+  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+  //struct ncclTopoGraph ringGraph;
+  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
+  ringGraph.crossNic = ncclParamCrossNic();
+  NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
+
+  // AllGather3 - begin
+  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+  allGather3Data[rank].nvlink = treeGraph.nvlink;
+  allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
+  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
+  allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
+  allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
+  allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
+  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
+  allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
+  allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
+  allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
+
+  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
+  //INFO(NCCL_GRAPH, "%d: nvlink %d nChannels %d tree.sameChannels %d tree.speedIntra %d tree.speedInter %d tree.nvlink %d ring.sameChannels %d ring.speedIntra %d ring.speedInter %d ring.nvlink %d",
+  //  rank, allGather3Data[rank].nvlink, allGather3Data[rank].nChannels, allGather3Data[rank].tree.sameChannels, allGather3Data[rank].tree.speedIntra, allGather3Data[rank].tree.speedInter, allGather3Data[rank].tree.nvlink,
+  //  allGather3Data[rank].ring.sameChannels, allGather3Data[rank].ring.speedIntra, allGather3Data[rank].ring.speedInter, allGather3Data[rank].ring.nvlink);
+  //INFO(NCCL_GRAPH, "  ringRecv %d ringSend %d ringPrev %d ringNext %d treeUpRecv %d treeUpSend %d treeDnRecv %d treeDnSend %d",
+  //  allGather3Data[rank].topoRanks.ringRecv[0], allGather3Data[rank].topoRanks.ringSend[0], allGather3Data[rank].topoRanks.ringPrev[0], allGather3Data[rank].topoRanks.ringNext[0],
+  //  allGather3Data[rank].topoRanks.treeUpRecv[0], allGather3Data[rank].topoRanks.treeUpSend[0], allGather3Data[rank].topoRanks.treeDnRecv[0], allGather3Data[rank].topoRanks.treeDnSend[0]);
+  return ncclSuccess;
+}
+
+/* Walk the transport table in order and run the setup callback of the first
+ * transport whose canConnect reports nonzero for this pair of peers.
+ * type == 1 selects the send side of the transport, any other value the
+ * receive side. Warns and returns ncclInternalError when none is usable. */
+template <int type>
+static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
+  for (int idx = 0; idx < NTRANSPORTS; idx++) {
+    struct ncclTransport* candidate = &ncclTransports[idx];
+
+    int usable = 0;
+    NCCLCHECK(candidate->canConnect(&usable, topo, graph, myInfo, peerInfo));
+    if (!usable) continue;
+
+    struct ncclTransportComm* side = (type == 1) ? &candidate->send : &candidate->recv;
+    connector->transportComm = side;
+    NCCLCHECK(side->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
+    return ncclSuccess;
+  }
+  WARN("No transport found !");
+  return ncclInternalError;
+}
+
+/* Select a transport for every listed peer of `channel` that is not connected
+ * yet, receive side first and then send side, and afterwards mark those
+ * connectors as connected (send side first). Peer entries equal to -1 are
+ * ignored. The skip counters only feed the trace output. */
+static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
+  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
+  struct ncclConnect connect;
+
+  // Pass 1: transport selection for the receive connectors.
+  for (int r = 0; r < nrecv; r++) {
+    const int peer = peerRecv[r];
+    if (peer == -1) continue;
+    struct ncclConnector* rx = &channel->peers[peer].recv;
+    if (rx->connected) { nSkippedRecv++; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, rx, channel->buffSize, channel->id));
+  }
+
+  // Pass 2: transport selection for the send connectors.
+  for (int s = 0; s < nsend; s++) {
+    const int peer = peerSend[s];
+    if (peer == -1) continue;
+    struct ncclConnector* tx = &channel->peers[peer].send;
+    if (tx->connected) { nSkippedSend++; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, tx, channel->buffSize, channel->id));
+  }
+
+  // Pass 3: flag the send connectors as connected.
+  for (int s = 0; s < nsend; s++) {
+    const int peer = peerSend[s];
+    if (peer == -1) continue;
+    struct ncclConnector* tx = &channel->peers[peer].send;
+    if (tx->connected) { nSkippedSend++; continue; }
+    memset(&connect, 0, sizeof(connect));
+    tx->connected = 1;
+  }
+
+  // Pass 4: flag the receive connectors as connected.
+  for (int r = 0; r < nrecv; r++) {
+    const int peer = peerRecv[r];
+    if (peer == -1) continue;
+    struct ncclConnector* rx = &channel->peers[peer].recv;
+    if (rx->connected) { nSkippedRecv++; continue; }
+    memset(&connect, 0, sizeof(connect));
+    rx->connected = 1;
+  }
+
+  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+  return ncclSuccess;
+}
+
+/* Prepare channel `channelid` of `comm`: record its id, allocate the ring's
+ * user-rank table and the per-peer array (both with comm->nRanks entries),
+ * and point each peer's send and recv connector back at `comm`. */
+ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
+  struct ncclChannel* chan = &comm->channels[channelid];
+  chan->id = channelid;
+
+  // Ring index to user rank table.
+  NCCLCHECK(ncclCalloc(&chan->ring.userRanks, comm->nRanks));
+
+  // Communication structures with peers.
+  NCCLCHECK(ncclCalloc(&chan->peers, comm->nRanks));
+  size_t peerIdx = 0;
+  while (peerIdx < comm->nRanks) {
+    chan->peers[peerIdx].send.comm = comm;
+    chan->peers[peerIdx].recv.comm = comm;
+    ++peerIdx;
+  }
+
+  return ncclSuccess;
+}
+
+/* Initialise channel `channelId` and store the ring order in its userRanks
+ * table, rotated so that the table starts at `rank`. When `rank` is not in
+ * ringRanks the order is copied unrotated. */
+static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+  NCCLCHECK(initChannel(comm, channelId));
+
+  struct ncclRing* ring = &comm->channels[channelId].ring;
+
+  // Position of `rank` in the ring; equals nranks when it is absent.
+  int start = 0;
+  while (start < nranks && ringRanks[start] != rank) start++;
+
+  for (int pos = 0; pos < nranks; pos++) {
+    const int src = (pos + start) % nranks;
+    ring->userRanks[pos] = ringRanks[src];
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph,
+  struct ncclTopoGraph& ringGraph) {
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
+  // Determine nNodes, firstRanks, ...
+  int* nodesFirstRank;
+  NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
+  for (int i=0; i<nranks; i++) {
+    int node = -1;
+    int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
+    for (int n=0; n<comm->nNodes; n++) {
+      if (nodesFirstRank[n] == firstRank) node = n;
+    }
+    if (node == -1) {
+      node = comm->nNodes++;
+      nodesFirstRank[node] = firstRank;
+    }
+    if (i == comm->rank) comm->node = node;
+  }
+
+  // Determine the minimum CUDA Compute capability of all GPUs
+  int myCompCap = allGather3Data[rank].cudaCompCap;
+  int minCompCap = myCompCap, maxCompCap = myCompCap;
+  for (int i = 0; i < nranks; i++) {
+    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+    maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
+  }
+
+  comm->nvlink = 1;
+  for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;
+
+  int nChannelsOrig = comm->nChannels;
+  struct ncclTopoRanks** allTopoRanks;
+  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
+  for (int i=0; i<nranks; i++) {
+    allTopoRanks[i] = &allGather3Data[i].topoRanks;
+    // Make sure we align all ranks so that the tuning is consistent across ranks
+    treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
+    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
+    treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
+    treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
+    treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
+    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
+    ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
+    ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
+    ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
+  }
+
+  if (comm->nChannels < nChannelsOrig) {
+    // We started duplicating channels during Preset(), so we need to move the
+    // duplicated channels since we have removed some.
+    for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
+  }
+
+  int *rings;
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
+
+  char line[1024];
+  sprintf(line, "nodesFirstRank: ");
+  int offset = strlen(line);
+  for (int i=0; i<comm->nNodes; i++) {
+    sprintf(line+offset, "%d ", nodesFirstRank[i]);
+    offset = strlen(line);
+  }
+  INFO(NCCL_INIT, "%s", line);
+
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+
+  free(allTopoRanks);
+  free(nodesFirstRank);
+
+  // AllGather3 - end
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
+
+  line[0]='\0';
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclTree* treeUp = &comm->channels[c].treeUp;
+    struct ncclTree* treeDn = &comm->channels[c].treeDn;
+    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
+        c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
+        treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
+  }
+  line[1023] = '\0';
+  INFO(NCCL_INIT, "Trees%s", line);
+
+  free(rings);
+
+  // Done with AllGather1 data
+  //free(allGather1Data);
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+
+  // Connect with prev/next for each ring
+  struct ncclConnect *connect;
+  NCCLCHECK(ncclCalloc(&connect, 2));
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
+    if (comm->nRanks == 1) continue;
+    NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
+    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
+  }
+  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
+  free(connect);
+
+  return ncclSuccess;
+}