Rework Rome detection and add multiple network ports models (#274)

* Rework Rome detection and add multiple network ports models * Remove unused opCount in p2p transport
2020-10-07 13:37:36 -07:00
parent 88a062342b
commit ae008fd2db
11 changed files with 616 additions and 352 deletions
@@ -255,7 +255,7 @@ int ncclMaxNchannels() {
  return maxNchannels;
 }

-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn) {
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
  int nranks = comm->nRanks;
@@ -292,7 +292,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
  int nc = nChannels*2;
  if (gcn == 908) nc = std::max(nc, 4);
  if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*4;
-  if (comm->topo->nodes[NET].count && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*comm->topo->nodes[NET].count;
+  if (!nnets) nnets = comm->topo->nodes[NET].count;
+  if (nnets && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*nnets;
  int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));

  // Duplication should be complete now
@@ -0,0 +1,106 @@
+/*
+Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#define MAX_ROME_GPUS 8
+#define MAX_ROME_NICS 2
+
+// Description of one Rome platform layout. The same structure holds both the
+// built-in reference models and the layout detected on the running system.
+// Field order matters: the models below use designated initializers.
+struct rcclRomeModel {
+  int nGpus;                                   // number of GPUs
+  int nCpus;                                   // number of CPU (NUMA) nodes
+  int nNics;                                   // number of NICs, at most MAX_ROME_NICS
+  int nLinks;                                  // number of XGMI links per GPU
+  int64_t gpuIds[MAX_ROME_GPUS];               // topology node id of each GPU
+  int64_t gpuNuma[MAX_ROME_GPUS];              // id of the CPU node each GPU is attached to
+  int64_t nicNuma[MAX_ROME_NICS];              // id of the CPU node each NIC is attached to
+  int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS]; // row-major nGpus x nGpus matrix, 1 = direct XGMI link
+  const char *pattern;                         // two digits per CPU node: GPU count, then NIC count
+  const char *ringBase;                        // rings in parseGraph() format, in this model's GPU numbering
+};
+
+// Reference model: 8 GPUs on 4 NUMA nodes, 1 NIC (NUMA node 2), 2 XGMI links per GPU.
+static struct rcclRomeModel rome_model_22 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
+  .gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, },
+  .gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, },
+  .nicNuma = { 2, },
+  .connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, },
+  .pattern = "10302120",
+  .ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
+};
+
+// Reference model: 8 GPUs on 4 NUMA nodes, 2 NICs (NUMA nodes 0 and 3), 2 XGMI links per GPU.
+static struct rcclRomeModel rome_model_25 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
+  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
+  .nicNuma = { 0, 3, },
+  .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
+  .pattern = "11303011",
+  .ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
+};
+
+// Reference model: same counts, GPU ids and pattern as rome_model_25, but with a
+// different XGMI connection matrix and therefore different rings.
+static struct rcclRomeModel rome_model_27 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
+  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
+  .nicNuma = { 0, 3, },
+  .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
+  .pattern = "11303011",
+  .ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
+};
+
+// Reference model: 8 GPUs on 4 NUMA nodes, 1 NIC (NUMA node 2), 3 XGMI links per GPU.
+static struct rcclRomeModel rome_model_29 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
+  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
+  .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
+  .nicNuma = { 2, },
+  .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
+  .pattern = "10302120",
+  .ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
+};
+
+// Reference model: 8 GPUs on 8 NUMA nodes, 2 NICs (NUMA nodes 0 and 6), 2 XGMI links per GPU.
+static struct rcclRomeModel rome_model_31 = {
+  .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
+  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
+  .nicNuma = { 0, 6, },
+  .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
+  .pattern = "0110201010200110",
+  .ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
+};
+
+// Reference model: same counts, GPU ids and pattern as rome_model_31, but with a
+// different XGMI connection matrix and therefore different rings.
+static struct rcclRomeModel rome_model_33 = {
+  .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
+  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
+  .nicNuma = { 0, 6, },
+  .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
+  .pattern = "0110201010200110",
+  .ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
+};
+
+// Table of known Rome reference models. parseRome4P2H() scans it in order and
+// uses the first entry whose counts, pattern and GPU permutation match the
+// detected system, so more specific models must not be shadowed by earlier ones.
+static struct rcclRomeModel romeTopoModels[] = {
+  rome_model_22,
+  rome_model_25,
+  rome_model_27,
+  rome_model_29,
+  rome_model_31,
+  rome_model_33,
+};
@@ -10,6 +10,8 @@
 #include "topo.h"
 #include "xml.h"
 #include <math.h>
+#include <sys/time.h>
+#include "rome_models.h"

 // Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
 // max speed.
@@ -663,64 +665,111 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
 }

 /* Parse user defined rings. Format is like :
- * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
- * Rings with a non-matching number of ranks are ignored so we can provide
+ * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
+ * Network interfaces can be optionally specified by N prefix.
+ * Rings with a non-matching number of gpus are ignored so we can provide
 * rings for multiple cases.
 */
-#define MAX_ENV_RANKS 512
-static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
-  int ranks[MAX_ENV_RANKS];
+// Parse a ring description string into graph->intra / graph->inter.
+//   str     : rings separated by '|', GPU numbers separated by non-digits,
+//             optional NICs given as N<digit> (exactly two per ring when present)
+//   system  : supplies the GPU count, the dev -> rank translation and maxWidth
+//   graph   : receives intra (ranks), inter (NICs), nChannels, speedIntra/speedInter
+//   gpu_map : optional table applied to every parsed GPU number before it is
+//             translated into a rank (may be NULL)
+//   nnets   : number of NICs available for inter; 0 leaves inter untouched
+//             unless the ring names its NICs explicitly (such rings are then skipped)
+//   net_map : optional table applied to the default NIC assignment (may be NULL)
+// Rings that do not list exactly ngpus distinct, in-range GPUs are skipped.
+// Parsing stops (keeping the channels accepted so far) when a ring lists more
+// than MAX_ROME_GPUS GPUs or more than two NICs.
+// Returns ncclInternalError when a (remapped) GPU number matches no device.
+static ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int nnets, int* net_map ) {
+  int gpus[MAX_ROME_GPUS];
  int nChannels = 0;
-  int rank = 0;
+  int gpu = 0;
  int offset = 0;
-  int status = 0; // 0 : between numbers, 1 : inside number
+  int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET
+  int nets[2];
+  int net = 0;
+  int ngpus = system->nodes[GPU].count;
  do {
-    int digit = str[offset] - '0';
-    if (digit >= 0 && digit <= 9) {
+    if (str[offset] == 'N') {
      if (status == 0) {
-        ranks[rank] = digit;
-        status = 1;
-      } else {
-        ranks[rank] = ranks[rank]*10+digit;
+        status = 2;
      }
    } else {
-      if (status == 1) {
-        rank++;
-        if (rank == MAX_ENV_RANKS) goto end;
-      }
-      status = 0;
-      if (str[offset] == '|' || str[offset] == '\0') {
-        // Ignore if ngpus doesn't match
-        if (rank != ngpus) goto newchannel;
-
-        for (int r=0; r<ngpus; r++) {
-          int rank = ranks[r];
-          // Ignore if ranks are out of bounds
-          if (rank < 0 || rank >= ngpus) goto newchannel;
-          // Ignore if ranks are duplicate
-          for (int i=0; i<r; i++)
-            if (ranks[i] == rank) goto newchannel;
-
-          channels[nChannels*ngpus+r] = rank;
+      int digit = str[offset] - '0';
+      if (digit >= 0 && digit <= 9) {
+        if (status == 0) {
+          // Check the bound before storing: gpus[] only holds MAX_ROME_GPUS
+          // entries, so a longer ring must stop parsing before the write.
+          if (gpu >= MAX_ROME_GPUS) goto end;
+          gpus[gpu] = digit;
+          status = 1;
+        } else if (status == 2) {
+          // Same for nets[]: a third N<digit> token would write past the array.
+          if (net >= 2) goto end;
+          nets[net] = digit;
        }
-        nChannels++;
+        else{
+          // gpu is in bounds here: status 1 is only entered through the check above
+          gpus[gpu] = gpus[gpu]*10+digit;
+        }
+      } else {
+        if (status == 1) {
+          gpu++;
+        } else if (status == 2) {
+          net++;
+          if (net > 2) goto end;
+        }
+        status = 0;
+        if (str[offset] == '|' || str[offset] == '\0') {
+          // Ignore if ngpus doesn't match
+          if (gpu != ngpus) goto newchannel;
+          // Ignore if nnets are not 0 or 2
+          if (net && net != 2) goto newchannel;
+
+          for (int r=0; r<ngpus; r++) {
+            int g = gpus[r];
+            // Ignore if gpus are out of bounds
+            if (g < 0 || g >= ngpus) goto newchannel;
+            // Ignore if gpus are duplicate
+            for (int i=0; i<r; i++)
+              if (gpus[i] == g) goto newchannel;
+            // remap if needed
+            if (gpu_map) g = gpu_map[g];
+            // Translate gpu numbers into ranks
+            int j = 0;
+            for (j = 0; j < ngpus; j++)
+              if (g == system->nodes[GPU].nodes[j].gpu.dev)
+                break;
+            if (j < ngpus)
+              graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
+            else
+              return ncclInternalError;
+          }
+
+          if (net) {
+            if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel;
+            graph->inter[nChannels*2] = nets[0];
+            graph->inter[nChannels*2+1] = nets[1];
+          } else if (net_map && nnets) {
+            graph->inter[nChannels*2] = net_map[nChannels%nnets];
+            graph->inter[nChannels*2+1] = net_map[(nChannels+1)%nnets];
+          } else if (nnets) {
+            graph->inter[nChannels*2] = nChannels%nnets;
+            graph->inter[nChannels*2+1] = (nChannels+1)%nnets;
+          }
+          nChannels++;
 newchannel:
-        rank = 0;
+          gpu = 0;
+          net = 0;
+        }
      }
    }
  } while (str[offset++] != 0);
 end:
-  *nChannelsRet = nChannels;
+  graph->nChannels = nChannels;
+  graph->speedIntra = graph->speedInter = system->maxWidth;
+#if 0
+  for (int i=0; i<graph->nChannels; i++) {
+    printf("%d: ", i);
+    printf ("NET/%d ", graph->inter[i*2]);
+    for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
+    printf ("NET/%d ", graph->inter[i*2+1]);
+    printf("\n");
+  }
+#endif
  return ncclSuccess;
 }

-static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str) {
+static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
-  static char ringRemap[256];
  int id[8], dist[8];
  int i;

-  *str = 0;
  int ngpus = system->nodes[GPU].count;
  if (ngpus != 8)
    return ncclSuccess;
@@ -757,251 +806,242 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str)
    dist[m] = dist[n]; dist[n] = temp;
  }
  // create chordal ring based on reference and remapped ids
-  for (i = 0; i <strlen(ringBase); i++) {
-    if (ringBase[i] >= '0' && ringBase[i] <= '9')
-      ringRemap[i] = id[ringBase[i]-'0']+'0';
-    else
-      ringRemap[i] = ringBase[i];
-  }
-  ringRemap[i] = 0;
-  *str = ringRemap;
  system->type = RCCL_TOPO_CR8G;
-  INFO(NCCL_GRAPH, "Use chordal ring: %s", ringRemap);
+  NCCLCHECK(parseGraph(ringBase, system, graph, id, 0, NULL));
+  if (system->nodes[NET].count) {
+    int *intra, *used;
+    graph->nChannels = system->nodes[NET].count;
+    NCCLCHECK(ncclCalloc(&intra, ngpus));
+    NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
+    for (int n = 0; n < system->nodes[NET].count; n++) {
+      graph->inter[n*2] = graph->inter[n*2+1] = n;
+      struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+      struct ncclTopoLinkList* paths = net->paths[GPU];
+      // find the first unused GPU that is closest to NIC
+      int f, m;
+      for (f = 0; f < ngpus; f++) {
+        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
+        if(j >= n) break;
+      }
+      for (int i = 0; i < ngpus; i++) {
+        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
+        if (j < n) continue;
+        if (paths[i].count < paths[f].count) f = i;
+      }
+      for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
+      used[n] = graph->intra[n*ngpus+m];
+      for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
+      for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
+    }
+    free(used);
+    free(intra);
+  }
  return ncclSuccess;
 }

-static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int *nnet) {
-  *ngpu = 0; *nnet = 0;
+// Count the devices attached to CPU node `id`.
+//   *g receives the number of GPUs, *n the number of NICs among the first
+//   `nnet` entries of net_map (entries are used as NET node indices).
+// A device is counted when its path to that CPU has count == 2
+// (presumably "directly attached" -- confirm against the path computation).
+// Returns false, with both counters zeroed, when `id` is not a known CPU id.
+static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) {
+  *g = 0; *n = 0;
  int i;
  if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false;
-  for (int n = 0; n < system->nodes[NET].count; n++)
-    if (system->nodes[NET].nodes[n].paths[CPU][i].count == 2) (*nnet)++;
-  for (int n = 0; n < system->nodes[GPU].count; n++)
-    if (system->nodes[GPU].nodes[n].paths[CPU][i].count == 2) (*ngpu)++;
+  for (int j = 0; j < nnet; j++)
+    if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++;
+  for (int j = 0; j < system->nodes[GPU].count; j++)
+    if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++;
  return true;
 }

-/* compare GPUs by PCI ID */
-static int compareGPU (const void *g1, const void *g2, void *s) {
-  struct ncclTopoSystem* system = (struct ncclTopoSystem*)s;
-  return system->nodes[GPU].nodes[*(int *)g1].id > system->nodes[GPU].nodes[*(int *)g2].id;
+// Find the GPU node whose device number (gpu.dev) equals `id`.
+// On success *index is the position of that node in system->nodes[GPU].nodes
+// and ncclSuccess is returned; otherwise *index is -1 and the result is
+// ncclInternalError.
+static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) {
+  int found = -1;
+  const int ngpus = system->nodes[GPU].count;
+  for (int g = 0; g < ngpus; g++) {
+    if (system->nodes[GPU].nodes[g].gpu.dev != id) continue;
+    found = g;
+    break;
+  }
+  *index = found;
+  return (found >= 0) ? ncclSuccess : ncclInternalError;
 }

-static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int *gpu1, int *gpu2, int use_shared, int ex1, int ex2) {
-  int n, m, k, idx, c1, c2;
-  uint64_t gid;
-  int ngpus = system->nodes[GPU].count;
-  if (ncclTopoIdToIndex(system, CPU, cpu1, &c1) == ncclInternalError) return false;
-  if (ncclTopoIdToIndex(system, CPU, cpu2, &c2) == ncclInternalError) return false;
+// Describe the running system as a rcclRomeModel so that it can be compared
+// with the built-in reference models.
+//   system   : detected topology
+//   romeTopo : output; counts, GPU ids, NUMA placement and XGMI matrix
+//   pattern  : output buffer of at least 2*nCpus+1 chars; two digits per CPU
+//              node (GPU count, NIC count). Left empty when the topology
+//              cannot be described.
+//   net_map  : output; up to MAX_ROME_NICS NET node indices, one per NIC ASIC
+//              (the widest port of each ASIC is kept)
+// Returns ncclSuccess even when the topology is not usable (the caller checks
+// pattern / nLinks); an error is returned only when a device number in
+// [0, nGpus) has no GPU node.
+// When RCCL_DUMP_ROME_MODEL_FILE is set, the detected model is also written to
+// that file in the syntax of a rcclRomeModel initializer.
+static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern, int *net_map) {
+  pattern[0] = 0; // pattern stays empty for an invalid topology
+  romeTopo->nGpus = system->nodes[GPU].count;
+  romeTopo->nCpus = system->nodes[CPU].count;
+  romeTopo->nNics = 0;
+  romeTopo->nLinks = 0;
+  // gpuIds/gpuNuma/connMatrix are sized for MAX_ROME_GPUS. A larger system
+  // cannot match any model, so bail out before overrunning them.
+  if (romeTopo->nGpus > MAX_ROME_GPUS) return ncclSuccess;
+  for (int i = 0; i < romeTopo->nGpus; i ++) {
+    int gpu, n;
+    NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu));
+    romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
+    for (n = 0; n < romeTopo->nCpus; n++)
+      if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count == 2) break;
+    if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id;
+    // No CPU found: store a value no reference model uses instead of leaving
+    // the entry uninitialized (it is compared and dumped later).
+    else romeTopo->gpuNuma[i] = -1;

-  int *s_gpus = (int *)malloc(sizeof(int)*ngpus);
-  int s_ngpus = 0;
-
-  // build a sorted list of source GPUs
-  for (n = 0; n < ngpus; n++) {
-    if (*gpu1 != -1 && system->nodes[GPU].nodes[n].gpu.dev != *gpu1) continue;
-    if (system->nodes[GPU].nodes[n].gpu.dev == ex1) continue;
-    if (system->nodes[GPU].nodes[n].paths[CPU][c1].count != 2) continue;
-    s_gpus[s_ngpus++] = n;
-  }
-  if (s_ngpus) qsort_r(s_gpus, s_ngpus, sizeof(int), compareGPU, system);
-
-  for (n = 0; n < s_ngpus; n++) {
-    struct ncclTopoNode* node = system->nodes[GPU].nodes+s_gpus[n];
+    struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
    if (node->paths[GPU] == NULL) continue;
-    idx = -1; gid = 0;
-    for (m = 0; m < ngpus; m++) {
-      if (*gpu2 != -1 && system->nodes[GPU].nodes[m].gpu.dev != *gpu2) continue;
-      if (system->nodes[GPU].nodes[m].gpu.dev == ex2) continue;
-      if (system->nodes[GPU].nodes[m].paths[CPU][c2].count != 2) continue;
+    int count = 0;
+    for (n = 0; n < romeTopo->nGpus; n++) {
+      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 0;
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
-        if (link->remNode->gpu.dev == system->nodes[GPU].nodes[m].gpu.dev) break;
+        if (link->remNode->gpu.dev == n) break;
      }
      if (!link->remNode) continue;
-      if (link->type == LINK_NVL) {
-        int is_shared = 0;
-        for (k = 0; k < ngpus; k++) {
-          if (k == m || k == s_gpus[n]) continue;
-          if ((system->nodes[GPU].nodes[k].id & 0xf0000) == (system->nodes[GPU].nodes[m].id & 0xf0000))
-            break;
-        }
-        if (k < ngpus) is_shared = 1;
-        if (use_shared == -1 || is_shared == use_shared) {
-          if (idx == -1 || (idx != -1 && system->nodes[GPU].nodes[m].id < gid)) {
-            idx = m;
-            gid = system->nodes[GPU].nodes[m].id;
-          }
-        }
+      if (link->type != LINK_NVL) continue;
+      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1;
+      count ++;
+    }
+    // every GPU must have the same number of XGMI links
+    if (!romeTopo->nLinks) romeTopo->nLinks = count;
+    else if (romeTopo->nLinks != count) return ncclSuccess;
+  }
+
+  // trim ports and create NET map
+  for (int i = 0; i < system->nodes[NET].count; i ++) {
+    int j;
+    for (j = 0; j < romeTopo->nNics; j++) {
+      if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[net_map[j]].net.asic) {
+        if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[net_map[j]].net.width)
+          net_map[j] = i;
+        break;
      }
    }
-    if (idx != -1) break;
+    if (j >= romeTopo->nNics) {
+      net_map[j] = i;
+      (romeTopo->nNics)++;
+      if (romeTopo->nNics >= MAX_ROME_NICS) break;
+    }
  }
-  if (n < s_ngpus) {
-    *gpu1 = system->nodes[GPU].nodes[s_gpus[n]].gpu.dev;
-    *gpu2 = system->nodes[GPU].nodes[idx].gpu.dev;
-    //printf("%s+: c1 %d c2 %d gpu1 %d gpu2 %d use_shared %d ex1 %d, ex2 %d\n",
-    //  __func__, cpu1, cpu2, *gpu1, *gpu2, use_shared, ex1, ex2);
-    free(s_gpus);
+
+  // number of GPUs and NICs on each numa node is used as first screening pattern
+  for (int i = 0; i < romeTopo->nCpus; i++) {
+    int g, n;
+    if (!getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map)) {
+      // Do not hand a partially written, unterminated pattern to the caller.
+      pattern[0] = 0;
+      return ncclSuccess;
+    }
+    pattern[i*2] = '0' + g;
+    pattern[i*2+1] = '0' + n;
+  }
+  pattern[romeTopo->nCpus*2] = 0;
+
+  for (int i = 0; i < romeTopo->nNics; i ++) {
+    // net_map[] holds NET node indices (see the trimming loop above), so use
+    // the entry directly rather than resolving it as a node id.
+    int net = net_map[i], n;
+    for (n = 0; n < romeTopo->nCpus; n++)
+      if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break;
+    if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id;
+    else return ncclSuccess;
+  }
+
+  const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
+  if (romeModelFile) {
+    INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
+    FILE* file = fopen(romeModelFile, "w");
+    if (file == NULL) {
+      WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
+      return ncclSuccess;
+    }
+    fprintf(file, "static struct rcclRomeModel rome_model_ = {\n");
+    fprintf(file, "  .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
+    fprintf(file, "  .gpuIds = { ");
+    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
+    fprintf(file, "},\n");
+    fprintf(file, "  .gpuNuma = { ");
+    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
+    fprintf(file, "},\n");
+    fprintf(file, "  .nicNuma = { ");
+    for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
+    fprintf(file, "},\n");
+    fprintf(file, "  .connMatrix = { ");
+    for (int i = 0; i < romeTopo->nGpus; i ++)
+      for (int n = 0; n < romeTopo->nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*romeTopo->nGpus+n]);
+    fprintf(file, "},\n");
+    fprintf(file, "  .pattern = \"%s\",\n", pattern);
+    fprintf(file, "  .ringBase = \"\",\n");
+    fprintf(file, "};\n");
+    fclose(file);
+  }
+  return ncclSuccess;
+}
+
+// Search for a permutation g[] such that GPU g[i] of the detected system
+// (topo) plays the role of GPU i in the reference model (ref).
+// Positions n..last of g are permuted recursively by swapping; positions
+// before n are fixed by the callers up the recursion. A complete candidate is
+// accepted when NUMA placement, the XGMI matrix and NBIO sharing all agree.
+// On success the function returns true and g[] holds the matching permutation
+// (the swap is not undone on the success path). *time counts the calls made
+// and is only used for diagnostics.
+// The caller is expected to have checked that ref and topo have the same
+// nGpus: ref->nGpus is used as the row stride of both matrices.
+static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time) {
+  (*time) ++;
+  if (n == last) {
+    int i, j;
+    // match GPU numa
+    for (i = 0; i < ref->nGpus; i++)
+      if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break;
+    if (i < ref->nGpus) return false;
+    // match XGMI connection
+    for (i = 0; i < ref->nGpus; i++) {
+      for (j = 0; j < ref->nGpus; j++)
+        if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break;
+      if (j < ref->nGpus) break;
+    }
+    if (i < ref->nGpus) return false;
+    // match NBIO
+    // Two GPUs are treated as sharing an NBIO when their ids agree in the
+    // 0xf0000 bits; such pairs must also keep the same relative id order
+    // (the product of the two id differences must not be negative).
+    for (i = 0; i < ref->nGpus; i++) {
+      for (j = 0; j < ref->nGpus; j++) {
+        if (i == j) continue;
+        bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000);
+        bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000);
+        if (nbio_ref != nbio_topo) break;
+        if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break;
+      }
+      if (j < ref->nGpus) break;
+    }
+    if (i < ref->nGpus) return false;
    return true;
+  } else {
+    // try every remaining element at position n, restoring g on failure
+    for (int i = n; i <= last; i++) {
+      std::swap(g[n], g[i]);
+      if (permuteGpuIds(g, n+1, last, ref, topo, time)) return true;
+      std::swap(g[n], g[i]);
+    }
  }
-  free(s_gpus);
  return false;
 }

-static bool validate4P1H(struct ncclTopoSystem* system, int *hive) {
-  int g, n, m;
-  int ngpus = system->nodes[GPU].count;
-  for (g = 0; g < 4; g++) {
-    int gpu = hive[g];
-    int next_gpu = hive[(g+1)%4];
-    for (n = 0; n < ngpus; n++) {
-      if (system->nodes[GPU].nodes[n].gpu.dev != gpu) continue;
-      struct ncclTopoNode* node = system->nodes[GPU].nodes+n;
-      if (node->paths[GPU] == NULL) continue;
-      for (m = 0; m < ngpus; m++) {
-        struct ncclTopoLink* link;
-        for (link = node->links; link->remNode; link++) {
-          if (link->remNode->gpu.dev == next_gpu) break;
-        }
-        if (!link->remNode) continue;
-        if (link->type == LINK_NVL) break;
-      }
-      if (m < ngpus) break;
-    }
-    if (n < ngpus) continue;
-    else break;
-  }
-  if (g < 4) return false;
-  else return true;
-}
-
-static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
-  static const char *ringBase_10302120_1 = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6";
-  static const char *ringBase_10302120_2 = "6 4 7 5 0 1 3 2|6 5 7 4 2 3 1 0";
-  static const char *ringBase_11303011_1 = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0";
-  static const char *ringBase_11303011_2 = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2";
-  static const char *ringBase_0110201010200110_1 = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3";
-  static const char *ringBase_0110201010200110_2 = "3 0 6 2 1 4 5 7|4 1 0 3 2 6 7 5";
-  static const char *ringBase;
+// Try to recognise the system as one of the known Rome models and, when one
+// matches, fill `graph` with that model's rings.
+// Steps: return early unless the CPU is an x86 AMD Rome; describe the system
+// with parseRomeSystem(); mark it RCCL_TOPO_4P2H_ROME when it has 8 GPUs and
+// XGMI links; then scan romeTopoModels for the first model with equal counts
+// and pattern for which permuteGpuIds() finds a GPU mapping. The model's
+// ringBase is finally parsed with that mapping and the detected NIC map.
+// Returns ncclSuccess (graph untouched) when no model matches.
+// NOTE(review): after this change ringRemap, ncpus and t are no longer used,
+// and the result of parseRomeSystem() is ignored -- confirm that is intended.
+static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  static char ringRemap[64];
-  int id[8], dist[8];
  int i;

-  *str = 0;
  int ngpus = system->nodes[GPU].count;
  int ncpus = system->nodes[CPU].count;
-  // 8 GPUs only
-  if (ngpus != 8)
-    return ncclSuccess;
+
  // only valid on Rome
  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
    return ncclSuccess;
-  system->type = RCCL_TOPO_4P2H_ROME;
-  // 4 or 8 numa nodes only
-  if (ncpus != 4 && ncpus != 8)
-    return ncclSuccess;
-  // number of GPUs and NICs on each numa node is used as first screening pattern
-  char pattern[256];
-  for (i = 0; i < ncpus; i++) {
-    int g, n;
-    if (!getGpuNetCount(system, i, &g, &n)) return ncclSuccess;
-    pattern[i*2] = '0' + g;
-    pattern[i*2+1] = '0' + n;
-  }
-  pattern[i*2] = 0;
-  int g[8], h1[4], h2[4];
-  for (int i = 0; i <8; i++) g[i] = -1;
-  if (strcmp(pattern, "10302120") == 0) {
-    bool cross = findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1);
-    g[2] = g[6] = -1;
-    if (cross) {
-      // identify GPUs for pattern "10302120"
-      if (!findGpuByXGMI(system, 0, 1, &g[1], &g[0], 0, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 0, 1, &g[1], &g[2], 1, -1, g[0])) return ncclSuccess;
-      if (!findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 2, 1, &g[3], &g[5], 1, g[6], g[2])) return ncclSuccess;
-      if (!findGpuByXGMI(system, 1, 3, &g[5], &g[4], -1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 2, 3, &g[3], &g[7], -1, g[6], g[4])) return ncclSuccess;
-      // finally verify two XGMI hives for pattern "10302120"
-      h1[0] = g[1]; h1[1] = g[0]; h1[2] = g[6]; h1[3] = g[2];
-      h2[0] = g[7]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[3];
-      ringBase = ringBase_10302120_1;
-    } else {
-      // identify GPUs for pattern "10302120"
-      if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, g[3])) return ncclSuccess;
-      if (!findGpuByXGMI(system, 2, 3, &g[5], &g[7], -1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 2, 3, &g[4], &g[6], -1, g[5], g[7])) return ncclSuccess;
-      // finally verify two XGMI hives for pattern "10302120"
-      h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
-      h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
-      ringBase = ringBase_10302120_2;
-    }
-  }
-  else if (strcmp(pattern, "11303011") == 0) {
-    // there are 2 configurations for pattern "11303011"
-    if (findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) {
-      if (!findGpuByXGMI(system, 2, 1, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
-      if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
-      // finally verify two XGMI hives for pattern "11303011"
-      h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
-      h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
-      ringBase = ringBase_11303011_2;
-    } else {
-      // identify GPUs for pattern "11303011"
-      if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 3, 2, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
-      if (!findGpuByXGMI(system, 2, 2, &g[5], &g[4], -1, -1, -1)) return ncclSuccess;
-      // finally verify two XGMI hives for pattern "11303011"
-      h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
-      h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
-      ringBase = ringBase_11303011_1;
-    }
-  }
-  else if (strcmp(pattern, "0110201010200110") == 0) {
-    if (findGpuByXGMI(system, 2, 5, &g[2], &g[6], 1, -1, -1)) {
-      if (!findGpuByXGMI(system, 4, 2, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
-      if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
-      h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
-      h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
-      ringBase = ringBase_0110201010200110_2;
-    } else {
-      if (!findGpuByXGMI(system, 1, 2, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 2, 2, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
-      if (!findGpuByXGMI(system, 7, 5, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
-      if (!findGpuByXGMI(system, 4, 5, &g[4], &g[5], -1, -1, -1)) return ncclSuccess;
-      h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
-      h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
-      ringBase = ringBase_0110201010200110_1;
-    }
-}
-  else
-    return ncclSuccess;

-  if (!validate4P1H(system, h1)) return ncclSuccess;
-  if (!validate4P1H(system, h2)) return ncclSuccess;
-  // passed all validation
-  // create 4P2H based on reference and remapped ids
-  for (i = 0; i <strlen(ringBase); i++) {
-    if (ringBase[i] >= '0' && ringBase[i] <= '9')
-      ringRemap[i] = g[ringBase[i]-'0'] + '0';
-    else
-      ringRemap[i] = ringBase[i];
+  // number of GPUs and NICs on each numa node is used as first screening pattern
+  struct rcclRomeModel romeTopo;
+  char pattern[256];
+  int net_map[MAX_ROME_NICS];
+  parseRomeSystem(system, &romeTopo, pattern, net_map);
+
+  // recognize system as Rome 4P2H even if no matching model
+  if (ngpus == 8 && romeTopo.nLinks) system->type = RCCL_TOPO_4P2H_ROME;
+
+  int g[MAX_ROME_GPUS];
+  int time = 0;
+  struct timeval tvs, tve;
+  gettimeofday(&tvs, NULL);
+  // the first model that matches wins; i indexes it after the loop
+  for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
+    if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus ||
+      romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
+    if (strcmp(romeTopoModels[i].pattern, pattern)) continue;
+    // starting permutation handed to the search: identity rotated by two
+    for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus;
+    if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time)) break;
  }
-  ringRemap[i] = 0;
-  *str = ringRemap;
-  INFO(NCCL_GRAPH, "Use 4P2H on Rome: %s", ringRemap);
+  gettimeofday(&tve, NULL);
+  // elapsed search time in milliseconds; only referenced by the disabled printfs
+  float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
+  if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
+    //printf("No solution in %.2fms (%d iter)\n", t, time);
+    return ncclSuccess;
+  }
+  //printf("Solution in %.2fms (%d iter): ", t, time);
+  //for (int k = 0; k < ngpus; k++) printf("%d ", g[k]);
+  //printf("\n");
+
+  // create 4P2H based on reference and remapped ids
+  NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, romeTopo.nNics, net_map));
  return ncclSuccess;
 }

@@ -1014,6 +1054,7 @@ float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0,

 ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
+  int nnets = system->nodes[NET].count;
  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
  graph->speedIntra = graph->speedInter = 0;
  if (graph->crossNic == 2) graph->crossNic = 0;
@@ -1036,59 +1077,19 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
  }

  str = getenv("NCCL_RINGS");
-  if (str) system->type = RCCL_TOPO_4P2H_ROME;
-  if (!str) NCCLCHECK(parseChordalRing(system, &str));
-  if (!str) NCCLCHECK(parseRome4P2H(system, &str));
  if (str) {
-    NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
-    for (int i=0; i<graph->nChannels*ngpus; i++) {
-      // Translate gpu numbers into ranks
-      int j = 0;
-      for (j = 0; j < system->nodes[GPU].count; j++)
-        if (graph->intra[i] == system->nodes[GPU].nodes[j].gpu.dev)
-          break;
-      if (j < system->nodes[GPU].count)
-        graph->intra[i] = system->nodes[GPU].nodes[j].gpu.rank;
-      else
-        return ncclInternalError;
-    }
-    graph->speedIntra = graph->speedInter = system->maxWidth;
-    if (system->nodes[NET].count) {
-      // do not change ring order for multi node 4P2H on Rome
-      if (system->type == RCCL_TOPO_4P2H_ROME) {
-        for (int n = 0; n < graph->nChannels; n++) {
-          graph->inter[n*2] = n%system->nodes[NET].count;
-          graph->inter[n*2+1] = (n+1)%system->nodes[NET].count;
-        }
-      } else {
-        int *intra, *used;
-        graph->nChannels = system->nodes[NET].count;
-        NCCLCHECK(ncclCalloc(&intra, ngpus));
-        NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
-        for (int n = 0; n < system->nodes[NET].count; n++) {
-          graph->inter[n*2] = graph->inter[n*2+1] = n;
-          struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-          struct ncclTopoLinkList* paths = net->paths[GPU];
-          // find the first unsed GPU that is closest to NIC
-          int f, m;
-          for (f = 0; f < ngpus; f++) {
-            int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
-            if(j >= n) break;
-          }
-          for (int i = 0; i < ngpus; i++) {
-            int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
-            if (j < n) continue;
-            if (paths[i].count < paths[f].count) f = i;
-          }
-          for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
-          used[n] = graph->intra[n*ngpus+m];
-          for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
-          for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
-        }
-        free(used);
-        free(intra);
-      }
+    // user supplied topo
+    NCCLCHECK(parseGraph(str, system, graph, NULL, nnets, NULL));
+    if (graph->nChannels) {
+      system->type = RCCL_TOPO_4P2H_ROME;
+      return ncclSuccess;
    }
+  } else {
+    // try to match 8P6L
+    NCCLCHECK(parseChordalRing(system, graph));
+    if (graph->nChannels) return ncclSuccess;
+    // try to match Rome 4P2H
+    NCCLCHECK(parseRome4P2H(system, graph));
    if (graph->nChannels) return ncclSuccess;
  }

@@ -95,7 +95,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
    struct ncclTopoRanks* topoRanks);

 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
-    struct ncclTopoRanks** allTopoRanks, int* rings, int gcn);
+    struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);

 ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);

@@ -849,6 +849,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  }
  INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");

+  // Count the distinct NET ids referenced by ringGraph.inter (two entries per channel); the count is handed to ncclTopoPostset below
+  int nNets = 0;  // number of distinct ids recorded so far
+  int nets[MAXCHANNELS*2];  // distinct ids seen; only the first nNets entries are valid (assumes ringGraph.nChannels <= MAXCHANNELS -- TODO confirm)
+  for (int i = 0; i < ringGraph.nChannels; i++) {
+    for (int j = 0; j < 2; j++) {
+      int k;
+      for (k = 0; k < nNets; k++)  // linear search for an id that was already recorded
+        if (nets[k] == ringGraph.inter[i*2+j]) break;
+      if (k >= nNets) {  // search ran off the end: this id has not been seen yet
+        nets[nNets] = ringGraph.inter[i*2+j];
+        nNets++;
+      }
+    }
+  }
+
  if (comm->nChannels < nChannelsOrig) {
    // We started duplicating channels during Preset(), so we need to move the
    // duplicated channels since we have removed some.
@@ -858,7 +873,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

-  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
  if (comm->nNodes > 1 &&
      ncclParamCollNetEnable() == 1 &&
      collNetSupport() && collNetGraph.nChannels) {
@@ -21,29 +21,17 @@ struct p2pConnectInfo {
    void* directPtr;
    hipIpcMemHandle_t devIpc;
  };
-  uint64_t pidHash;
-  int id;
-  int sendRank;
-  int recvRank;
 };

 struct p2pSendResources {
  struct ncclSendMem* devMem;  // released with hipFree() in p2pSendFree
  void* ipcPtr;  // when non-NULL, closed with hipIpcCloseMemHandle() in p2pSendFree
  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
-  uint64_t* opCount;  // opCount allocated in host memory
-  uint64_t* devOpCount;  // device side pointer to opCount
-  uint64_t* remOpCount;  // remote opCount allocated in host memory
-  uint64_t* devRemOpCount;  // device side pointer to remote opCount
 };

 struct p2pRecvResources {
  struct ncclRecvMem* devMem;  // allocated with ncclCudaCalloc() in p2pRecvSetup; released with hipFree() in p2pRecvFree
  void* ipcPtr;  // when non-NULL, closed with hipIpcCloseMemHandle() in p2pRecvFree
-  uint64_t* opCount;  // opCount allocated in host memory
-  uint64_t* devOpCount;  // device side pointer to opCount
-  uint64_t* remOpCount;  // remote opCount allocated in host memory
-  uint64_t* devRemOpCount;  // device side pointer to remote opCount
 };

 #include <sys/types.h>
@@ -123,7 +111,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
    TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
  } while (0)

-#define MAX_SHM_NAME_LEN 1024
 // Setting this to non zero causes P2P to use Reads rather than Writes
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);

@@ -165,16 +152,6 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  }

  struct p2pConnectInfo info;
-  info.id = channelId;
-  info.pidHash = myInfo->pidHash;
-  info.sendRank = myInfo->cudaDev;
-  info.recvRank = peerInfo->cudaDev;
-
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
-  TRACE(NCCL_P2P,"Open shmName %s", shmName);
-  NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
-
  info.read = useRead;
  const char* useReadStr = info.read ? "/read" : "";
  if (myInfo->pidHash == peerInfo->pidHash) {
@@ -232,16 +209,6 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));

  struct p2pConnectInfo info;
-  info.id = channelId;
-  info.pidHash = myInfo->pidHash;
-  info.sendRank = peerInfo->cudaDev;
-  info.recvRank = myInfo->cudaDev;
-
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
-  TRACE(NCCL_P2P,"Open shmName %s", shmName);
-  NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
-
  info.read = useRead;
  if (myInfo->pidHash == peerInfo->pidHash) {
    info.direct = 1;
@@ -298,13 +265,6 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
    }
  }

-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
-  TRACE(NCCL_P2P,"Open shmName %s", shmName);
-  NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0));
-  // Remove the file to ensure proper clean-up
-  NCCLCHECK(shmUnlink(shmName));
-
  int offset = 0;
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (info->read && p == NCCL_PROTO_SIMPLE) {
@@ -344,12 +304,6 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
    }
  }

-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
-  TRACE(NCCL_P2P,"Open shmName %s", shmName);
-  NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0));
-  NCCLCHECK(shmUnlink(shmName));
-
  int offset = 0;
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    if (info->read && p == NCCL_PROTO_SIMPLE) {
@@ -370,8 +324,6 @@ ncclResult_t p2pSendFree(void* resources) {
  if (sendRes->ipcPtr)
    CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
  CUDACHECK(hipFree(sendRes->devMem));
-  NCCLCHECK(shmClose(sendRes->opCount, sendRes->devOpCount, sizeof(uint64_t)));
-  NCCLCHECK(shmClose(sendRes->remOpCount, sendRes->devRemOpCount, sizeof(uint64_t)));
  free(sendRes);
  return ncclSuccess;
 }
@@ -381,8 +333,6 @@ ncclResult_t p2pRecvFree(void* resources) {
  if (recvRes->ipcPtr)
    CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
  CUDACHECK(hipFree(recvRes->devMem));
-  NCCLCHECK(shmClose(recvRes->opCount, recvRes->devOpCount, sizeof(uint64_t)));
-  NCCLCHECK(shmClose(recvRes->remOpCount, recvRes->devRemOpCount, sizeof(uint64_t)));
  free(recvRes);
  return ncclSuccess;
 }
@@ -21,7 +21,7 @@

 DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

-for i in {0..35}
+for i in {0..37}
 do
 	$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
 	$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
@@ -0,0 +1,81 @@
+
+<system version="2">
+  <cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+      <nic>
+        <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x20cd600003da341c" maxconn="262144" gdr="1"/>
+        <net name="mlx5_1" dev="1" speed="200000" port="2" guid="0x20cd600003da341c" maxconn="262144" gdr="1"/>
+      </nic>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,93 @@
+<system version="2">
+  <cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:61:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+      <nic>
+        <net name="mlx5_0" dev="0" speed="200000" port="1" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
+        <net name="mlx5_1" dev="1" speed="100000" port="2" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
+      </nic>
+    </pci>
+  </cpu>
+  <cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:a1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+      <nic>
+        <net name="mlx5_2" dev="2" speed="200000" port="1" guid="0x38815600039f59b8" maxconn="262144" gdr="1"/>
+      </nic>
+    </pci>
+  </cpu>
+</system>
@@ -105,6 +105,8 @@ NodeModelDesc model_descs[] = {
  {4, "topo_8p_ts1_n4_1.xml",   "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
  {1, "topo_4p3l_ia.xml",       "single node 8 gfx908"},
  {4, "topo_4p3l_ia.xml",       "4 nodes 8 gfx908"},
+  {4, "topo_8p_rome_n2_2.xml",  "4 nodes 8 VEGA20 Rome NPS=2 Alt. Model 2 NET/IF"},
+  {4, "topo_8p_ts1_n4_2.xml",   "4 nodes 8 VEGA20 TS1 NPS=4 3 NET/IF"},
 };

 int main(int argc,char* argv[])
@@ -428,6 +428,21 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  }
  INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");

+  // Count the distinct NET ids referenced by ringGraph.inter (two entries per channel); the count is handed to ncclTopoPostset below
+  int nNets = 0;  // number of distinct ids recorded so far
+  int nets[MAXCHANNELS*2];  // distinct ids seen; only the first nNets entries are valid (assumes ringGraph.nChannels <= MAXCHANNELS -- TODO confirm)
+  for (int i = 0; i < ringGraph.nChannels; i++) {
+    for (int j = 0; j < 2; j++) {
+      int k;
+      for (k = 0; k < nNets; k++)  // linear search for an id that was already recorded
+        if (nets[k] == ringGraph.inter[i*2+j]) break;
+      if (k >= nNets) {  // search ran off the end: this id has not been seen yet
+        nets[nNets] = ringGraph.inter[i*2+j];
+        nNets++;
+      }
+    }
+  }
+
  if (comm->nChannels < nChannelsOrig) {
    // We started duplicating channels during Preset(), so we need to move the
    // duplicated channels since we have removed some.
@@ -437,7 +452,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

-  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
  if (comm->nNodes > 1 &&
      ncclParamCollNetEnable() == 1 &&
      collNetSupport() && collNetGraph.nChannels) {