[RCCL] Rail-optimized rings: choose forward/reverse/tail rings per host

This patch was restored from a whitespace-collapsed copy: one diff line per
physical line again; indentation of context lines is approximate.

What the hunks below do:
  CHANGELOG.md            adds changelog sections, including the rail-optimized ring entry.
  src/graph/connect.cc    rank 0 logs a [RINGS] table: one row per node, "recv->send" local
                          ranks per channel.
  src/graph/rome_models.* rcclRomeModel gains ringTail1/ringTail2; parseGraph() gains an
                          `int reverse` argument that mirrors the GPU order of each ring and
                          swaps its first/last NIC; parseRome4P2H() picks ringBase (reversed on
                          odd hostIdx) or the tail rings for the last two hosts when the host
                          count is odd; rome_model_81 gets new ring definitions.
  src/graph/topo.h        ncclTopoSystem gains nHosts / hostIdx.
  src/init.cc             computes nHosts / hostIdx from peerInfo[].hostHash.
  tools/*                 topo_expl mirrors the hostIdx computation and adds a 5-node model.

Review fixes folded into the added lines:
  1. connect.cc: sprintf/%n into the fixed 2048-byte buffer replaced by bounded snprintf;
     formatting stops cleanly when the buffer is full, and each row starts from an empty
     string (hunk header recounted to +224,39).
  2. rome_models.cc: removed the trailing "\n" from the "Dropping back" INFO message so it
     matches the other INFO calls in this patch.
  3. rome_models.cc: the "#if 0" debug loops had lost their "<...>" spans in the collapsed
     copy; loop conditions were restored from the surrounding code, and the "GPU/..." format
     strings are presumed.

diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md
index 6e551e8309..16a34a03de 100644
--- a/projects/rccl/CHANGELOG.md
+++ b/projects/rccl/CHANGELOG.md
@@ -28,6 +28,39 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 ### Fixed
 - Potential race-condition during ncclSocketClose()
 
+## RCCL 2.16.2 for ROCm 5.6.0
+### Changed
+- Modifying rings to be rail-optimized topology friendly
+### Added
+### Fixed
+### Removed
+
+## Unreleased - RCCL 2.18.6 for ROCm 6.1.0
+### Changed
+- Compatibility with NCCL 2.18.6
+### Added
+### Fixed
+### Removed
+
+## RCCL 2.18.3 for ROCm 6.0.0
+### Changed
+- Compatibility with NCCL 2.18.3
+### Added
+### Fixed
+### Removed
+
+## RCCL 2.17.1-1 for ROCm 5.7.0
+### Changed
+- Compatibility with NCCL 2.17.1-1
+- Performance tuning for some collective operations
+### Added
+- Minor improvements to MSCCL codepath
+- NCCL_NCHANNELS_PER_PEER support
+- Improved compilation performance
+- Support for gfx94x
+### Fixed
+- Potential race-condition during ncclSocketClose()
+
 ## RCCL 2.16.2 for ROCm 5.6.0
 ### Changed
 - Compatibility with NCCL 2.16.2
diff --git a/projects/rccl/src/graph/connect.cc b/projects/rccl/src/graph/connect.cc
index 617e487af1..8d627d94e0 100644
--- a/projects/rccl/src/graph/connect.cc
+++ b/projects/rccl/src/graph/connect.cc
@@ -224,6 +224,39 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
     TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
     if (channel1) TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
   }
+
+  // [RCCL] Print off the recv/send local ranks per node, per channel
+  if (comm->rank == 0)
+  {
+    char buff[2048] = "";
+    int offset = 0;
+    int inc;
+    int numChannels = (nChannels > MAXCHANNELS/2) ? 2 * nChannels : nChannels;
+
+    // Header row of channel indices. snprintf bounds every write to buff.
+    // NOTE(review): header spans numChannels columns, rows below only nChannels - confirm intended
+    for (int c = 0; c < numChannels; c++) {
+      inc = snprintf(buff + offset, sizeof(buff) - offset, " %02d", c);
+      if (inc < 0 || (size_t)inc >= sizeof(buff) - offset) { buff[offset] = '\0'; break; }
+      offset += inc;
+    }
+    INFO(NCCL_GRAPH, "[RINGS] %s", buff);
+
+    // One row per node: "recv->send" local ranks for each channel
+    for (int n = 0; n < nNodes; n++) {
+      offset = 0;
+      buff[0] = '\0'; // start each row empty, even when nChannels == 0
+      for (int c = 0; c < nChannels; c++) {
+        int recvRank = comm->rankToLocalRank[ringRecv[c*comm->nNodes+n]];
+        int sendRank = comm->rankToLocalRank[ringSend[c*comm->nNodes+n]];
+        inc = snprintf(buff + offset, sizeof(buff) - offset, " %02d->%02d", recvRank, sendRank);
+        if (inc < 0 || (size_t)inc >= sizeof(buff) - offset) { buff[offset] = '\0'; break; }
+        offset += inc;
+      }
+      INFO(NCCL_GRAPH, "[RINGS] %s", buff);
+    }
+  }
+
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc
index 914d00a95e..387bbd69e2 100644
--- a/projects/rccl/src/graph/rome_models.cc
+++ b/projects/rccl/src/graph/rome_models.cc
@@ -42,8 +42,12 @@ struct rcclRomeModel {
   uint8_t gdrLevel[NCCL_TOPO_MAX_NODES*NCCL_TOPO_MAX_NODES];
   const char *pattern;
   const char *ringBase;
+  const char *ringTail2; // Lines to use for node N-2 if the total number of nodes is odd
+  const char *ringTail1; // Lines to use for node N-1 if the total number of nodes is odd
   const char *options;
   const char *treeBase;
+
+
 };
 
 static struct rcclRomeModel rome_model_22 = {
@@ -570,15 +574,76 @@ static struct rcclRomeModel rome_model_80 = {
 
 static struct rcclRomeModel rome_model_81 = {
   .nGpus = 8, .nCpus = 2, .nNics = 8, .nLinks = 7,
-  .gpuIds = { 0xc000, 0x22000, 0x38000, 0x5c000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
-  .nicIds = { 0x7000, 0x1d000, 0x33000, 0x57000, 0x9a000, 0xaa000, 0xba000, 0xda000, },
-  .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
-  .nicNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
-  .connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, },
-  .gdrLevel = { PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, },
-  .pattern = "4444",
-  .ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5|N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5",
-  .options = "noCpuCheck=1,tuning=5",
+  .gpuIds     = { 0xc000, 0x22000, 0x38000, 0x5c000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
+  .nicIds     = { 0x7000, 0x1d000, 0x33000, 0x57000, 0x9a000, 0xaa000, 0xba000, 0xda000, },
+  .gpuNuma    = { 0, 0, 0, 0, 1, 1, 1, 1, },
+  .nicNuma    = { 0, 0, 0, 0, 1, 1, 1, 1, },
+  .connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1,
+                  1, 0, 1, 1, 1, 1, 1, 1,
+                  1, 1, 0, 1, 1, 1, 1, 1,
+                  1, 1, 1, 0, 1, 1, 1, 1,
+                  1, 1, 1, 1, 0, 1, 1, 1,
+                  1, 1, 1, 1, 1, 0, 1, 1,
+                  1, 1, 1, 1, 1, 1, 0, 1,
+                  1, 1, 1, 1, 1, 1, 1, 0, },
+  .gdrLevel   = {PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
+                 PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
+                 PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
+                 PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
+                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB,
+                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB,
+                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB,
+                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, },
+  .pattern    = "4444",
+  .ringBase   = "N0 0 1 2 3 4 5 6 7 N7|"
+                "N1 1 0 2 4 3 5 7 6 N6|"
+                "N2 2 5 0 3 6 1 7 4 N4|"
+                "N3 3 7 0 4 2 1 6 5 N5|"
+                "N4 4 6 2 7 3 0 5 1 N1|"
+                "N5 5 4 7 1 3 2 6 0 N0|"
+                "N6 6 3 1 4 0 7 5 2 N2|"
+                "N7 7 2 0 6 4 1 5 3 N3|"
+                "N0 0 1 2 3 4 5 6 7 N7|"
+                "N1 1 0 2 4 3 5 7 6 N6|"
+                "N2 2 5 0 3 6 1 7 4 N4|"
+                "N3 3 7 0 4 2 1 6 5 N5|"
+                "N4 4 6 2 7 3 0 5 1 N1|"
+                "N5 5 4 7 1 3 2 6 0 N0|"
+                "N6 6 3 1 4 0 7 5 2 N2|"
+                "N7 7 2 0 6 4 1 5 3 N3",
+  .ringTail2  = "N7 7 4 1 3 2 0 6 5 N5|"
+                "N6 6 3 0 7 5 1 4 2 N2|"
+                "N4 4 6 2 1 7 0 5 3 N3|"
+                "N5 5 2 7 3 1 6 0 4 N4|"
+                "N1 1 0 2 4 3 5 7 6 N6|"
+                "N0 0 1 2 3 4 5 6 7 N7|"
+                "N2 2 5 0 3 6 4 7 1 N1|"
+                "N3 3 7 2 6 1 5 4 0 N0|"
+                "N7 7 4 1 3 2 0 6 5 N5|"
+                "N6 6 3 0 7 5 1 4 2 N2|"
+                "N4 4 6 2 1 7 0 5 3 N3|"
+                "N5 5 2 7 3 1 6 0 4 N4|"
+                "N1 1 0 2 4 3 5 7 6 N6|"
+                "N0 0 1 2 3 4 5 6 7 N7|"
+                "N2 2 5 0 3 6 4 7 1 N1|"
+                "N3 3 7 2 6 1 5 4 0 N0",
+  .ringTail1  = "N5 5 4 2 7 1 6 3 0 N0|"
+                "N2 2 5 0 3 7 4 6 1 N1|"
+                "N3 3 6 4 0 5 1 7 2 N2|"
+                "N4 4 7 0 6 5 2 1 3 N3|"
+                "N6 6 2 0 7 5 3 1 4 N4|"
+                "N7 7 3 2 6 0 4 1 5 N5|"
+                "N1 1 0 2 4 3 5 7 6 N6|"
+                "N0 0 1 2 3 4 5 6 7 N7|"
+                "N5 5 4 2 7 1 6 3 0 N0|"
+                "N2 2 5 0 3 7 4 6 1 N1|"
+                "N3 3 6 4 0 5 1 7 2 N2|"
+                "N4 4 7 0 6 5 2 1 3 N3|"
+                "N6 6 2 0 7 5 3 1 4 N4|"
+                "N7 7 3 2 6 0 4 1 5 N5|"
+                "N1 1 0 2 4 3 5 7 6 N6|"
+                "N0 0 1 2 3 4 5 6 7 N7",
+  .options    = "noCpuCheck=1,tuning=5",
 };
 
 static struct rcclRomeModel rome_model_84 = {
@@ -609,49 +674,49 @@ static struct rcclRomeModel rome_model_85 = {
 
 
 static struct rcclRomeModel romeTopoModels[] = {
-  rome_model_22,
-  rome_model_25,
-  rome_model_27,
-  rome_model_29,
-  rome_model_31,
-  rome_model_33,
-  rome_model_30,
-  rome_model_32,
-  rome_model_24,
-  rome_model_26,
-  rome_model_23,
-  rome_model_38,
-  rome_model_28,
-  rome_model_40,
-  rome_model_42,
-  rome_model_44,
-  rome_model_45,
-  rome_model_46,
-  rome_model_48,
-  rome_model_49,
-  rome_model_52,
-  rome_model_53,
-  rome_model_43,
-  rome_model_55,
-  rome_model_56,
-  rome_model_58,
-  rome_model_59,
-  rome_model_62,
-  rome_model_63,
-  rome_model_65,
-  rome_model_66,
-  rome_model_67,
-  rome_model_68,
-  rome_model_71,
-  rome_model_72,
-  rome_model_73,
-  rome_model_74,
-  rome_model_76,
-  rome_model_79,
-  rome_model_80,
-  rome_model_81,
-  rome_model_84,
-  rome_model_85,
+  rome_model_22, /* 0 */
+  rome_model_25, /* 1 */
+  rome_model_27, /* 2 */
+  rome_model_29, /* 3 */
+  rome_model_31, /* 4 */
+  rome_model_33, /* 5 */
+  rome_model_30, /* 6 */
+  rome_model_32, /* 7 */
+  rome_model_24, /* 8 */
+  rome_model_26, /* 9 */
+  rome_model_23, /* 10 */
+  rome_model_38, /* 11 */
+  rome_model_28, /* 12 */
+  rome_model_40, /* 13 */
+  rome_model_42, /* 14 */
+  rome_model_44, /* 15 */
+  rome_model_45, /* 16 */
+  rome_model_46, /* 17 */
+  rome_model_48, /* 18 */
+  rome_model_49, /* 19 */
+  rome_model_52, /* 20 */
+  rome_model_53, /* 21 */
+  rome_model_43, /* 22 */
+  rome_model_55, /* 23 */
+  rome_model_56, /* 24 */
+  rome_model_58, /* 25 */
+  rome_model_59, /* 26 */
+  rome_model_62, /* 27 */
+  rome_model_63, /* 28 */
+  rome_model_65, /* 29 */
+  rome_model_66, /* 30 */
+  rome_model_67, /* 31 */
+  rome_model_68, /* 32 */
+  rome_model_71, /* 33 */
+  rome_model_72, /* 34 */
+  rome_model_73, /* 35 */
+  rome_model_74, /* 36 */
+  rome_model_76, /* 37 */
+  rome_model_79, /* 38 */
+  rome_model_80, /* 39 */
+  rome_model_81, /* 40 */
+  rome_model_84, /* 41 */
+  rome_model_85, /* 42 */
 };
 
 /* Parse user defined rings. Format is like :
@@ -660,7 +725,7 @@ static struct rcclRomeModel romeTopoModels[] = {
  * Rings with a non-matching number of gpus are ignored so we can provide
  * rings for multiple cases.
  */
-ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int* net_map) {
+ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int* net_map, int reverse) {
   int gpus[NCCL_TOPO_MAX_NODES];
   int nChannels = 0;
   int gpu = 0;
@@ -725,8 +790,10 @@ ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct n
           for (j = 0; j < ngpus; j++)
             if (g == system->nodes[GPU].nodes[j].gpu.dev)
               break;
-          if (j < ngpus)
-            graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
+          if (j < ngpus) {
+            int idx = (nChannels*ngpus) + (reverse ? ngpus - 1 - r : r);
+            graph->intra[idx] = system->nodes[GPU].nodes[j].gpu.rank;
+          }
           else
             return ncclInternalError;
         }
@@ -736,6 +803,12 @@ ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct n
             if (nets[i]-'N' < 0 || nets[i]-'N' >= nnets) continue;
             nets[i] = net_map[nets[i]-'N']+'N';
           }
+          // Swap input/output NICs if reversed
+          if (reverse) {
+            int t = nets[0];
+            nets[0] = nets[ngpus*2-1];
+            nets[ngpus*2-1] = t;
+          }
           memcpy(&graph->intraNets[ngpus*nChannels*2], nets, ngpus*2*sizeof(int));
           graph->nIntraChannels++;
           if (nets[0]-'N' >= nnets || nets[ngpus*2-1]-'N' >= nnets) goto newchannel;
@@ -764,11 +837,12 @@ end:
       graph->inter[i*2+1] = net;
     }
   }
+
 #if 0
   for (int i=0; i<graph->nChannels; i++) {
-    printf("%d: ", i);
-    printf ("NET/%d ", graph->inter[i*2]);
-    for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
+    printf("Channel %02d: ", i);
+    printf ("NET/%02d ", graph->inter[i*2]);
+    for (int j=0; j<ngpus; j++) printf("GPU/%02d ", graph->intra[i*ngpus+j]);
     printf ("NET/%d ", graph->inter[i*2+1]);
     printf("\n");
   }
@@ -972,7 +1046,7 @@ ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGrap
   }
   // create chordal ring based on reference and remapped ids
   system->type |= RCCL_TOPO_CR8G;
-  NCCLCHECK(parseGraph(ringBase, system, graph, id, NULL));
+  NCCLCHECK(parseGraph(ringBase, system, graph, id, NULL, false));
   if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
     int *intra, *used;
     graph->nChannels = system->nodes[NET].count;
@@ -1257,6 +1331,11 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
   int ncpus = system->nodes[CPU].count;
   int nnets = system->nodes[NET].count;
 
+  // Only support ring and tree graphs
+  if (graph->pattern != NCCL_TOPO_PATTERN_RING &&
+      graph->pattern != NCCL_TOPO_PATTERN_BALANCED_TREE)
+    return ncclSuccess;
+
   if (ngpus > 8) return ncclSuccess;
   // only valid on Rome
   int arch, vendor, model;
@@ -1365,8 +1444,42 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
   parseOptions(system, romeTopoModels[i].options);
 
   // create 4P2H based on reference and remapped ids
-  NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL));
-  if (romeTopoModels[i].treeBase != nullptr) NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
+  switch (graph->pattern) {
+  case NCCL_TOPO_PATTERN_RING:
+    // Attempt to use rail-optimized rings if they exist
+    if (system->nHosts % 2 == 0) {
+      // For even number of nodes, alternate forward/reverse on ringBase
+      NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2));
+    }
+    else {
+      // For odd number of nodes, check first to see if ringTail1 and ringTail2 are defined
+      if (system->nHosts == 1 || romeTopoModels[i].ringTail1 == nullptr || romeTopoModels[i].ringTail2 == nullptr) {
+        if (system->nHosts > 1)
+          INFO(NCCL_GRAPH, "[WARN] Dropping back due to lack of support for odd-number of nodes for model index %d", i);
+        NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2));
+      }
+      else
+      {
+        if (system->hostIdx == (system->nHosts - 1)) {
+          NCCLCHECK(parseGraph(romeTopoModels[i].ringTail1, system, graph, g, nnets > 1 ? n : NULL, 0));
+        } else if (system->hostIdx == (system->nHosts - 2)) {
+          NCCLCHECK(parseGraph(romeTopoModels[i].ringTail2, system, graph, g, nnets > 1 ? n : NULL, 0));
+        } else {
+          NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2));
+        }
+      }
+    }
+    break;
+  case NCCL_TOPO_PATTERN_BALANCED_TREE:
+    if (romeTopoModels[i].treeBase != nullptr) {
+      NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
+      if (graph->nChannels) return ncclSuccess;
+    }
+
+    // Fall back to tree from ringBase
+    NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, 0));
+    break;
+  }
 
   return ncclSuccess;
 }
@@ -1499,7 +1612,7 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
   parseOptions(system, romeTopoModels[i].options);
 
   // create 16P1H based on reference and remapped ids
-  NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g16, nnets > 1 ? n : NULL));
+  NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g16, nnets > 1 ? n : NULL, false));
   if (romeTopoModels[i].treeBase != nullptr) NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g16));
 
   // clean up
@@ -1601,6 +1714,6 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
   system->type |= RCCL_TOPO_4P2H_ROME;
   parseOptions(system, rome_model_68.options);
   // create 4P4H based on reference and remapped ids
-  NCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
+  NCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives, false));
   return ncclSuccess;
 }
diff --git a/projects/rccl/src/graph/rome_models.h b/projects/rccl/src/graph/rome_models.h
index 1d7f27daa6..0a33a10e93 100644
--- a/projects/rccl/src/graph/rome_models.h
+++ b/projects/rccl/src/graph/rome_models.h
@@ -22,11 +22,11 @@ THE SOFTWARE.
 #ifndef RCCL_ROME_MODELS_H_
 #define RCCL_ROME_MODELS_H_
 
-ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int* net_map);
+ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int* net_map, int reverse);
 ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map);
 ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
 ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
 ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
 ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
 
-#endif
\ No newline at end of file
+#endif
diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc
index 1f8bec0faa..72217a742d 100644
--- a/projects/rccl/src/graph/search.cc
+++ b/projects/rccl/src/graph/search.cc
@@ -946,7 +946,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
       NCCLCHECK(parseGraphLight(strTrees, system, graph, NULL));
       system->treeDefined=true;
     } else {
-      NCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
+      NCCLCHECK(parseGraph(str, system, graph, NULL, NULL, false));
       int arch, vendor, model;
       NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
       if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
@@ -959,6 +959,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
     if (graph->nChannels) return ncclSuccess;
     // try to match Rome 4P2H
     NCCLCHECK(parseRome4P2H(system, graph));
+
     if (graph->nChannels) return ncclSuccess;
     // try to match 1H16P
     NCCLCHECK(parse1H16P(system, graph));
diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h
index fc6336cb5d..fabb0a798a 100644
--- a/projects/rccl/src/graph/topo.h
+++ b/projects/rccl/src/graph/topo.h
@@ -177,6 +177,10 @@ struct ncclTopoSystem {
   bool ll128Enabled;
   float baseBw;
   bool mscclEnabled;
+
+  // [RCCL] Track hostIdx and number of hosts to support rail-optimized rings/trees
+  int nHosts;
+  int hostIdx;
 };
 
 ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
diff --git a/projects/rccl/src/graph/xml.cc b/projects/rccl/src/graph/xml.cc
index 59cba85408..6af3d0db27 100644
--- a/projects/rccl/src/graph/xml.cc
+++ b/projects/rccl/src/graph/xml.cc
@@ -307,8 +307,11 @@ ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclX
   }
   const char* name;
   NCCLCHECK(xmlGetAttr(head, "name", &name));
-  if (name != NULL) INFO(NCCL_GRAPH, "Loading topology %s", name);
-  else INFO(NCCL_GRAPH, "Loading unnamed topology");
+  if (name != NULL) {
+    INFO(NCCL_GRAPH, "Loading topology %s", name);
+  } else {
+    INFO(NCCL_GRAPH, "Loading unnamed topology");
+  }
 
   struct xmlHandler handlers[] = { { "cpu", ncclTopoXmlLoadCpu } };
   NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 324b80a245..08c19f5f3a 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -1060,6 +1060,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       goto fail;
     }
   }
+
   // AllGather1 - end
 
   do {
@@ -1159,6 +1160,27 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   // Determine local Nvls support
   NCCLCHECK(ncclNvlsInit(comm));
 
+  // [RCCL] Compute hostIdx (based on hostHash)
+  {
+    comm->topo->nHosts = 0;
+    for (int r = 0; r < nranks; r++) {
+      int isNewHost = 1;
+      // Check if this is the first time this hostname has been used
+      for (int i = 0; i < r && isNewHost; i++) {
+        if (comm->peerInfo[i].hostHash == comm->peerInfo[r].hostHash) {
+          isNewHost = 0;
+        }
+      }
+      if (isNewHost)
+      {
+        // Check if this is the same hostname associated with this rank
+        if (comm->peerInfo[r].hostHash == comm->peerInfo[rank].hostHash)
+          comm->topo->hostIdx = comm->topo->nHosts;
+        comm->topo->nHosts++;
+      }
+    }
+  }
+
   // Get rings and trees
   memset(&ringGraph, 0, sizeof(struct ncclTopoGraph));
   ringGraph.id = 0;
diff --git a/projects/rccl/tools/scripts/topo_val.sh b/projects/rccl/tools/scripts/topo_val.sh
index cfdb668f94..954a2be4a7 100755
--- a/projects/rccl/tools/scripts/topo_val.sh
+++ b/projects/rccl/tools/scripts/topo_val.sh
@@ -21,7 +21,7 @@
 
 DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-for i in {0..81}
+for i in {0..82}
 do
   if [[ $i -eq 50 ]] || [[ $i -eq 51 ]]
   then
diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp
index 2d8d18312b..c1b6ffb729 100644
--- a/projects/rccl/tools/topo_expl/topo_expl.cpp
+++ b/projects/rccl/tools/topo_expl/topo_expl.cpp
@@ -158,6 +158,7 @@ NodeModelDesc model_descs[] = {
   {2, "topo_8p_940vm.xml", "2 nodes gfx940 VM"},
   {2, "topo_8p_940_16n.xml", "2 nodes gfx940 16 NICs"},
   {2, "topo_8p1h_6.xml", "2 nodes 8P1H Alt."},
+  {5, "topo_8p_940.xml", "5 nodes gfx940 8P"},
 };
 
 NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);
diff --git a/projects/rccl/tools/topo_expl/utils.cpp b/projects/rccl/tools/topo_expl/utils.cpp
index 3d9de6a0d4..19bb504555 100644
--- a/projects/rccl/tools/topo_expl/utils.cpp
+++ b/projects/rccl/tools/topo_expl/utils.cpp
@@ -823,6 +823,29 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
   // Determine local Nvls support
   //NCCLCHECK(ncclNvlsInit(comm));
 
+  // [RCCL] Compute hostIdx (based on hostHash)
+  {
+    comm->topo->nHosts = 0;
+    for (int r = 0; r < nranks; r++) {
+      int isNewHost = 1;
+      // Check if this is the first time this hostname has been used
+      for (int i = 0; i < r && isNewHost; i++) {
+        if (comm->peerInfo[i].hostHash == comm->peerInfo[r].hostHash)
+          isNewHost = 0;
+      }
+      if (isNewHost)
+      {
+        // Check if this is the same hostname associated with this rank
+        if (comm->peerInfo[r].hostHash == comm->peerInfo[rank].hostHash)
+        {
+          comm->topo->hostIdx = comm->topo->nHosts;
+          printf("Rank %d is on host %d\n", rank, comm->topo->hostIdx);
+        }
+        comm->topo->nHosts++;
+      }
+    }
+  }
+
   // Get rings and trees
   ringGraph.id = 0;
   ringGraph.pattern = NCCL_TOPO_PATTERN_RING;