From 767fde8210160ced688f494e20c37873d23bcaed Mon Sep 17 00:00:00 2001 From: PedramAlizadeh Date: Fri, 12 Jan 2024 16:54:19 +0000 Subject: [PATCH] Revert "2.18.5-1" This reverts commit 559b70f86c190a0d8f67f0d7a0f2c9810dd1e8c7. --- src/graph/paths.cc | 4 +- src/graph/search.cc | 32 +++------- src/graph/topo.cc | 137 +++++++++++++++++++++++++++++----------- src/graph/topo.h | 9 --- src/graph/xml.cc | 13 +--- src/transport/net_ib.cc | 2 +- 6 files changed, 115 insertions(+), 82 deletions(-) diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 41fb1ead74..30de6a2d95 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -866,7 +866,9 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { // fill the whole space of nChannels. To do so we mirror the bits in the // nChannels space. for (int c=0; cp2pnChannels; c++) { - comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels); + int mirror = 0; + for (int b=1, mb=(comm->p2pnChannels>>1); bp2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; + comm->p2pChannels[c] = mirror; } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 406383bef9..23cb9dbc59 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -412,28 +412,6 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in int* localNets; NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count)); - // First add the preferred NICs - for (int g=0; gnodes[GPU].count; g++) { - if (gpu != -1 && gpu != g) continue; - localNetCount = 0; - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - for (int c = 0;; c++) { - int netId; - NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); - if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; - localNetCount++; - } - // Append NICs to list - for (int i=0; inodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; @@ -443,6 +421,14 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int n=0; nnodes[NET].count; n++) { if (paths[n].type == t) localNets[localNetCount++] = n; } + if (localNetCount == 0) continue; + // Shuffle by gpu NVML device number so that GPUs on the same PCI switch + // with multiple NICs don't use the same one as first choice. + for (int r=0; rnodes[GPU].nodes[g].gpu.dev % localNetCount; r++) { + int net0 = localNets[0]; + for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS) { if (graph->nChannels < netcount) { int gpu; - NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu)); + NCCLCHECK(ncclTopoGetLocalGpu(system, nets[graph->nChannels], &gpu)); if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu)); } } else { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 0cac204dac..dd6d01d3c0 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -758,61 +758,126 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy return ncclSuccess; } -ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { +static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) { int minType = PATH_DIS; float maxBw = 0; int count = 0; - NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); - struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; - for (int i=0; inodes[resultType].count; i++) { - if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) { - maxBw = paths[i].bw; - minType = paths[i].type; - if (pathType) *pathType = minType; + int* nets; + NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + for (int n=0; nnodes[NET].count; n++) { + struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g; + if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) { + maxBw = path->bw; + minType = path->type; + if (type) *type = minType; count = 0; } - if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i; + if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } - *localCount = count; + + *localNetMask = 0ULL; + for (int n=0; n= 64) return ncclInternalError; + *localNetMask |= 1ULL<nodes[GPU].count; + NCCLCHECK(ncclCalloc(&localNetMasks, ngpus)); + + // Fill localNetMasks for all GPUs. + for (int g=0; gnodes[GPU].nodes[gpu].gpu.dev; - if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); - if (localNetCount == 0) { - *id = -1; - } else { - net += channelId%(DIVUP(localNetCount,localGpuCount)); - *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; + int netLocalGpus = 0, netLocalGpu = 0; + for (int g=0; gnodes[GPU].count; g++) { - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - int id; - NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id)); - if (net == id) { - *gpuIndex = g; - return ncclSuccess; + int ngpus = system->nodes[GPU].count; + int* gpus; + NCCLCHECK(ncclCalloc(&gpus, ngpus)); + + // Find localNetMask which includes net with the most local GPUs. + int netLocalGpus = 0, minType = PATH_DIS; + uint64_t localNetMask = 0ULL; + for (int g=0; g>1); b>=1) if (val & b) mirror |= mb; - return mirror; -} #endif diff --git a/src/graph/xml.cc b/src/graph/xml.cc index c72d6c94f1..902477a275 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -592,18 +592,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* } } pciNode->parent = parent; - // Keep PCI sub devices ordered by PCI Bus ID (Issue #820) - int subIndex = parent->nSubs; - const char* newBusId; - NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId)); - for (int s=0; snSubs; s++) { - const char* busId; - NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId)); - if (strcmp(newBusId, busId) < 0) { subIndex = s; break; } - } - for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1]; - parent->subs[subIndex] = pciNode; - parent->nSubs++; + parent->subs[parent->nSubs++] = pciNode; } if (strcmp(parent->name, "pci") == 0) { NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index c54418f608..d8e7c217c7 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -68,7 +68,7 @@ struct userIbDev { uint16_t port_en; }; -#define MAX_IB_DEVS 32 +#define MAX_IB_DEVS 16 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; struct userIbDev userIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;