/************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "channel.h" #include "nvmlwrap.h" #include "bootstrap.h" #include "transport.h" #include "group.h" #include "net.h" #include "graph.h" #include "argcheck.h" #include #include #include #include #include #include #include #include #include #include #include #include "xml.h" #include "coll_net.h" #include "model.h" #include "utils.h" const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "Gather", "Scatter", "AllToAll", "AllToAllv" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; extern NodeModel *node_model; NCCL_PARAM(CrossNic, "CROSS_NIC", 2); NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); thread_local int ncclDebugNoWarn = 0; ncclCollNet_t* ncclCollNet = NULL; // Get current Compute Capability int ncclCudaCompCap() { int ccMajor = 1, ccMinor = 0; return ccMajor*10+ccMinor; } ncclResult_t int64ToBusId(int64_t id, char* busId) { sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); return ncclSuccess; } ncclResult_t busIdToInt64(const char* busId, int64_t* id) { const int size = strlen(busId); char* hexStr; NCCLCHECK(ncclCalloc(&hexStr, size)); int hexOffset = 0; for (int i=0; i= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) { hexStr[hexOffset++] = busId[i]; } else break; } hexStr[hexOffset] = '\0'; *id = strtol(hexStr, NULL, 16); free(hexStr); return ncclSuccess; } int ncclDebugLevel = -1; void ncclDebugInit() { if (ncclDebugLevel != -1) return; const char* nccl_debug = getenv("NCCL_DEBUG"); if (nccl_debug == NULL) { ncclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { ncclDebugLevel = NCCL_LOG_VERSION; } else if (strcasecmp(nccl_debug, "WARN") == 0) { ncclDebugLevel = NCCL_LOG_WARN; } else if (strcasecmp(nccl_debug, "INFO") == 0) { ncclDebugLevel = NCCL_LOG_INFO; } else if (strcasecmp(nccl_debug, "ABORT") == 0) { ncclDebugLevel = NCCL_LOG_ABORT; } else if (strcasecmp(nccl_debug, "TRACE") == 0) { ncclDebugLevel = NCCL_LOG_TRACE; } } void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { if (ncclDebugLevel == -1) ncclDebugInit(); if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return; char buffer[1024]; size_t len = 0; if (node_model) len = snprintf(buffer, sizeof(buffer), "[%d:%d] ", node_model->nodeId, node_model->currRank); va_list args; va_start(args, fmt); vsprintf(buffer+len, fmt, args); va_end(args); printf("%s\n", buffer); if (level == NCCL_LOG_WARN) { fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n", node_model->nodeId, node_model->currRank, filefunc, line); abort(); } } ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system) { struct ncclXml* xml; NCCLCHECK(ncclCalloc(&xml, 1)); NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml)); NCCLCHECK(ncclTopoGetSystemFromXml(xml, system)); free(xml); return ncclSuccess; } ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) { // AllGather1 - begin allGather1Data[comm->rank].peerInfo.rank = comm->rank; allGather1Data[comm->rank].peerInfo.cudaDev = node_model->rankToCudaDev(comm->rank); allGather1Data[comm->rank].peerInfo.gdrSupport = 1; allGather1Data[comm->rank].peerInfo.hostHash = node_model->hostHash; allGather1Data[comm->rank].peerInfo.pidHash = node_model->pidHash; allGather1Data[comm->rank].peerInfo.shmDev = 0x19; allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(comm->rank); return ncclSuccess; } ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { // We use 3 AllGathers // 1. { peerInfo, comm } // 2. ConnectTransport[nranks], ConnectValue[nranks] // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] } int rank = comm->rank; int nranks = comm->nRanks; //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); //NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); // AllGather1 - begin //struct allGather1Data_t *allGather1Data; //NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); //allGather1Data[rank].comm = comm; struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo; //NCCLCHECK(fillInfo(comm, myInfo, commHash)); //NCCLCHECK(bootstrapAllGather(comm, allGather1Data)); NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); for (int i = 0; i < nranks; i++) { memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId); return ncclInvalidUsage; } } // AllGather1 data is used again below // AllGather1 - end // Topo detection / System graph creation //NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); // Compute paths between GPUs and NICs NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); // Remove inaccessible GPUs and unused NICs NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); // Recompute paths after trimming NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); // Init search NCCLCHECK(ncclTopoSearchInit(comm->topo)); // Print final topology NCCLCHECK(ncclTopoPrint(comm->topo)); // Get rings and trees //struct ncclTopoGraph ringGraph; ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; ringGraph.crossNic = ncclParamCrossNic(); ringGraph.collNet = 0; ringGraph.minChannels = 1; ringGraph.maxChannels = MAXCHANNELS/2; NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph)); //struct ncclTopoGraph treeGraph; treeGraph.id = 1; treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; treeGraph.crossNic = ncclParamCrossNic(); treeGraph.collNet = 0; treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels; treeGraph.maxChannels = ringGraph.nChannels; NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); //struct ncclTopoGraph collNetGraph; collNetGraph.id = 2; collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; collNetGraph.crossNic = ncclParamCrossNic(); collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); if (comm->rank == ncclParamGraphDumpFileRank()) { struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); } // AllGather3 - begin int idx; NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx)); allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap; allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn; allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); allGather3Data[rank].alltoallDisable = comm->alltoallDisable; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; allGather3Data[rank].tree.speedInter = treeGraph.speedInter; allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; allGather3Data[rank].ring.speedInter = ringGraph.speedInter; allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra; allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter; allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks)); //INFO(NCCL_GRAPH, "%d: nvlink %d nChannels %d tree.sameChannels %d tree.speedIntra %d tree.speedInter %d tree.nvlink %d ring.sameChannels %d ring.speedIntra %d ring.speedInter %d ring.nvlink %d", // rank, allGather3Data[rank].nvlink, allGather3Data[rank].nChannels, allGather3Data[rank].tree.sameChannels, allGather3Data[rank].tree.speedIntra, allGather3Data[rank].tree.speedInter, allGather3Data[rank].tree.nvlink, // allGather3Data[rank].ring.sameChannels, allGather3Data[rank].ring.speedIntra, allGather3Data[rank].ring.speedInter, allGather3Data[rank].ring.nvlink); //INFO(NCCL_GRAPH, " ringRecv %d ringSend %d ringPrev %d ringNext %d treeUpRecv %d treeUpSend %d treeDnRecv %d treeDnSend %d", // allGather3Data[rank].topoRanks.ringRecv[0], allGather3Data[rank].topoRanks.ringSend[0], allGather3Data[rank].topoRanks.ringPrev[0], allGather3Data[rank].topoRanks.ringNext[0], // allGather3Data[rank].topoRanks.treeUpRecv[0], allGather3Data[rank].topoRanks.treeUpSend[0], allGather3Data[rank].topoRanks.treeDnRecv[0], allGather3Data[rank].topoRanks.treeDnSend[0]); return ncclSuccess; } ncclResult_t initChannel(struct ncclComm* comm, int channelid) { struct ncclChannel* channel = comm->channels+channelid; channel->id = channelid; // Setup intermediate buffering //int buffSize = ncclParamBuffsize(); int cpuArch, cpuVendor, cpuModel; NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); //channel->buffSize = buffSize != -2 ? buffSize : // cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES; // Ring index to user rank table. //NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); // Communication structures with peers. //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); for (size_t i=0; inRanks+1; ++i) { channel->peers[i].send.comm = comm; channel->peers[i].recv.comm = comm; } // Per-channel operation list. //NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS)); return ncclSuccess; } static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); NCCLCHECK(initChannel(comm, channelId)); struct ncclRing* ring = &comm->channels[channelId].ring; // Reorganize ranks to start with rank. int shift; for (shift = 0; shiftuserRanks[i] = ringRanks[(i+shift)%nranks]; } return ncclSuccess; } template static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) { for (int t=0; tsend : &transport->recv; int ret = 0; NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo)); if (ret) { connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId)); return ncclSuccess; } } WARN("No transport found !"); return ncclInternalError; } ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ struct ncclConnect connect; struct ncclConnector* conn; for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].recv; if (conn->connected) { ++nSkippedRecv; continue; } memset(&connect, 0, sizeof(connect)); NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id)); //NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].send; if (conn->connected) { ++nSkippedSend; continue; } memset(&connect, 0, sizeof(connect)); NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id)); //NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].send; if (conn->connected) {++nSkippedSend; continue; } memset(&connect, 0, sizeof(connect)); //NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); //NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); conn->connected = 1; //CUDACHECK(hipMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); } for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].recv; if (conn->connected) {++nSkippedRecv; continue; } memset(&connect, 0, sizeof(connect)); //NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); //NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); conn->connected = 1; //CUDACHECK(hipMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); } TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); return ncclSuccess; } RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 0); ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { int rank = comm->rank; int nranks = comm->nRanks; //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); // Determine nNodes, firstRanks, ... int* nodesFirstRank; NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks)); for (int i=0; inNodes; n++) { if (nodesFirstRank[n] == firstRank) node = n; } if (node == -1) { node = comm->nNodes++; nodesFirstRank[node] = firstRank; } if (i == comm->rank) comm->node = node; } char line[1024]; sprintf(line, "nodesFirstRank: "); int offset = strlen(line); for (int i=0; inNodes; i++) { sprintf(line+offset, "%d ", nodesFirstRank[i]); offset = strlen(line); } INFO(NCCL_INIT, "%s", line); // Determine the minimum CUDA Compute capability of all GPUs int myCompCap = allGather3Data[rank].cudaCompCap; int minCompCap = myCompCap, maxCompCap = myCompCap; for (int i = 0; i < nranks; i++) { minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap); maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap); } int nChannelsOrig = comm->nChannels; struct ncclTopoRanks** allTopoRanks; NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); int gcn = allGather3Data[0].gcn; int alltoallDisable = 0; for (int i=0; inChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); } if (comm->alltoallDisable != alltoallDisable) { comm->alltoallDisable = alltoallDisable; } INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled"); // count NETs used by ring int nNets = 0; int nets[MAXCHANNELS*2]; for (int i = 0; i < ringGraph.nChannels; i++) { for (int j = 0; j < 2; j++) { int k; for (k = 0; k < nNets; k++) if (nets[k] == ringGraph.inter[i*2+j]) break; if (k >= nNets) { nets[nNets] = ringGraph.inter[i*2+j]; nNets++; } } } if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. for (int i=0; inChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets)); if (comm->nNodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() && collNetGraph.nChannels) { NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank)); } free(allTopoRanks); free(nodesFirstRank); //free(allGather3Data); // AllGather3 - end TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); line[0]='\0'; for (int c=0; cnChannels; c++) { struct ncclTree* treeUp = &comm->channels[c].treeUp; struct ncclTree* treeDn = &comm->channels[c].treeDn; snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d", c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up, treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]); INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next); } line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); // Set Affinity to a CPU local the our GPU, so that all memory we allocate // on the host is local. cpu_set_t affinitySave; sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank)); ncclResult_t ret; //NCCLCHECK(computeBuffSizes(comm)); // Connect with prev/next for each ring struct ncclConnect *connect; NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore); for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore); NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore); NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore); } // Check if we can setup CollNet #if 0 if (comm->nNodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport()) { int logicChannels = comm->nChannels/2; int collNetSetupFail = 0; const int recvIndex = 0; // recv GPU index is always 0 const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern for (int c=0; cchannels+logicChannels+c; struct ncclChannel* channelSend = comm->channels+c; NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down)); NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up)); const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex]; const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex]; if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1) collNetSetupFail = 1; else if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1) collNetSetupFail = 1; } // Verify CollNet setup across ranks NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail)); } #endif TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); free(connect); free(rings); // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); if (!alltoallDisable) { int nc = comm->nChannels; if (comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 2; for (int c=0; c= nranks ? 1 : DIVUP(nranks, nc)); struct ncclP2PConnect* connect = &comm->p2plist.connect; connect->nrecv[c] = 0; connect->nsend[c] = 0; for (int p=0; pchannels[c].peers[peerSend].send.connected == 0) { connect->send[c*nranks+connect->nsend[c]++] = peerSend; } if (comm->channels[c].peers[peerRecv].recv.connected == 0) { connect->recv[c*nranks+connect->nrecv[c]++] = peerRecv; } } } for (int c=0; cchannels+c; struct ncclP2PConnect* connect = &comm->p2plist.connect; #if 0 printf("channel %d recv: ", c); for (int i=0; inrecv[c]; i++) printf("%d ", connect->recv[c*nranks+i]); printf("\n"); printf("channel %d send: ", c); for (int i=0; insend[c]; i++) printf("%d ", connect->send[c*nranks+i]); printf("\n"); #endif NCCLCHECK(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*nranks, connect->nsend[c], connect->send+c*nranks)); connect->nrecv[c] = 0; connect->nsend[c] = 0; } } // We should have allocated all buffers, collective fifos, ... we can // restore the affinity. affinity_restore: sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); if (ret != ncclSuccess) return ret; return ncclSuccess; }