From 5fa796af0c2ec32523e6332eea10ee3ea22dd442 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Mon, 26 Feb 2024 02:52:26 -0800 Subject: [PATCH] 2.20.5-1 Fix UDS connection failure when using ncclCommSplit. Issue #1185 [ROCm/rccl commit: 48bb7fec7953112ff37499a272317f6663f8f600] --- projects/rccl/makefiles/version.mk | 2 +- projects/rccl/src/bootstrap.cc | 10 ++++++++-- projects/rccl/src/graph/search.cc | 2 +- projects/rccl/src/init.cc | 7 ++++--- projects/rccl/src/proxy.cc | 6 ++++-- projects/rccl/src/transport/net_ib.cc | 2 +- 6 files changed, 19 insertions(+), 10 deletions(-) diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index ab4fd3c9f7..20f199141f 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 20 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc index a1475d375e..bbcabc8770 100644 --- a/projects/rccl/src/bootstrap.cc +++ b/projects/rccl/src/bootstrap.cc @@ -305,7 +305,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank)); NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress))); // cuMem UDS support - state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash; + // Make sure we create a unique UDS socket name + uint64_t randId; + NCCLCHECK(getRandomData(&randId, sizeof(randId))); + state->peerProxyAddressesUDS[rank] = getPidHash()+randId; NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS))); NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS)); @@ -371,7 +374,10 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail); // cuMem UDS support NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail); - state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash; + // Make sure we create a unique UDS socket name + uint64_t randId; + NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail); + state->peerProxyAddressesUDS[rank] = getPidHash()+randId; NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail); NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); } diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc index c3287b0bae..b189165eb4 100644 --- a/projects/rccl/src/graph/search.cc +++ b/projects/rccl/src/graph/search.cc @@ -1077,7 +1077,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i int localRanks = comm->topo->nodes[GPU].count; int netNum = 0; int net[MAXCHANNELS]; - + for (int c = 0; c < graph->nChannels; c++) { if (graph->intra[c * localRanks] == comm->rank) { net[netNum++] = graph->inter[c * 2]; diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 39d0213b36..060fa8b9d6 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -826,6 +826,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; + int nNodes = 1; cpu_set_t affinitySave; struct ncclTopoGraph ringGraph; struct ncclTopoGraph treeGraph; @@ -865,6 +866,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); for (int i = 0; i < nranks; i++) { + if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++; if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); ret = ncclInvalidUsage; @@ -879,7 +881,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p #include "cudawrap.h" // MNNVL support - { + if (nNodes > 1) { int cliqueSize = 0; comm->MNNVL = 0; // Determine the size of the MNNVL domain/clique @@ -1485,15 +1487,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); - comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail); } else { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); - comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail); } comm->cudaArch = cudaArch; + comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); diff --git a/projects/rccl/src/proxy.cc b/projects/rccl/src/proxy.cc index b2b488264d..7e25e3525c 100644 --- a/projects/rccl/src/proxy.cc +++ b/projects/rccl/src/proxy.cc @@ -1660,6 +1660,9 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union comm->proxyState->listenSock = sock; comm->proxyState->peerAddresses = peerAddresses; comm->proxyState->peerAddressesUDS = peerAddressesUDS; + + // UDS support + NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag)); // Seed the random number generator for UDS filename generation struct timeval time; gettimeofday(&time,NULL); @@ -1693,8 +1696,7 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev); // UDS support - INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d pidHash %lx", comm, comm->rank, comm->peerInfo[comm->rank].pidHash); - NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, comm->peerInfo[comm->rank].pidHash, comm->abortFlag)); + INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank); pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState); ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev); } diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index 43b110956d..97dc60c493 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -853,7 +853,7 @@ ib_connect_check: // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", - comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, + comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(), devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey); }