diff --git a/makefiles/version.mk b/makefiles/version.mk index 4a82cb951b..6a1deca40b 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 7 -NCCL_PATCH := 3 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/graph/search.cc b/src/graph/search.cc index 5dce974373..cedf71522d 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -947,8 +947,8 @@ done: int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); - graph->speedIntra /= 2; - graph->speedInter /= 2; + graph->speedIntra /= DIVUP(dupChannels, graph->nChannels); + graph->speedInter /= DIVUP(dupChannels, graph->nChannels); graph->nChannels = dupChannels; } return ncclSuccess; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 192dd9aa6c..b7399d6909 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -108,7 +108,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int p=0; pnNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter; float busBw = graphs[a]->nChannels * speed; - if (compCap80) busBw *= 0.92; // Various model refinements if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0; diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 2970efe860..2d26163c7f 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -687,9 +687,14 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm if (index == -1) { const char* busId; NCCLCHECK(xmlGetAttr(sub, "target", &busId)); - char* path; - NCCLCHECK(getPciPath(busId, &path)); - NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); + if (strcmp(busId, "fffffff:ffff:ff") == 0) { + // Remote NVLink device is not visible inside this VM. Assume NVSwitch. + NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000")); + } else { + char* path; + NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); + } } } *gpuNodeRet = gpuNode; diff --git a/src/group.cc b/src/group.cc index 09ae4eff4c..0385ccf44e 100644 --- a/src/group.cc +++ b/src/group.cc @@ -146,7 +146,10 @@ void* ncclAsyncThreadPreconnect(void* args_) { NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { - if (ncclGroupMode == 0) return ncclInvalidUsage; + if (ncclGroupMode == 0) { + WARN("ncclGroupEnd: not in a group call."); + return ncclInvalidUsage; + } ncclGroupMode--; if (ncclGroupMode > 0) return ncclSuccess; int savedDev;