Merge branch 'develop' into 2.10.3

Этот коммит содержится в:
Wenkai Du
2021-08-04 09:45:22 -07:00
родитель bf2339f93e 2d0ed8dff6
Коммит babbd1047b
2 изменённых файлов: 4 добавлений и 13 удалений
+2 -13
Просмотреть файл
@@ -8,7 +8,6 @@
#include "comm.h"
#include "info.h"
#include "bootstrap.h"
#include "../graph/topo.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -20,16 +19,6 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
netTransport,
};
static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 0;
if (info1->hostHash != info2->hostHash) return ncclSuccess;
int g1, g2;
NCCLCHECK(ncclTopoRankToIndex(system, info1->rank, &g1));
NCCLCHECK(ncclTopoRankToIndex(system, info2->rank, &g2));
if (system->nodes[GPU].nodes[g1].paths[GPU][g2].type == PATH_NVL) *ret = 1;
return ncclSuccess;
}
template <int type>
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) {
struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
@@ -44,8 +33,8 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 0 : 1, &n2));
}
int xgmi;
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, myInfo, peerInfo));
bool xgmi;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
for (int t=0; t<NTRANSPORTS; t++) {
if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
+2
Просмотреть файл
@@ -199,6 +199,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
if (ncclNIbDevs == 0) {
INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
} else {
auto cmpIbDevs = [](const void* n1, const void* n2) { return strcmp(((struct ncclIbDev*)n1)->devName, ((struct ncclIbDev*)n2)->devName); };
qsort(ncclIbDevs, ncclNIbDevs, sizeof(struct ncclIbDev), cmpIbDevs);
char line[1024];
line[0] = '\0';
for (int d=0; d<ncclNIbDevs; d++) {