Add more info to RCCL logging for topo-aware optim.

[ROCm/rccl commit: bd6dbca8fb]
Этот коммит содержится в:
Lu
2021-07-16 17:20:00 -07:00
коммит произвёл hubertlu-tw
родитель f773636575
Коммит 2ab26a2ff9
5 изменённых файлов: 29 добавлений и 25 удалений
+11 -10
Просмотреть файл
@@ -247,9 +247,9 @@ void *ncclCommThreadMain(void *arg) {
if (fIdx > FUNC_INDEX_P2P)
sprintf(line+offset, "ERROR bad function index %d", fIdx);
else if (fIdx == FUNC_INDEX_P2P)
sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks);
else
sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels);
sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
break;
case ncclCollTraceCollEndType:
if (fIdx != 0xffff) {
@@ -258,12 +258,12 @@ void *ncclCommThreadMain(void *arg) {
if (fIdx > FUNC_INDEX_P2P)
sprintf(line+offset, "ERROR bad function index %d", fIdx);
else if (fIdx == FUNC_INDEX_P2P)
sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks);
else
sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels);
sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
}
else
sprintf(line+offset, " KE");
sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks);
break;
case ncclCollTraceAbortType:
sprintf(line+offset, " Abort");
@@ -983,10 +983,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
struct ncclTree* tree = &comm->channels[c].tree;
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev,
comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
// Set Affinity to a CPU local the our GPU, so that all memory we allocate
// on the host is local.
@@ -1016,7 +1017,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
}
free(rings);
INFO(NCCL_INIT, "Connected all rings");
INFO(NCCL_INIT, "Connected all rings comm %p nRanks %02d busId %lx", comm, comm->nRanks, comm->busId);
// Connect Trees
for (int c=0; c<comm->nChannels; c++) {
@@ -1026,7 +1027,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all trees");
INFO(NCCL_INIT, "Connected all trees comm %p nRanks %02d busId %lx", comm, comm->nRanks, comm->busId);
// Check if we can setup CollNet
if (comm->collNetSupport > 0) {
@@ -1068,7 +1069,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
INFO(NCCL_INIT, "rank %d Connected CollNet comm %p nRanks %02d", rank, comm, comm->nRanks);
collnet_cleanup:
free(heads);
+4 -4
Просмотреть файл
@@ -108,8 +108,8 @@ ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
}
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
return ncclSuccess;
}
@@ -137,8 +137,8 @@ ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
}
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle));
+6 -4
Просмотреть файл
@@ -130,8 +130,9 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
send->conn.curr_hdp_reg = resources->curr_hdp_reg;
}
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank,
peerInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "",
resources->shared ? "/Shared" : "", comm, comm->nRanks);
return ncclSuccess;
}
@@ -210,8 +211,9 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
}
}
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, peerInfo->rank,
peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "",
resources->shared ? "/Shared" : "", comm, comm->nRanks);
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
+6 -6
Просмотреть файл
@@ -198,19 +198,19 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
info.rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
if (info.read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
} else {
CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s comm %p nRanks %02d",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
}
} else {
NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
info.rank = intermediateRank;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
comm->peerInfo[intermediateRank].busId, useReadStr);
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
}
resources->memRank = info.rank;
+2 -1
Просмотреть файл
@@ -74,7 +74,8 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d", channelId, myInfo->rank,
myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks);
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;