Add more info to RCCL logging for topology-aware optimization.
[ROCm/rccl commit: bd6dbca8fb]
Этот коммит содержится в:
@@ -247,9 +247,9 @@ void *ncclCommThreadMain(void *arg) {
|
||||
if (fIdx > FUNC_INDEX_P2P)
|
||||
sprintf(line+offset, "ERROR bad function index %d", fIdx);
|
||||
else if (fIdx == FUNC_INDEX_P2P)
|
||||
sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
|
||||
sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks);
|
||||
else
|
||||
sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels);
|
||||
sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
|
||||
break;
|
||||
case ncclCollTraceCollEndType:
|
||||
if (fIdx != 0xffff) {
|
||||
@@ -258,12 +258,12 @@ void *ncclCommThreadMain(void *arg) {
|
||||
if (fIdx > FUNC_INDEX_P2P)
|
||||
sprintf(line+offset, "ERROR bad function index %d", fIdx);
|
||||
else if (fIdx == FUNC_INDEX_P2P)
|
||||
sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
|
||||
sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks);
|
||||
else
|
||||
sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels);
|
||||
sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
|
||||
}
|
||||
else
|
||||
sprintf(line+offset, " KE");
|
||||
sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks);
|
||||
break;
|
||||
case ncclCollTraceAbortType:
|
||||
sprintf(line+offset, " Abort");
|
||||
@@ -983,10 +983,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
struct ncclTree* tree = &comm->channels[c].tree;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
|
||||
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
|
||||
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
|
||||
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev,
|
||||
comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId);
|
||||
}
|
||||
line[1023] = '\0';
|
||||
INFO(NCCL_INIT, "Trees%s", line);
|
||||
INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
|
||||
|
||||
// Set Affinity to a CPU local to our GPU, so that all memory we allocate
|
||||
// on the host is local.
|
||||
@@ -1016,7 +1017,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
|
||||
}
|
||||
free(rings);
|
||||
INFO(NCCL_INIT, "Connected all rings");
|
||||
INFO(NCCL_INIT, "Connected all rings comm %p nRanks %02d busId %lx", comm, comm->nRanks, comm->busId);
|
||||
|
||||
// Connect Trees
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
@@ -1026,7 +1027,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
|
||||
INFO(NCCL_INIT, "Connected all trees");
|
||||
INFO(NCCL_INIT, "Connected all trees comm %p nRanks %02d busId %lx", comm, comm->nRanks, comm->busId);
|
||||
|
||||
// Check if we can setup CollNet
|
||||
if (comm->collNetSupport > 0) {
|
||||
@@ -1068,7 +1069,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
|
||||
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
|
||||
INFO(NCCL_INIT, "rank %d Connected CollNet comm %p nRanks %02d", rank, comm, comm->nRanks);
|
||||
|
||||
collnet_cleanup:
|
||||
free(heads);
|
||||
|
||||
@@ -108,8 +108,8 @@ ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -137,8 +137,8 @@ ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
|
||||
|
||||
NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle));
|
||||
|
||||
@@ -130,8 +130,9 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
|
||||
send->conn.curr_hdp_reg = resources->curr_hdp_reg;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank,
|
||||
peerInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "",
|
||||
resources->shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -210,8 +211,9 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
}
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, peerInfo->rank,
|
||||
peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "",
|
||||
resources->shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
|
||||
|
||||
|
||||
@@ -198,19 +198,19 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
info.rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (info.read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
} else {
|
||||
CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr));
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
|
||||
info.rank = intermediateRank;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
|
||||
comm->peerInfo[intermediateRank].busId, useReadStr);
|
||||
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
|
||||
}
|
||||
resources->memRank = info.rank;
|
||||
|
||||
|
||||
@@ -74,7 +74,8 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
||||
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d", channelId, myInfo->rank,
|
||||
myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks);
|
||||
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
|
||||
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
|
||||
return ncclSuccess;
|
||||
|
||||
Ссылка в новой задаче
Block a user