diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 07ce41424f..6da023d14d 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -247,9 +247,9 @@ void *ncclCommThreadMain(void *arg) { if (fIdx > FUNC_INDEX_P2P) sprintf(line+offset, "ERROR bad function index %d", fIdx); else if (fIdx == FUNC_INDEX_P2P) - sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta); + sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks); else - sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels); + sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks); break; case ncclCollTraceCollEndType: if (fIdx != 0xffff) { @@ -258,12 +258,12 @@ void *ncclCommThreadMain(void *arg) { if (fIdx > FUNC_INDEX_P2P) sprintf(line+offset, "ERROR bad function index %d", fIdx); else if (fIdx == FUNC_INDEX_P2P) - sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta); + sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks); else - sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels); + sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks); } else - sprintf(line+offset, " KE"); + sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks); break; case ncclCollTraceAbortType: sprintf(line+offset, " Abort"); @@ -983,10 +983,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclTree* tree = &comm->channels[c].tree; snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d", c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up); - INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next); + INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev, + comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId); } line[1023] = '\0'; - INFO(NCCL_INIT, "Trees%s", line); + INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId); // Set Affinity to a CPU local the our GPU, so that all memory we allocate // on the host is local. @@ -1016,7 +1017,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore); } free(rings); - INFO(NCCL_INIT, "Connected all rings"); + INFO(NCCL_INIT, "Connected all rings comm %p nRanks %02d busId %lx", comm, comm->nRanks, comm->busId); // Connect Trees for (int c=0; cnChannels; c++) { @@ -1026,7 +1027,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore); - INFO(NCCL_INIT, "Connected all trees"); + INFO(NCCL_INIT, "Connected all trees comm %p nRanks %02d busId %lx", comm, comm->nRanks, comm->busId); // Check if we can setup CollNet if (comm->collNetSupport > 0) { @@ -1068,7 +1069,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup); - INFO(NCCL_INIT, "rank %d Connected CollNet", rank); + INFO(NCCL_INIT, "rank %d Connected CollNet comm %p nRanks %02d", rank, comm, comm->nRanks); collnet_cleanup: free(heads); diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc index db6650321e..34e6841ae8 100644 --- a/projects/rccl/src/transport/coll_net.cc +++ b/projects/rccl/src/transport/coll_net.cc @@ -108,8 +108,8 @@ ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph } NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev, + resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks); return ncclSuccess; } @@ -137,8 +137,8 @@ ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph } NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev, + resources->useGdr ? "/GDRDMA" : "", comm, comm->nRanks); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle)); diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc index 33cee51f53..701e091945 100644 --- a/projects/rccl/src/transport/net.cc +++ b/projects/rccl/src/transport/net.cc @@ -130,8 +130,9 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); send->conn.curr_hdp_reg = resources->curr_hdp_reg; } - INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, + peerInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "", + resources->shared ? "/Shared" : "", comm, comm->nRanks); return ncclSuccess; } @@ -210,8 +211,9 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st } } - INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, peerInfo->rank, + peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "", + resources->shared ? "/Shared" : "", comm, comm->nRanks); struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc index da594a7897..d2f98ab0b1 100644 --- a/projects/rccl/src/transport/p2p.cc +++ b/projects/rccl/src/transport/p2p.cc @@ -198,19 +198,19 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info.rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { if (info.read == 0) send->conn.direct |= NCCL_DIRECT_GPU; - INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); + INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks); } else { CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr)); - INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s", - channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); + INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s comm %p nRanks %02d", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks); } } else { NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr)); info.rank = intermediateRank; - INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s", + INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, - comm->peerInfo[intermediateRank].busId, useReadStr); + comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks); } resources->memRank = info.rank; diff --git a/projects/rccl/src/transport/shm.cc b/projects/rccl/src/transport/shm.cc index 98e25a9dd2..1faa6c5341 100644 --- a/projects/rccl/src/transport/shm.cc +++ b/projects/rccl/src/transport/shm.cc @@ -74,7 +74,8 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d", channelId, myInfo->rank, + myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks); static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess;