Enhancement of RCCL logging information for topology-aware utilities
[ROCm/rccl commit: a1842df858]
Este cometimento está contido em:
@@ -157,8 +157,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -175,8 +175,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -198,11 +198,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
|
||||
if (proxyRank == myInfo->rank) {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
}
|
||||
*((int*)connectInfo) = proxyRank;
|
||||
return ncclSuccess;
|
||||
@@ -236,8 +238,9 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
req.remoteRank = peerInfo->rank;
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -216,18 +216,18 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
} else {
|
||||
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
}
|
||||
} else {
|
||||
info->rank = intermediateRank;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
|
||||
comm->peerInfo[intermediateRank].busId, useReadStr);
|
||||
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
|
||||
|
||||
@@ -70,7 +70,8 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador