Merge remote-tracking branch 'ncclRepo/master' into develop

Esse commit está contido em:
Aristotelis
2022-06-02 15:27:24 +00:00
12 arquivos alterados com 97 adições e 44 exclusões
+1 -1
Ver Arquivo
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 12
NCCL_PATCH := 10
NCCL_PATCH := 12
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
Ver Arquivo
@@ -166,8 +166,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
pthread_t thread;
pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
pthread_detach(thread); // will not be pthread_join()'d
ncclSetThreadName(thread, "NCCL BootstrapR");
pthread_detach(thread); // will not be pthread_join()'d
return ncclSuccess;
}
+9 -10
Ver Arquivo
@@ -13,7 +13,7 @@
#include <hip/hip_ext.h>
#include "gdrwrap.h"
#include "bootstrap.h"
#include <cstring>
#include "channel.h"
#include <cstring> // std::memcpy
@@ -908,16 +908,14 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
int peer = info->root;
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
int peerIndex = comm->rankToLocalRank[peer];
int nsteps = comm->maxLocalRanks;
int rankIndex = comm->rankToLocalRank[comm->rank];
int channelBaseId;
NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId));
if (info->coll == ncclFuncSend) {
if (peer != comm->rank) {
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
// Mark channels that need pre-connect
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
int channelId;
NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer+comm->nRanks*1] |= (1<<channelId);
comm->connect[1] = 1;
@@ -932,9 +930,10 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
comm->p2pSendCount++;
} else {
if (peer != comm->rank) {
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
// Mark channels that need pre-connect
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
int channelId;
NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer+comm->nRanks*1] |= (1<<channelId);
comm->connect[1] = 1;
+16 -10
Ver Arquivo
@@ -227,6 +227,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
}
}
// Old style numbering
// levelsOldToNew to is an array with each index corresponding to the
// "old level" int, and each value mapping to the correct value defined in topo.h
// maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
if (l == -1 && str[0] >= '0' && str[0] <= '9') {
int oldLevel = strtol(str, NULL, 0);
const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
@@ -541,24 +544,27 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
int pxnGpu = -1;
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
// To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one.
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
// Make sure we can allocate memory on that GPU.
if (netDev != netNode->id) continue;
// PXN = PCI + NVLink.
if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
pxnGpu = p;
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
// To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
if (netDev == netNode->id) break;
}
if (pxnGpu != -1) {
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n));
break;
NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
}
}
// Update path when we dont want to / can't use GPU Direct RDMA.
+1 -1
Ver Arquivo
@@ -382,7 +382,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
struct kvDict kvDictPciGen[] = {
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, /* Kernel 5.6 and earlier */
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
{ NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
+15
Ver Arquivo
@@ -52,13 +52,28 @@ extern const char* topoNodeTypeStr[];
#define LINK_NET 8
extern const char* topoLinkTypeStr[];
// Local (myself)
#define PATH_LOC 0
// Connection traversing NVLink
#define PATH_NVL 1
// Connection through NVLink using an intermediate GPU
#define PATH_NVB 2
// Connection traversing at most a single PCIe bridge
#define PATH_PIX 3
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
#define PATH_PXB 4
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
#define PATH_PXN 5
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
#define PATH_PHB 6
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
#define PATH_SYS 7
#define PATH_DIS 7
extern const char* topoPathTypeStr[];
+10 -6
Ver Arquivo
@@ -9,7 +9,7 @@
#include "debug.h"
#include "enqueue.h"
#include "transport.h"
#include <unistd.h>
#include "channel.h"
#define MAX_ASYNC_OPS 128
thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
@@ -105,20 +105,25 @@ ncclResult_t ncclGroupStart() {
return ncclSuccess;
}
static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
struct ncclInfo info = { ncclFuncSend, "Send",
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
1, 1 };
int channelId;
NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
info.channelId = channelId;
info.opCount = opCount;
info.connIndex = connIndex;
NCCLCHECK(ncclSetupP2pKernel(&info));
return ncclSuccess;
}
static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
1, 1 };
int channelId;
NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
info.channelId = channelId;
info.opCount = opCount;
info.connIndex = connIndex;
@@ -296,7 +301,6 @@ sched_delta:
int sendRemaining = 1, recvRemaining = 1;
int chunk = 0;
do {
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
ssize_t recvbytes = totRecvBytes-recvOffset;
ssize_t sendbytes = totSendBytes-sendOffset;
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
@@ -306,10 +310,10 @@ sched_delta:
if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
if (recv) {
NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup);
NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, chunk, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup);
}
if (send) {
NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup);
NCCLCHECKGOTO(scheduleSend(comm, sendPeer, chunk, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup);
}
recvOffset += recvChunkSize;
sendOffset += sendChunkSize;
+31
Ver Arquivo
@@ -10,5 +10,36 @@
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
int peerIndex = comm->rankToLocalRank[peer];
int nsteps = comm->maxLocalRanks;
int rankIndex = comm->rankToLocalRank[comm->rank];
int step, delta;
if (coll == ncclFuncSend) {
step = (nsteps + peerIndex - rankIndex)%nsteps;
delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
} else if (coll == ncclFuncRecv) {
step = (nsteps + rankIndex - peerIndex)%nsteps;
delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
} else {
return ncclInternalError;
}
*channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
return ncclSuccess;
}
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
return ncclSuccess;
}
static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
int base;
NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
return ncclSuccess;
}
#endif
+7 -8
Ver Arquivo
@@ -1140,18 +1140,17 @@ collnet_cleanup:
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
for (int r=0; r<nvbNpeers; r++) {
int peer = nvbPeers[r];
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
int channelId;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
comm->connectSend[peer] |= (1<<channelId);
}
}
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
comm->connectRecv[peer] |= (1<<channelId);
}
}
}
+2 -4
Ver Arquivo
@@ -962,12 +962,10 @@ void* ncclProxyService(void* _args) {
struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
peers[s].sock.fd = pollfds[s].fd = -1;
peers[s].sock.abortFlag = NULL;
peers[s].sock.asyncFlag = 0;
pollfds[s].events = POLLHUP|POLLIN;
peers[s].asyncOps.type = 0;
}
pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
@@ -1068,13 +1066,13 @@ void* ncclProxyService(void* _args) {
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
comm->proxyState.listenSock = sock;
comm->proxyState.peerAddresses = peerAddresses;
ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
return ncclSuccess;
}
ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
// comm->proxyState.thread is pthread_join()'d by commFree() in init.cc
pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm);
ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
return ncclSuccess;
}
+1 -1
Ver Arquivo
@@ -224,8 +224,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
ncclNIbDevs++;
nPorts++;
// [RCCL]
+3 -2
Ver Arquivo
@@ -140,6 +140,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Setting this to non zero causes P2P to use Reads rather than Writes
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);
static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
int p2p;
@@ -209,7 +210,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else {
@@ -254,7 +255,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
}