Merge remote-tracking branch 'ncclRepo/master' into develop
Esse commit está contido em:
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 12
|
||||
NCCL_PATCH := 10
|
||||
NCCL_PATCH := 12
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+1
-1
@@ -166,8 +166,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
|
||||
memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
|
||||
pthread_t thread;
|
||||
pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
|
||||
pthread_detach(thread); // will not be pthread_join()'d
|
||||
ncclSetThreadName(thread, "NCCL BootstrapR");
|
||||
pthread_detach(thread); // will not be pthread_join()'d
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+9
-10
@@ -13,7 +13,7 @@
|
||||
#include <hip/hip_ext.h>
|
||||
#include "gdrwrap.h"
|
||||
#include "bootstrap.h"
|
||||
#include <cstring>
|
||||
#include "channel.h"
|
||||
|
||||
#include <cstring> // std::memcpy
|
||||
|
||||
@@ -908,16 +908,14 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
struct ncclComm* comm = info->comm;
|
||||
int peer = info->root;
|
||||
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
|
||||
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
|
||||
int peerNode = comm->rankToNode[peer];
|
||||
int peerIndex = comm->rankToLocalRank[peer];
|
||||
int nsteps = comm->maxLocalRanks;
|
||||
int rankIndex = comm->rankToLocalRank[comm->rank];
|
||||
int channelBaseId;
|
||||
NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId));
|
||||
if (info->coll == ncclFuncSend) {
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
// Mark channels that need pre-connect
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
int channelId;
|
||||
NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
|
||||
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectSend[peer+comm->nRanks*1] |= (1<<channelId);
|
||||
comm->connect[1] = 1;
|
||||
@@ -932,9 +930,10 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
comm->p2pSendCount++;
|
||||
} else {
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
// Mark channels that need pre-connect
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
int channelId;
|
||||
NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
|
||||
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectRecv[peer+comm->nRanks*1] |= (1<<channelId);
|
||||
comm->connect[1] = 1;
|
||||
|
||||
+16
-10
@@ -227,6 +227,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
}
|
||||
}
|
||||
// Old style numbering
|
||||
// levelsOldToNew to is an array with each index corresponding to the
|
||||
// "old level" int, and each value mapping to the correct value defined in topo.h
|
||||
// maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
|
||||
if (l == -1 && str[0] >= '0' && str[0] <= '9') {
|
||||
int oldLevel = strtol(str, NULL, 0);
|
||||
const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
|
||||
@@ -541,24 +544,27 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
|
||||
int pxnGpu = -1;
|
||||
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
if (p == g) continue;
|
||||
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
|
||||
|
||||
// To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one.
|
||||
int netDev;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
|
||||
// Make sure we can allocate memory on that GPU.
|
||||
if (netDev != netNode->id) continue;
|
||||
|
||||
// PXN = PCI + NVLink.
|
||||
if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
|
||||
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
|
||||
if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
|
||||
|
||||
pxnGpu = p;
|
||||
|
||||
int netDev;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
|
||||
// To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
|
||||
if (netDev == netNode->id) break;
|
||||
}
|
||||
if (pxnGpu != -1) {
|
||||
// We can use that GPU as relay to communicate with that NIC.
|
||||
// Only enabling it in the GPU->NIC direction for now to favor
|
||||
// receiving locally and sending remotely (consistent with net.cc)
|
||||
NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n));
|
||||
break;
|
||||
NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
|
||||
}
|
||||
}
|
||||
// Update path when we dont want to / can't use GPU Direct RDMA.
|
||||
|
||||
+1
-1
@@ -382,7 +382,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
|
||||
|
||||
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
|
||||
struct kvDict kvDictPciGen[] = {
|
||||
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, /* Kernel 5.6 and earlier */
|
||||
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
|
||||
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
|
||||
{ NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
|
||||
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
|
||||
|
||||
@@ -52,13 +52,28 @@ extern const char* topoNodeTypeStr[];
|
||||
#define LINK_NET 8
|
||||
extern const char* topoLinkTypeStr[];
|
||||
|
||||
// Local (myself)
|
||||
#define PATH_LOC 0
|
||||
|
||||
// Connection traversing NVLink
|
||||
#define PATH_NVL 1
|
||||
|
||||
// Connection through NVLink using an intermediate GPU
|
||||
#define PATH_NVB 2
|
||||
|
||||
// Connection traversing at most a single PCIe bridge
|
||||
#define PATH_PIX 3
|
||||
|
||||
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
|
||||
#define PATH_PXB 4
|
||||
|
||||
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
|
||||
#define PATH_PXN 5
|
||||
|
||||
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
|
||||
#define PATH_PHB 6
|
||||
|
||||
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
|
||||
#define PATH_SYS 7
|
||||
#define PATH_DIS 7
|
||||
extern const char* topoPathTypeStr[];
|
||||
|
||||
+10
-6
@@ -9,7 +9,7 @@
|
||||
#include "debug.h"
|
||||
#include "enqueue.h"
|
||||
#include "transport.h"
|
||||
#include <unistd.h>
|
||||
#include "channel.h"
|
||||
|
||||
#define MAX_ASYNC_OPS 128
|
||||
thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
|
||||
@@ -105,20 +105,25 @@ ncclResult_t ncclGroupStart() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
|
||||
static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
|
||||
struct ncclInfo info = { ncclFuncSend, "Send",
|
||||
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
int channelId;
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
|
||||
info.channelId = channelId;
|
||||
info.opCount = opCount;
|
||||
info.connIndex = connIndex;
|
||||
NCCLCHECK(ncclSetupP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
|
||||
|
||||
static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
|
||||
struct ncclInfo info = { ncclFuncRecv, "Recv",
|
||||
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
int channelId;
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
|
||||
info.channelId = channelId;
|
||||
info.opCount = opCount;
|
||||
info.connIndex = connIndex;
|
||||
@@ -296,7 +301,6 @@ sched_delta:
|
||||
int sendRemaining = 1, recvRemaining = 1;
|
||||
int chunk = 0;
|
||||
do {
|
||||
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
|
||||
ssize_t recvbytes = totRecvBytes-recvOffset;
|
||||
ssize_t sendbytes = totSendBytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
|
||||
@@ -306,10 +310,10 @@ sched_delta:
|
||||
if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
|
||||
if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
|
||||
if (recv) {
|
||||
NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup);
|
||||
NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, chunk, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup);
|
||||
}
|
||||
if (send) {
|
||||
NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup);
|
||||
NCCLCHECKGOTO(scheduleSend(comm, sendPeer, chunk, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup);
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
|
||||
@@ -10,5 +10,36 @@
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
|
||||
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
|
||||
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
|
||||
int peerNode = comm->rankToNode[peer];
|
||||
int peerIndex = comm->rankToLocalRank[peer];
|
||||
int nsteps = comm->maxLocalRanks;
|
||||
int rankIndex = comm->rankToLocalRank[comm->rank];
|
||||
int step, delta;
|
||||
if (coll == ncclFuncSend) {
|
||||
step = (nsteps + peerIndex - rankIndex)%nsteps;
|
||||
delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
|
||||
} else if (coll == ncclFuncRecv) {
|
||||
step = (nsteps + rankIndex - peerIndex)%nsteps;
|
||||
delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
|
||||
} else {
|
||||
return ncclInternalError;
|
||||
}
|
||||
*channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
|
||||
*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
|
||||
int base;
|
||||
NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
|
||||
NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
+7
-8
@@ -1140,18 +1140,17 @@ collnet_cleanup:
|
||||
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
|
||||
for (int r=0; r<nvbNpeers; r++) {
|
||||
int peer = nvbPeers[r];
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
int channelId;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
|
||||
if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
}
|
||||
}
|
||||
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
|
||||
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+2
-4
@@ -962,12 +962,10 @@ void* ncclProxyService(void* _args) {
|
||||
|
||||
struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
|
||||
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
|
||||
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
|
||||
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
|
||||
peers[s].sock.fd = pollfds[s].fd = -1;
|
||||
peers[s].sock.abortFlag = NULL;
|
||||
peers[s].sock.asyncFlag = 0;
|
||||
pollfds[s].events = POLLHUP|POLLIN;
|
||||
peers[s].asyncOps.type = 0;
|
||||
}
|
||||
pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
|
||||
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
|
||||
@@ -1068,13 +1066,13 @@ void* ncclProxyService(void* _args) {
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
|
||||
comm->proxyState.listenSock = sock;
|
||||
comm->proxyState.peerAddresses = peerAddresses;
|
||||
ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
|
||||
// comm->proxyState.thread is pthread_join()'d by commFree() in init.cc
|
||||
pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm);
|
||||
ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -224,8 +224,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
|
||||
|
||||
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
|
||||
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
|
||||
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
|
||||
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
|
||||
ncclNIbDevs++;
|
||||
nPorts++;
|
||||
// [RCCL]
|
||||
|
||||
@@ -140,6 +140,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
|
||||
// Setting this to non zero causes P2P to use Reads rather than Writes
|
||||
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
|
||||
NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);
|
||||
|
||||
static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
|
||||
int p2p;
|
||||
@@ -209,7 +210,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
} else {
|
||||
@@ -254,7 +255,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
} else {
|
||||
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||
}
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário