Merge remote-tracking branch 'ncclRepo/master' into develop

2022-06-02 15:27:24 +00:00
commit e0864e7093
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 12
-NCCL_PATCH   := 10
+NCCL_PATCH   := 12
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -166,8 +166,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
  memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
  pthread_t thread;
  pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
-  pthread_detach(thread); // will not be pthread_join()'d
  ncclSetThreadName(thread, "NCCL BootstrapR");
+  pthread_detach(thread); // will not be pthread_join()'d
  return ncclSuccess;
 }

@@ -13,7 +13,7 @@
 #include <hip/hip_ext.h>
 #include "gdrwrap.h"
 #include "bootstrap.h"
-#include <cstring>
+#include "channel.h"

 #include <cstring> // std::memcpy

@@ -908,16 +908,14 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
  struct ncclComm* comm = info->comm;
  int peer = info->root;
  ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
-  int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
-  int peerNode = comm->rankToNode[peer];
-  int peerIndex = comm->rankToLocalRank[peer];
-  int nsteps = comm->maxLocalRanks;
-  int rankIndex = comm->rankToLocalRank[comm->rank];
+  int channelBaseId;
+  NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId));
  if (info->coll == ncclFuncSend) {
    if (peer != comm->rank) {
-      int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
+      // Mark channels that need pre-connect
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        int channelId;
+        NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
        if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
          comm->connectSend[peer+comm->nRanks*1] |= (1<<channelId);
          comm->connect[1] = 1;
@@ -932,9 +930,10 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
    comm->p2pSendCount++;
  } else {
    if (peer != comm->rank) {
-      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+      // Mark channels that need pre-connect
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+        int channelId;
+        NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
          comm->connectRecv[peer+comm->nRanks*1] |= (1<<channelId);
          comm->connect[1] = 1;
@@ -227,6 +227,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
          }
        }
        // Old style numbering
+        // levelsOldToNew to is an array with each index corresponding to the
+        // "old level" int, and each value mapping to the correct value defined in topo.h
+        // maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
        if (l == -1 && str[0] >= '0' && str[0] <= '9') {
          int oldLevel = strtol(str, NULL, 0);
          const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
@@ -541,24 +544,27 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
+        int pxnGpu = -1;
+
        for (int p=0; p<system->nodes[GPU].count; p++) {
          if (p == g) continue;
-          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
-
-          // To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one.
-          int netDev;
-          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
-          // Make sure we can allocate memory on that GPU.
-          if (netDev != netNode->id) continue;

          // PXN = PCI + NVLink.
-          if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
+          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
+          if (peerNode->paths[NET][n].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;

+          pxnGpu = p;
+
+          int netDev;
+          NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
+          // To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
+          if (netDev == netNode->id) break;
+        }
+        if (pxnGpu != -1) {
          // We can use that GPU as relay to communicate with that NIC.
          // Only enabling it in the GPU->NIC direction for now to favor
          // receiving locally and sending remotely (consistent with net.cc)
-          NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n));
-          break;
+          NCCLCHECK(addInterStep(system, GPU, pxnGpu, GPU, g, NET, n));
        }
      }
      // Update path when we dont want to / can't use GPU Direct RDMA.
@@ -382,7 +382,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s

 struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
 struct kvDict kvDictPciGen[] = {
-  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, /* Kernel 5.6 and earlier */
+  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
  { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
  { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
 ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
@@ -52,13 +52,28 @@ extern const char* topoNodeTypeStr[];
 #define LINK_NET 8
 extern const char* topoLinkTypeStr[];

+// Local (myself)
 #define PATH_LOC 0
+
+// Connection traversing NVLink
 #define PATH_NVL 1
+
+// Connection through NVLink using an intermediate GPU
 #define PATH_NVB 2
+
+// Connection traversing at most a single PCIe bridge
 #define PATH_PIX 3
+
+// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
 #define PATH_PXB 4
+
+// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
 #define PATH_PXN 5
+
+// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
 #define PATH_PHB 6
+
+// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
 #define PATH_SYS 7
 #define PATH_DIS 7
 extern const char* topoPathTypeStr[];
@@ -9,7 +9,7 @@
 #include "debug.h"
 #include "enqueue.h"
 #include "transport.h"
-#include <unistd.h>
+#include "channel.h"

 #define MAX_ASYNC_OPS 128
 thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
@@ -105,20 +105,25 @@ ncclResult_t ncclGroupStart() {
  return ncclSuccess;
 }

-static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
+static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
  struct ncclInfo info = { ncclFuncSend, "Send",
    NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
    1, 1 };
+  int channelId;
+  NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
  info.channelId = channelId;
  info.opCount = opCount;
  info.connIndex = connIndex;
  NCCLCHECK(ncclSetupP2pKernel(&info));
  return ncclSuccess;
 }
-static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
+
+static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int chunk, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
  struct ncclInfo info = { ncclFuncRecv, "Recv",
    NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
    1, 1 };
+  int channelId;
+  NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
  info.channelId = channelId;
  info.opCount = opCount;
  info.connIndex = connIndex;
@@ -296,7 +301,6 @@ sched_delta:
            int sendRemaining = 1, recvRemaining = 1;
            int chunk = 0;
            do {
-              int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
              ssize_t recvbytes = totRecvBytes-recvOffset;
              ssize_t sendbytes = totSendBytes-sendOffset;
              if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
@@ -306,10 +310,10 @@ sched_delta:
                if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
                if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
              if (recv) {
-                NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup);
+                NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, chunk, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup);
              }
              if (send) {
-                NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup);
+                NCCLCHECKGOTO(scheduleSend(comm, sendPeer, chunk, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup);
              }
              recvOffset += recvChunkSize;
              sendOffset += sendChunkSize;
@@ -10,5 +10,36 @@

 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
+  int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
+  int peerNode = comm->rankToNode[peer];
+  int peerIndex = comm->rankToLocalRank[peer];
+  int nsteps = comm->maxLocalRanks;
+  int rankIndex = comm->rankToLocalRank[comm->rank];
+  int step, delta;
+  if (coll == ncclFuncSend) {
+    step = (nsteps + peerIndex - rankIndex)%nsteps;
+    delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
+  } else if (coll == ncclFuncRecv) {
+    step = (nsteps + rankIndex - peerIndex)%nsteps;
+    delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
+  } else {
+    return ncclInternalError;
+  }
+  *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
+  *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
+  int base;
+  NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
+  NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
+  return ncclSuccess;
+}

 #endif
@@ -1140,18 +1140,17 @@ collnet_cleanup:
    NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
    for (int r=0; r<nvbNpeers; r++) {
      int peer = nvbPeers[r];
-      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+      int channelId;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
-          comm->connectRecv[peer] |= (1<<channelId);
+        NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
+        if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
+          comm->connectSend[peer] |= (1<<channelId);
        }
      }
-      delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
-          comm->connectSend[peer] |= (1<<channelId);
+        NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
+        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
+          comm->connectRecv[peer] |= (1<<channelId);
        }
      }
    }
@@ -962,12 +962,10 @@ void* ncclProxyService(void* _args) {

  struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
  struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
+  memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
  for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
    peers[s].sock.fd = pollfds[s].fd = -1;
-    peers[s].sock.abortFlag = NULL;
-    peers[s].sock.asyncFlag = 0;
    pollfds[s].events = POLLHUP|POLLIN;
-    peers[s].asyncOps.type = 0;
  }
  pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
  pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
@@ -1068,13 +1066,13 @@ void* ncclProxyService(void* _args) {
 ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
  comm->proxyState.listenSock = sock;
  comm->proxyState.peerAddresses = peerAddresses;
-  ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
  return ncclSuccess;
 }

 ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
  // comm->proxyState.thread is pthread_join()'d by commFree() in init.cc
  pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm);
+  ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
  return ncclSuccess;
 }

@@ -224,8 +224,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
          ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;

          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
-          pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
          ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
+          pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
          ncclNIbDevs++;
          nPorts++;
          // [RCCL]
@@ -140,6 +140,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop

 // Setting this to non zero causes P2P to use Reads rather than Writes
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
+NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);

 static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
  int p2p;
@@ -209,7 +210,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  if (intermediateRank == -1) {
    info->rank = myInfo->rank;
    if (myInfo->pidHash == peerInfo->pidHash) {
-      send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+      if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
      INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
    } else {
@@ -254,7 +255,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  if (intermediateRank == -1) {
    info->rank = myInfo->rank;
    if (myInfo->pidHash == peerInfo->pidHash) {
-      recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+      if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
    } else {
      recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
    }