Merge remote-tracking branch 'nccl/master' into no-target-id

2020-12-01 11:33:47 -05:00
parent 2e8b3a0857 920dbe5b35
commit d469947641
106 changed files with 11943 additions and 4104 deletions
@@ -23,8 +23,8 @@
 #endif

 #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
-extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
+extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];

 #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
 #define NCCL_ALGO_TREE 0
@@ -59,6 +59,7 @@ union ncclLLFifoLine {
 #define WARP_SIZE 64
 #define MAXCHANNELS 32
 #define NCCL_MAX_NTHREADS 256
+#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
 #define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
 #define NCCL_LL_LINES_PER_THREAD 8
 #ifdef TEST_LL_CLEANUP
@@ -72,7 +73,7 @@ union ncclLLFifoLine {
 // Make sure the clean mask will last for at least NCCL_NSTEPS
 static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");

-#define NCCL_LL128_LINESIZE 64
+#define NCCL_LL128_LINESIZE 128
 #define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
 #define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)

@@ -83,15 +84,12 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 // to 3 dests. Use 70% for reduce and 30% for bcast.
 #define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)

-#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
+#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)

 #define NCCL_DIRECT_GPU 0x01
 #define NCCL_DIRECT_NIC 0x10

-#define MAXBARRIERS 2
-#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
-
 struct ncclConnInfo {
  // Regular comm mechanism
  char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
@@ -99,9 +97,11 @@ struct ncclConnInfo {
  uint64_t *head;     // Local for send, remote for recv

  int direct;         // Direct communication
+  int shared;         // Buffers are shared
  void **ptrExchange; // Pointer exchange for direct communication

-  int *fifo;          // Size fifo for proxy
+  int *sizesFifo;     // Sizes fifo from GPU to proxy
+  void* *ptrsFifo;      // Buffer fifo from proxy to GPU

  uint64_t step;      // Keep where we are
  uint64_t llLastCleaning;
@@ -110,7 +110,6 @@ struct ncclConnInfo {
  // allows software to explicitly initiate a flush read to HDP memory. See more
  // descriptions in primitives.h.
  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
-  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 struct ncclConnector {
@@ -151,68 +150,53 @@ struct ncclDevComm;

 #pragma pack(push)  /* push current alignment to stack */
 #pragma pack(4)     /* set alignment to 4 bytes boundary */
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclDevComm* comm;
-  uint64_t opCount;
+#define NCCL_MAX_WORK_ELEMENTS 2
+#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
+
+/* ncclWork is to be a power of two, currently 8x64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclWorkElem. */
+struct ncclWorkElem {
+  // Header
+  struct ncclDevComm* comm;
+  uint16_t nThreads;
+  uint16_t funcIndex;
+  uint16_t index;
+  uint16_t active;

-  // local and remote input, output, and buffer
  const void * sendbuff;
  void * recvbuff;

-  // Op-specific fields. Make sure the common part stays the
-  // same on all structs of the union
+  uint64_t opCount;
+  // Op-specific fields.
  union {
    struct {
-      uint16_t nThreads;
-    } common;
-    struct {
-      uint16_t nThreads;
-      uint8_t bid;
-      uint8_t nChannels;
-      uint32_t root;
      size_t count;
      size_t lastChunkSize;
-    } coll;
-    struct {
-      uint16_t nThreads;
-      uint16_t unused;
-      int32_t delta;
-      size_t sendCount;
-      size_t recvCount;
-    } p2p;
-    struct {
-      uint16_t nThreads;
+      uint32_t root;
      uint8_t bid;
      uint8_t nChannels;
-      size_t count;
-      size_t* extra;
-    } a2av;
-  };
-};
-struct ncclColl {
-  union {
+    } coll;
    struct {
-      struct CollectiveArgs args;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t  active;
-    };
-    int data[0x10];
+      size_t sendCount;
+      size_t recvCount;
+      int32_t delta;
+      uint16_t nThreads;
+    } p2p;
+    uint64_t align[3];
  };
 };
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+struct ncclWork {
+  struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
+};
+static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");

 struct ncclChannel {
  union {
    struct {
      struct ncclRing ring;
-      struct ncclTree treeUp;
-      struct ncclTree treeDn;
-      struct ncclTree collTreeUp;
-      struct ncclTree collTreeDn;
+      struct ncclTree tree;
+      struct ncclTree collTree;

      int id;

@@ -221,16 +205,10 @@ struct ncclChannel {
      struct ncclPeer* devPeers;

      // Operation list for aggregation
-      struct ncclColl* collectives;
-      size_t* collectivesExtra;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
+      struct ncclWork* workFifo;
+      int workCount;
+      uint64_t workFifoTail; // Only used by CPU

-      uint32_t* sync;
-      uint64_t* barrier;
-      uint64_t* barrier_next;
 #ifdef ENABLE_PROFILING
      struct timeval tvs;
      uint64_t sizes;
@@ -288,9 +266,11 @@ struct ncclProf {

 #ifdef ENABLE_COLLTRACE
 typedef enum {
+  ncclCollTraceNotReady,
  ncclCollTraceKernelLaunchType,
  ncclCollTraceCollEndType,
-  ncclCollTraceAbortType
+  ncclCollTraceAbortType,
+  ncclCollTraceDataType
 } ncclCollTraceDataType_t;

 struct ncclCollTrace {
@@ -304,7 +284,7 @@ struct ncclCollTrace {
 };
 static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");

-#define COLLTRACE_NUM_ITEMS 1024
+#define COLLTRACE_NUM_ITEMS 8192
 #endif

 struct ncclDevComm {