2.9.6-1

Add support for CUDA graphs. Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439. Fix bootstrap issue caused by connection reordering. Fix CPU locking block. Improve CollNet algorithm. Improve performance on DGX A100 for communicators with only one GPU per node.
2021-04-12 16:00:11 -07:00
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -14,47 +14,66 @@ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }
 struct ncclProxyArgs;
 typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);

-struct ncclProxyArgs {
-  proxyProgressFunc_t progress;
+#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
+static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
+
+struct ncclProxySubArgs {
  struct ncclChannel* channel;
  struct ncclConnector* connector;
-  size_t sendbytes;
-  size_t recvbytes;
-  int sliceSteps;
-  int chunkSteps;
  int nsteps;
-  uint64_t opCount;
-  int protocol;
-  int segment; // Only for profiling
-  ncclDataType_t dtype;
-  ncclRedOp_t redOp;
-  int state;   // add component before this line -- it is left out during initialization
+  ssize_t sendbytes;
+  ssize_t recvbytes;
+  int sendChunkSize;
+  int recvChunkSize;
+  int delta;

  // Internal state
+  uint64_t base;
  uint64_t posted;
-  uint64_t received; // Only used by recv proxy to wait for flush.
+  uint64_t received;
+  uint64_t flushed;
  uint64_t transmitted;
  uint64_t done;
  uint64_t end;
  void* requests[NCCL_STEPS];
+};
+
+struct ncclProxyArgs {
+  proxyProgressFunc_t progress;
+  struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
+  int nsubs;
+  int done;
+  int sliceSteps;
+  int chunkSteps;
+  int chunkSize;
+  uint64_t opCount;
+  uint64_t commOpCount;
+  int protocol;
+  ncclDataType_t dtype;
+  ncclRedOp_t redOp;
+  ncclPattern_t pattern;
+  int root;
+  int state;
+  char* sharedBuff[NCCL_STEPS];
+  int sharedSize[NCCL_STEPS];
+
  int idle;

  // Element linking
  pthread_mutex_t mutex;
  struct ncclProxyArgs* next;
  struct ncclProxyArgs* nextPeer;
-  struct ncclProxyArgs* nextGroup;
  struct ncclProxyArgs** proxyAppendPtr;
 };

 struct ncclProxySharedBuffers {
-  int nslots;
-  int slotSize;
-  char* cudaBuff[2*MAXCHANNELS];
-  int* cudaUsed[2*MAXCHANNELS];
-  char* hostBuff[2*MAXCHANNELS];
-  int* hostUsed[2*MAXCHANNELS];
+  int size;
+  char* cudaBuff;
+  char* hostBuff;
  struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
+  // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
+  struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
+  void* collNetResources;
 };

 struct ncclProxyPool;
@@ -63,11 +82,16 @@ struct ncclProxyState {
  pthread_mutex_t opsMutex;
  pthread_mutex_t poolMutex;
  bool stop;
-  struct ncclProxySharedBuffers* sharedBuffs;
-  struct ncclProxyArgs* ops;
-  struct ncclProxyArgs* nextOps;
+  struct ncclProxySharedBuffers sharedBuffs;
+  struct ncclProxyArgs* ops;           // Running operations, used by proxy thread
+  struct ncclProxyArgs* postedOps;     // Posted operations, shared between proxy and main thread, locked with opsMutex
+  struct ncclProxyArgs* postedOpsEnd;
+  struct ncclProxyArgs* nextOps;       // Pending operations, used by main thread (could still be cancelled)
  struct ncclProxyArgs* nextOpsEnd;
-  struct ncclProxyArgs* pool;
+  struct ncclProxyArgs* pool;          // Free operations for main thread
+  struct ncclProxyArgs* poolFreed;     // Freed operations by the progress thread
+  struct ncclProxyArgs* poolReturned;  // Shared between main and progress thread, lock with poolMutex
+
  struct ncclProxyPool* pools;
 };

@@ -79,15 +103,16 @@ enum proxyMode {
  proxyTo = 2
 };

-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
-ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
+ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm);

 ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
-ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
-ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
+ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
+ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
 ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);

 #include <unistd.h>