Enable LL128 protocol support (#605)

* Enable LL128 protocol support
* Use shared memory object directly when possible
2022-09-08 14:45:27 -07:00
@@ -32,13 +32,25 @@ struct ncclDevRedOpFull {
 #define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
  ncclKernel_##func##_##algo##_##proto##_##devredop##_##type

+#define NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type) \
+  ncclKernelDebug_##func##_##algo##_##proto##_##devredop##_##type
+
+#define NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type) \
+  ncclKernelLL128_##func##_##algo##_##proto##_##devredop##_##type
+
+#define NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type) \
+  ncclKernelLL128Debug_##func##_##algo##_##proto##_##devredop##_##type
+
 #define NCCL_IMPL_NAME(func, algo, proto) \
  nccl##func##algo##proto

 /* Declare all collective operations */
 #define DECL5(func, algo, proto, devredop, type) \
  extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
-  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \
+  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
+  extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
+  extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
+  extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm);

 #define CONCAT(a,b) a##b
 #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
@@ -197,8 +197,7 @@ struct ncclComm {
  int* intraCudaDevs;
  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclWorkElem args;
-  void* argsptrs[2];
+  void* argsptrs[1];

  struct ncclProxyState proxyState;

@@ -76,18 +76,18 @@ union ncclLLFifoLine {
 // Make sure the clean mask will last for at least NCCL_NSTEPS
 static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");

-#define NCCL_LL128_LINESIZE 128
+#define NCCL_LL128_LINESIZE 64
 #define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
 #define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)

 #define NCCL_LL128_MAX_NTHREADS 256
-#define NCCL_LL128_ELEMS_PER_THREAD 120
+#define NCCL_LL128_ELEMS_PER_THREAD 28

 // Receiving from up to 3 sources is more compute intensive than sending
 // to 3 dests. Use 70% for reduce and 30% for bcast.
 #define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)

-#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
+#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)

 #define NCCL_DIRECT_WRITE 0x01
@@ -16,6 +16,7 @@
 #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */

 size_t ncclKernMaxLocalSize();
+size_t ncclKernLocalSize(int i);
 ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);