Enable LL128 protocol support (#605)

* Enable LL128 protocol support

* Use shared memory object directly when possible
This commit is contained in:
Wenkai Du
2022-09-08 14:45:27 -07:00
کامیت شده توسط GitHub
والد d700a94918
کامیت 7bbce085cc
13 فایلهای تغییر یافته به همراه 382 افزوده شده و 247 حذف شده
+13 -1
مشاهده پرونده
@@ -32,13 +32,25 @@ struct ncclDevRedOpFull {
// Builds the identifier ncclKernel_<func>_<algo>_<proto>_<devredop>_<type>
// by token-pasting the five arguments, separated by underscores.
#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
ncclKernel_##func##_##algo##_##proto##_##devredop##_##type
// Same pasting scheme, with the ncclKernelDebug_ prefix.
#define NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type) \
ncclKernelDebug_##func##_##algo##_##proto##_##devredop##_##type
// Same pasting scheme, with the ncclKernelLL128_ prefix.
#define NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type) \
ncclKernelLL128_##func##_##algo##_##proto##_##devredop##_##type
// Same pasting scheme, with the ncclKernelLL128Debug_ prefix.
#define NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type) \
ncclKernelLL128Debug_##func##_##algo##_##proto##_##devredop##_##type
// Pastes nccl, func, algo and proto together with no separators,
// giving the identifier nccl<func><algo><proto>.
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
/* Declare all collective operations.
 * For one (func, algo, proto, devredop, type) combination, DECL5 emits extern
 * declarations for the noinline __device__ function named by NCCL_FUNC_NAME
 * and for the __global__ kernels named by NCCL_KERN_NAME,
 * NCCL_KERN_NAME_DEBUG, NCCL_KERN_NAME_LL128 and NCCL_KERN_NAME_LL128_DEBUG.
 * NOTE(review): NCCL_KERN_NAME is declared twice below, once taking
 * (comm, struct ncclWorkElem c) and once taking only (comm). This looks like
 * the old and new sides of a diff shown together -- confirm which is intended. */
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm);
// Token-pastes a and b into a single identifier.
#define CONCAT(a,b) a##b
// Dispatches to the macro named MACRO_IF_<cond>, invoking it with (t, f).
// The paste happens inside CONCAT, so cond is macro-expanded before it is
// pasted. The MACRO_IF_<cond> macros themselves are not defined in this view.
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
+1 -2
مشاهده پرونده
@@ -197,8 +197,7 @@ struct ncclComm {
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclWorkElem args;
void* argsptrs[2];
void* argsptrs[1];
struct ncclProxyState proxyState;
+3 -3
مشاهده پرونده
@@ -76,18 +76,18 @@ union ncclLLFifoLine {
// Make sure the clean mask will last for at least NCCL_STEPS:
// compilation fails unless NCCL_LL_CLEAN_MASK is a multiple of NCCL_STEPS.
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
// Size of one LL128 line. It is divided by sizeof(uint64_t) below, so the
// value is a byte count.
// NOTE(review): this macro is defined twice in a row with different values
// (128, then 64). This looks like the old and new sides of a diff shown
// together -- confirm which value is intended.
#define NCCL_LL128_LINESIZE 128
#define NCCL_LL128_LINESIZE 64
// Number of uint64_t elements in one line.
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
// One element fewer than NCCL_LL128_LINEELEMS (presumably the remaining
// element is not payload -- TODO confirm against the LL128 primitives).
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
// Thread-count constant; NCCL_LL128_SHMEM_SIZE is computed from it.
#define NCCL_LL128_MAX_NTHREADS 256
// NOTE(review): also defined twice with different values (120, then 28);
// same diff-residue caveat as NCCL_LL128_LINESIZE -- confirm.
#define NCCL_LL128_ELEMS_PER_THREAD 120
#define NCCL_LL128_ELEMS_PER_THREAD 28
// Receiving from up to 3 sources is more compute intensive than sending
// to 3 dests. Use 70% for reduce and 30% for bcast.
// Evaluates to 70% of nt rounded down to a multiple of 32, using integer
// arithmetic. The argument is parenthesized so that an expression argument
// (e.g. a+b) is multiplied as a whole rather than term by term.
#define NCCL_LL128_SPLIT(nt) (((nt)*7/(10*32))*32)
// Per-thread element count used to size NCCL_LL128_SHMEM_SIZE.
// NOTE(review): defined twice in a row with different values (2, then 4).
// This looks like the old and new sides of a diff shown together -- confirm
// which value is intended.
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
// Total element count: per-thread elements times NCCL_LL128_MAX_NTHREADS.
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
// Constant 0x01 (presumably a bit flag -- its users are not visible here).
#define NCCL_DIRECT_WRITE 0x01
+1
مشاهده پرونده
@@ -16,6 +16,7 @@
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
// NOTE(review): only the declarations are visible here; the one-line
// descriptions below are inferred from names and signatures and must be
// confirmed against the definitions.
// Presumably the largest value ncclKernLocalSize() reports over all kernels.
size_t ncclKernMaxLocalSize();
// Presumably the local size of the kernel selected by index i.
size_t ncclKernLocalSize(int i);
// Takes an integer carveOut and reports status as ncclResult_t; presumably
// applies a shared-memory carveout setting to the kernels -- confirm.
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
// Takes the operation description in info and reports status as ncclResult_t.
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
// Takes a communicator and an int out-parameter isLast; presumably the entry
// side of a CPU barrier -- confirm.
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);