Enable LL128 protocol support (#605)
* Enable LL128 protocol support
* Use shared memory object directly when possible
This commit is contained in:
@@ -32,13 +32,25 @@ struct ncclDevRedOpFull {
|
||||
#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
|
||||
ncclKernel_##func##_##algo##_##proto##_##devredop##_##type
|
||||
|
||||
#define NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type) \
|
||||
ncclKernelDebug_##func##_##algo##_##proto##_##devredop##_##type
|
||||
|
||||
#define NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type) \
|
||||
ncclKernelLL128_##func##_##algo##_##proto##_##devredop##_##type
|
||||
|
||||
#define NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type) \
|
||||
ncclKernelLL128Debug_##func##_##algo##_##proto##_##devredop##_##type
|
||||
|
||||
#define NCCL_IMPL_NAME(func, algo, proto) \
|
||||
nccl##func##algo##proto
|
||||
|
||||
/* Declare all collective operations */
|
||||
#define DECL5(func, algo, proto, devredop, type) \
|
||||
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
|
||||
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
|
||||
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
|
||||
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm);
|
||||
|
||||
#define CONCAT(a,b) a##b
|
||||
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
|
||||
|
||||
@@ -197,8 +197,7 @@ struct ncclComm {
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
struct ncclWorkElem args;
|
||||
void* argsptrs[2];
|
||||
void* argsptrs[1];
|
||||
|
||||
struct ncclProxyState proxyState;
|
||||
|
||||
|
||||
@@ -76,18 +76,18 @@ union ncclLLFifoLine {
|
||||
// Make sure the clean mask will last for at least NCCL_STEPS
|
||||
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
|
||||
|
||||
#define NCCL_LL128_LINESIZE 128
|
||||
#define NCCL_LL128_LINESIZE 64
|
||||
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
|
||||
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
|
||||
|
||||
#define NCCL_LL128_MAX_NTHREADS 256
|
||||
#define NCCL_LL128_ELEMS_PER_THREAD 120
|
||||
#define NCCL_LL128_ELEMS_PER_THREAD 28
|
||||
|
||||
// Receiving from up to 3 sources is more compute intensive than sending
|
||||
// to 3 dests. Use 70% for reduce and 30% for bcast.
|
||||
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
|
||||
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
|
||||
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
|
||||
#define NCCL_DIRECT_WRITE 0x01
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
|
||||
|
||||
size_t ncclKernMaxLocalSize();
|
||||
size_t ncclKernLocalSize(int i);
|
||||
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
|
||||
|
||||
مرجع در شماره جدید
Block a user