# Patch for src/include/collectives.h: per-collective SLICESTEPS / CHUNKSTEPS macros.
#
# What this hunk changes:
#   * ALLREDUCE / ALLGATHER / REDUCESCATTER: removes the hard-coded step counts
#     (2/4, 4/4, 2/4) and the commented-out NCCL_STEPS-based variants, and defines
#     SLICESTEPS as (NCCL_STEPS/4) and CHUNKSTEPS as (NCCL_STEPS/2).
#   * BROADCAST_CHUNKSTEPS and REDUCE_CHUNKSTEPS: 4 -> 1.
#   * SENDRECV_SLICEFACTOR: 1 -> 4.
#   * NCCL_MAX_SLICE_PER_CHUNK: 4 -> 2.
#
# NOTE(review): NCCL_STEPS is defined outside this hunk. The new maximum of 2
# matches (NCCL_STEPS/2)/(NCCL_STEPS/4) when NCCL_STEPS is a nonzero multiple
# of 4 -- confirm against its definition. The untouched ALLTOALL_PIVOT pair
# (4/2) also yields a ratio of 2.
#
# NOTE(review): line breaks were restored from a single-line copy of this patch.
# The two blank context lines are placed to satisfy the hunk counts
# (-110,24 +110,18); their exact position and any intra-line spacing should be
# verified against the target file before applying.
diff --git a/src/include/collectives.h b/src/include/collectives.h
index db8fc99fd3..4f3f55bed1 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -110,24 +110,18 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)();
 extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
 
 // CHUNKSIZE must be a multiple of SLICESIZE
-//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
-//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
-//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
-//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
-//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
-//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
-#define ALLREDUCE_SLICESTEPS 2
-#define ALLREDUCE_CHUNKSTEPS 4
-#define ALLGATHER_SLICESTEPS 4
-#define ALLGATHER_CHUNKSTEPS 4
-#define REDUCESCATTER_SLICESTEPS 2
-#define REDUCESCATTER_CHUNKSTEPS 4
+#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
 #define BROADCAST_SLICESTEPS 1
-#define BROADCAST_CHUNKSTEPS 4
+#define BROADCAST_CHUNKSTEPS 1
 #define REDUCE_SLICESTEPS 1
-#define REDUCE_CHUNKSTEPS 4
-#define SENDRECV_SLICEFACTOR 1
-#define NCCL_MAX_SLICE_PER_CHUNK 4 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
+#define REDUCE_CHUNKSTEPS 1
+#define SENDRECV_SLICEFACTOR 4
+#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
 
 #define ALLTOALL_PIVOT_SLICESTEPS 2
 #define ALLTOALL_PIVOT_CHUNKSTEPS 4