5d3ab08b69
Add new NVLS algorithm for allreduce using NVLink SHARP (intra-node only). Add new config options: cgaClusterSize, minCTAs, maxCTAs, netName. Enable LL128 when we use PXN to close rings. NVTX3 includes update. Fix crash when one CollNet (SHARP) rail fails to initialize.
27 строки
1.1 KiB
C
27 строки
1.1 KiB
C
/*************************************************************************
|
|
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_ENQUEUE_H_
|
|
#define NCCL_ENQUEUE_H_
|
|
|
|
#include "comm.h"
|
|
#include "group.h"
|
|
#include "collectives.h"
|
|
#include "utils.h"
|
|
|
|
/* Minimum channel size, computed as 64 times NCCL_LL_THREAD_THRESHOLD.
 * NCCL_LL_THREAD_THRESHOLD is not defined in this header; it must come from
 * one of the headers included above.
 * NOTE(review): the unit is presumably bytes -- confirm at the use sites. */
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
|
|
/* Ideal per-channel size to fully utilize bandwidth: 2 MiB (long long constant). */
#define NCCL_AGG_CHANNEL_SIZE (2LL * 1024 * 1024)
|
|
|
|
/* Prototype only; the definition is not in this header.
 * Takes an integer CUDA architecture value and a pointer to a size_t, and
 * returns an ncclResult_t status.
 * NOTE(review): presumably initializes the device kernels for `cudaArch` and
 * reports the largest kernel stack requirement through *maxStackSize; whether
 * maxStackSize may be NULL is not visible here -- confirm against the definition. */
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
|
|
/* Prototype only; the definition is not in this header.
 * Takes a pointer to a struct ncclInfo and returns an ncclResult_t status.
 * NOTE(review): presumably validates the operation described by `info` and
 * enqueues it -- confirm against the definition. */
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
|
/* Prototype only; the definition is not in this header.
 * Takes the communicator and returns an ncclResult_t status.
 * NOTE(review): by its name this is the preparation step that precedes the
 * ncclLaunchKernel* calls declared in this header -- confirm the call order in
 * the caller. */
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
|
|
/* Prototype only; the definition is not in this header.
 * Takes the communicator and a kernel plan, and returns an ncclResult_t status.
 * NOTE(review): the name suggests a hook run before the kernel launch that
 * must not issue CUDA calls outside of stream capture -- confirm this
 * constraint against the definition and its callers. */
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
|
/* Prototype only; the definition is not in this header.
 * Takes the communicator and a kernel plan, and returns an ncclResult_t status.
 * NOTE(review): presumably performs the actual kernel launch for `plan` --
 * confirm against the definition. */
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
|
/* Prototype only; the definition is not in this header.
 * Takes the communicator and a kernel plan, and returns an ncclResult_t status.
 * NOTE(review): the name suggests a hook run after the kernel launch that
 * must not make any CUDA calls -- confirm this constraint against the
 * definition and its callers. */
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
|
/* Prototype only; the definition is not in this header.
 * Takes the communicator and returns an ncclResult_t status.
 * NOTE(review): by its name this is the final step of the launch sequence
 * declared in this header -- confirm the call order in the caller. */
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
|
|
|
|
#endif // End include guard
|