Arquivos
rocm-systems/src/include/enqueue.h
T
Sylvain Jeaugey 19ab67d172 2.13.4-1
Optimize CUDA graph launch; avoid launching a CPU callback for
intra-node operations.
Simplify kernel common code to improve the latency of send/recv
operations.
Strengthen CUDA streams semantics.
Change NET API to v6, to add dmabuf support.
Add ncclGetLastError() function.
Add ncclRemoteError code and use it for remote network errors.
Support the use of a different NCCL_NET parameter per communicator.
Add support for SHM and P2P transfers using cudaMemcpy.
2022-07-11 08:10:34 -07:00

28 linhas
1.1 KiB
C

/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ENQUEUE_H_
#define NCCL_ENQUEUE_H_
#include "comm.h"
#include "group.h"
#include "collectives.h"
#include "utils.h"
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
size_t ncclKernMaxLocalSize();
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
#endif // End include guard