68b542363f
Add scalable init API * Add new ncclCommInitRankScalable to allow for passing multiple unique IDs to the init function. * Spreads the load onto multiple bootstrap roots, allowing for constant bootstrap time. * Requires multiple ranks to create a unique ID, and the CPU-side ID exchange code to call allgather[v] instead of broadcast. Accelerate init bootstrap operations * Reduce the number of calls to allgather. * Allow roots to reply early to ranks when information is already available. * Add an option to use ncclNet instead of sockets to perform bootstrap allgather operations. Add PAT algorithms for Allgather and ReduceScatter * Parallel Aggregated Trees, variation of Bruck algorithm. * Logarithmic number of network steps for small sizes at scale. * Only supports one rank per node at the moment. Add support for registered buffers for intra-node communication. * Allow registered user buffers to be accessed directly intra-node * Avoids extra copies in algorithms which permit it, saving memory bandwidth and helping with compute overlap. Add profiler plugin API * New plugin API for profiling * Supports various levels of profiling, with a hierarchy. Asynchronous graph allocation * Make calls to cudaMalloc and cudaMemcpy during graph allocation asynchronous. * Significantly speeds up graph capture. Use fatal IB asynchronous events to stop network operation * Avoids many other error messages * Only fatal errors are affected; potentially transient errors (e.g. port down) do not cause an immediate stop. Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node * P2P would cause a significant performance degradation when using many GPUs, and therefore many interleaved data flows. * Disable P2P through the CPU when we have 3+ GPUs per node; keep it enabled when we only have 2 GPUs. Improve the init logs to report the real NCCL function. * Make the log report ncclCommInitRank or ncclCommSplit, rather than the generic ncclCommInitRankFunc. 
Add a parameter to set the location of the user configuration file. * Add NCCL_CONF_FILE environment variable to set where the user's configuration file resides. Increase default IB timeout * Increase IB timeout value from 18 to 20. * Should help avoid fatal errors on large RoCE systems. Add new check for nvidia peermem * On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer present; check for /sys/module/nvidia_peermem/version instead. Fix old performance regression when mixing small and large operations. * Improves distribution of work on channels. Fix crash when NUMA IDs are equal to -1. * Can happen when a NIC is a virtual NIC, or when linux doesn't know which NUMA node a device is attached to * Issue NVIDIA/nccl-tests#233 Fix tree graph search when NCCL_CROSS_NIC is set to 1. * Would force NCCL to use the balanced_tree pattern, thereby disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch. * Would also try to use alternate rings even though it was not needed. Compiler tweaks and fixes * PR #1177 * PR #1228 Fix stack smash * PR #1325 Fixes for multi-node NVLink + IB operation Coverity fixes and comments.
240 lignes
9.4 KiB
C++
240 lignes
9.4 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "argcheck.h" // Need some checks here since we access comm
|
|
#include "collectives.h"
|
|
#include "enqueue.h"
|
|
#include "nccl.h"
|
|
|
|
// Map an ncclFunc_t value to a short human-readable name.
// Any value not listed below yields "Invalid".
const char* ncclFuncToString(ncclFunc_t fn) {
  const char* label = "Invalid";
  switch (fn) {
    case ncclFuncAllGather:     label = "AllGather";     break;
    case ncclFuncAllReduce:     label = "AllReduce";     break;
    case ncclFuncBroadcast:     label = "Broadcast";     break;
    case ncclFuncRecv:          label = "Recv";          break;
    case ncclFuncReduce:        label = "Reduce";        break;
    case ncclFuncReduceScatter: label = "ReduceScatter"; break;
    case ncclFuncSendRecv:      label = "SendRecv";      break;
    case ncclFuncSend:          label = "Send";          break;
    default:                                             break;
  }
  return label;
}
|
|
|
|
// Map a device-side reduction operator to its printable name.
// Operators not listed here are reported as "Unknown".
const char* ncclDevRedOpToString(ncclDevRedOp_t op) {
  if (op == ncclDevSum)        return "Sum";
  if (op == ncclDevProd)       return "Prod";
  if (op == ncclDevMinMax)     return "MinMax";
  if (op == ncclDevPreMulSum)  return "PreMulSum";
  if (op == ncclDevSumPostDiv) return "SumPostDiv";
  return "Unknown";
}
|
|
|
|
// Map an ncclDataType_t value to the name of its enumerator.
//
// Returns a pointer to a string literal; values without a case below
// (including ncclBfloat16 when __CUDA_BF16_TYPES_EXIST__ is not defined)
// are reported as "Unknown".
const char* ncclDatatypeToString(ncclDataType_t type) {
  switch (type) {
    case ncclInt8:    return "ncclInt8";
    // ncclUint8 is a distinct public datatype; without this case a valid
    // uint8 operation would be reported as "Unknown".
    case ncclUint8:   return "ncclUint8";
    case ncclInt32:   return "ncclInt32";
    case ncclUint32:  return "ncclUint32";
    case ncclInt64:   return "ncclInt64";
    case ncclUint64:  return "ncclUint64";
    case ncclFloat16: return "ncclFloat16";
    case ncclFloat32: return "ncclFloat32";
    case ncclFloat64: return "ncclFloat64";
#if defined(__CUDA_BF16_TYPES_EXIST__)
    case ncclBfloat16: return "ncclBfloat16";
#endif
    default: return "Unknown";
  }
}
|
|
|
|
const char* ncclAlgoToString(int algo) {
|
|
switch (algo) {
|
|
case NCCL_ALGO_TREE: return "TREE";
|
|
case NCCL_ALGO_RING: return "RING";
|
|
case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT";
|
|
case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
|
|
case NCCL_ALGO_NVLS: return "NVLS";
|
|
case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
|
|
case NCCL_ALGO_PAT: return "PAT";
|
|
default: return "Unknown";
|
|
}
|
|
}
|
|
|
|
const char* ncclProtoToString(int proto) {
|
|
switch (proto) {
|
|
case NCCL_PROTO_LL: return "LL";
|
|
case NCCL_PROTO_LL128: return "LL128";
|
|
case NCCL_PROTO_SIMPLE: return "SIMPLE";
|
|
default: return "Unknown";
|
|
}
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
|
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
|
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
|
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
|
|
// Just pass the size of one message and not the total bytes sent/received.
|
|
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
|
|
};
|
|
size_t msgsize = sendcount * ncclTypeSize(datatype);
|
|
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
|
|
|
|
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
|
|
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
|
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
|
|
NCCLCHECK(ncclEnqueueCheck(&info));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
|
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
|
struct NvtxParamsAllReduce {
|
|
size_t bytes;
|
|
ncclRedOp_t op;
|
|
};
|
|
// Just pass the size of one message and not the total bytes sent/received.
|
|
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
|
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
|
offsetof(NvtxParamsAllReduce, op)}
|
|
};
|
|
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
|
|
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
|
|
|
|
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
|
|
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
|
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
|
|
NCCLCHECK(ncclEnqueueCheck(&info));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
|
ncclComm_t comm, cudaStream_t stream);
|
|
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
|
ncclComm_t comm, cudaStream_t stream) {
|
|
struct NvtxParamsBroadcast {
|
|
size_t bytes;
|
|
int root;
|
|
};
|
|
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
|
|
};
|
|
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
|
|
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
|
|
|
|
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
|
|
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
|
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
|
|
NCCLCHECK(ncclEnqueueCheck(&info));
|
|
return ncclSuccess;
|
|
}
|
|
/* Deprecated original "in place" function, similar to MPI */
|
|
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
|
|
ncclComm_t comm, cudaStream_t stream);
|
|
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
|
ncclComm_t comm, cudaStream_t stream) {
|
|
NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
|
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
|
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
|
struct NvtxParamsReduce {
|
|
size_t bytes;
|
|
int root;
|
|
ncclRedOp_t op;
|
|
};
|
|
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
|
|
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
|
offsetof(NvtxParamsReduce, op)}
|
|
};
|
|
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
|
|
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
|
|
|
|
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
|
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
|
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
|
NCCLCHECK(ncclEnqueueCheck(&info));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
|
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
|
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
|
struct NvtxParamsReduceScatter {
|
|
size_t bytes;
|
|
ncclRedOp_t op;
|
|
};
|
|
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
|
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
|
offsetof(NvtxParamsReduceScatter, op)}
|
|
};
|
|
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
|
|
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
|
|
|
|
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
|
|
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
|
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
|
|
NCCLCHECK(ncclEnqueueCheck(&info));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
struct NvtxParamsSendRecv {
|
|
size_t bytes;
|
|
int peer;
|
|
};
|
|
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
|
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
|
|
};
|
|
|
|
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
|
ncclComm_t comm, cudaStream_t stream);
|
|
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
|
ncclComm_t comm, cudaStream_t stream) {
|
|
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
|
|
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
|
|
|
|
struct ncclInfo info = { ncclFuncSend, "Send",
|
|
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
|
1, 1 };
|
|
ncclResult_t ret;
|
|
NCCLCHECK(ncclGroupStart());
|
|
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
|
|
exit:
|
|
NCCLCHECK(ncclGroupEnd());
|
|
return ret;
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
|
ncclComm_t comm, cudaStream_t stream);
|
|
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
|
ncclComm_t comm, cudaStream_t stream) {
|
|
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
|
|
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
|
|
|
|
struct ncclInfo info = { ncclFuncRecv, "Recv",
|
|
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
|
1, 1 };
|
|
ncclResult_t ret;
|
|
NCCLCHECK(ncclGroupStart());
|
|
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
|
|
exit:
|
|
NCCLCHECK(ncclGroupEnd());
|
|
return ret;
|
|
}
|