Fichiers
rocm-systems/src/collectives.cc
T
Sylvain Jeaugey 68b542363f 2.23.4-1
Add scalable init API
 * Add new ncclCommInitRankScalable to allow for passing multiple
   unique IDs to the init function.
 * Spreads the load onto multiple bootstrap roots, allowing for
   constant bootstrap time.
 * Requires multiple ranks to create a unique ID, and the CPU-side
   ID exchange code to call allgather[v] instead of broadcast.

Accelerate init bootstrap operations
 * Reduce the number of calls to allgather.
 * Allow roots to reply early to ranks when information is already
   available.
 * Add an option to use ncclNet instead of sockets to perform
   bootstrap allgather operations.

Add PAT algorithms for Allgather and ReduceScatter
 * Parallel Aggregated Trees, variation of Bruck algorithm.
 * Logarithmic number of network steps for small sizes at scale.
 * Only supports one rank per node at the moment.

Add support for registered buffers for intra-node communication.
 * Allow registered user buffers to be accessed directly intra-node
 * Avoids extra copies in algorithms which permit it, saving
   memory bandwidth and helping with compute overlap.

Add profiler plugin API
 * New plugin API for profiling
 * Supports various levels of profiling, with a hierarchy.

Asynchronous graph allocation
 * Make calls to cudaMalloc and cudaMemcpy during graph allocation
   asynchronous.
 * Significantly speeds up graph capture.

Use fatal IB asynchronous events to stop network operation
 * Avoids many other error messages
 * Only fatal errors are affected; potentially transient errors
   (e.g. port down) do not cause an immediate stop.

Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
 * P2P would cause a significant performance degradation when using
   many GPUs, and therefore many interleaved data flows.
 * Disable P2P through the CPU when we have 3+ GPUs per node; keep it
   enabled when we only have 2 GPUs.

Improve the init logs to report the real NCCL function.
 * Make the log report ncclCommInitRank or ncclCommSplit, rather than
   the generic ncclCommInitRankFunc.

Add a parameter to set the location of the user configuration file.
 * Add NCCL_CONF_FILE environment variable to set where the user's
   configuration file resides.

Increase default IB timeout
 * Increase IB timeout value from 18 to 20.
 * Should help avoid fatal errors on large RoCE systems.

Add new check for nvidia peermem
 * On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer
   present; check for /sys/module/nvidia_peermem/version instead.

Fix old performance regression when mixing small and large operations.
 * Improves distribution of work on channels.

Fix crash when NUMA IDs are equal to -1.
 * Can happen when a NIC is a virtual NIC, or when linux doesn't
   know which NUMA node a device is attached to
 * Issue NVIDIA/nccl-tests#233

Fix tree graph search when NCCL_CROSS_NIC is set to 1.
 * Would force NCCL to use the balanced_tree pattern, thereby
   disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
 * Would also try to use alternate rings even though it was not
   needed.

Compiler tweaks and fixes
 * PR #1177
 * PR #1228

Fix stack smash
 * PR #1325

Fixes for multi-node NVLink + IB operation

Coverity fixes and comments.
2024-09-16 23:41:17 -07:00

240 lignes
9.4 KiB
C++

/*************************************************************************
* Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "argcheck.h" // Need some checks here since we access comm
#include "collectives.h"
#include "enqueue.h"
#include "nccl.h"
const char* ncclFuncToString(ncclFunc_t fn) {
switch (fn) {
case ncclFuncAllGather: return "AllGather";
case ncclFuncAllReduce: return "AllReduce";
case ncclFuncBroadcast: return "Broadcast";
case ncclFuncRecv: return "Recv";
case ncclFuncReduce: return "Reduce";
case ncclFuncReduceScatter: return "ReduceScatter";
case ncclFuncSendRecv: return "SendRecv";
case ncclFuncSend: return "Send";
default: return "Invalid";
}
}
const char* ncclDevRedOpToString(ncclDevRedOp_t op) {
switch (op) {
case ncclDevSum: return "Sum";
case ncclDevProd: return "Prod";
case ncclDevMinMax: return "MinMax";
case ncclDevPreMulSum: return "PreMulSum";
case ncclDevSumPostDiv: return "SumPostDiv";
default: return "Unknown";
}
}
const char* ncclDatatypeToString(ncclDataType_t type) {
switch (type) {
case ncclInt8: return "ncclInt8";
case ncclInt32: return "ncclInt32";
case ncclUint32: return "ncclUint32";
case ncclInt64: return "ncclInt64";
case ncclUint64: return "ncclUint64";
case ncclFloat16: return "ncclFloat16";
case ncclFloat32: return "ncclFloat32";
case ncclFloat64: return "ncclFloat64";
#if defined(__CUDA_BF16_TYPES_EXIST__)
case ncclBfloat16: return "ncclBfloat16";
#endif
default: return "Unknown";
}
}
const char* ncclAlgoToString(int algo) {
switch (algo) {
case NCCL_ALGO_TREE: return "TREE";
case NCCL_ALGO_RING: return "RING";
case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT";
case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
case NCCL_ALGO_NVLS: return "NVLS";
case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
case NCCL_ALGO_PAT: return "PAT";
default: return "Unknown";
}
}
const char* ncclProtoToString(int proto) {
switch (proto) {
case NCCL_PROTO_LL: return "LL";
case NCCL_PROTO_LL128: return "LL128";
case NCCL_PROTO_SIMPLE: return "SIMPLE";
default: return "Unknown";
}
}
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
};
size_t msgsize = sendcount * ncclTypeSize(datatype);
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsAllReduce {
size_t bytes;
ncclRedOp_t op;
};
// Just pass the size of one message and not the total bytes sent/received.
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsAllReduce, op)}
};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsBroadcast {
size_t bytes;
int root;
};
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsReduce {
size_t bytes;
int root;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsReduceScatter {
size_t bytes;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
struct NvtxParamsSendRecv {
size_t bytes;
int peer;
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
struct ncclInfo info = { ncclFuncSend, "Send",
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
exit:
NCCLCHECK(ncclGroupEnd());
return ret;
}
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
exit:
NCCLCHECK(ncclGroupEnd());
return ret;
}