d97a32fac8
Add support for IB SHARP to NVLS (NVLink SHARP algorithm). Add NVLS+Tree algorithm. Add support for memory management using cuMem* functions. Use all NICs for Send/Receive operations on systems with more than one NIC per GPU (#804). Add ncclCommSplit primitive, with resource sharing option in config. Fix alltoallv hang (#788). Increase number of channels on H100 when we're not limited by NVLink. Improve error reporting in case of IB failure, printing local and remote ID (#779). Add build option to allow compilation against RDMA includes instead of dynamically loading IB verbs symbols (#802). Fix context creation for progress thread (#803). NET/IB: add option to use multiple QPs in round-robin mode. Fix tree performance issue when NVB is disabled on HCM topologies.
33 lines
1.6 KiB
C
33 lines
1.6 KiB
C
/*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#ifndef NCCL_BOOTSTRAP_H_
|
|
#define NCCL_BOOTSTRAP_H_
|
|
|
|
#include "nccl.h"
|
|
#include "comm.h"
|
|
|
|
struct ncclBootstrapHandle {
|
|
uint64_t magic;
|
|
union ncclSocketAddress addr;
|
|
};
|
|
/* Compile-time check: the build fails with the message below if
 * struct ncclBootstrapHandle is larger than ncclUniqueId. */
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
|
|
|
|
/*
 * Bootstrap API.
 *
 * Every function returns an ncclResult_t status code. Only the prototypes are
 * visible in this header: the per-function notes below are inferred from the
 * names and signatures and must be confirmed against the implementation.
 * `commState` is an opaque pointer (void*); its concrete type is not visible
 * here.
 */

/* NOTE(review): presumably sets up the network layer used by bootstrap. */
ncclResult_t bootstrapNetInit();

/* NOTE(review): presumably creates the bootstrap root described by `handle`;
 * `idFromEnv` presumably signals that the identity/address was taken from the
 * environment -- confirm. */
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);

/* NOTE(review): presumably fills `handle` with a new bootstrap identity. */
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);

/* NOTE(review): presumably initializes bootstrap state for `comm` using
 * `handle`. */
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);

/* NOTE(review): presumably initializes bootstrap state for `comm` as a split of
 * `parent`; `color`/`key` presumably carry the usual split semantics and
 * `parentRanks` presumably maps new ranks to ranks in `parent` -- confirm. */
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);

/* NOTE(review): presumably gathers `size` bytes from every rank into `allData`;
 * buffer layout and required capacity are not visible here. */
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);

/* NOTE(review): presumably sends `size` bytes from `data` to rank `peer`,
 * matched by `tag`. */
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);

/* NOTE(review): presumably receives `size` bytes into `data` from rank `peer`,
 * matched by `tag`. */
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);

/* NOTE(review): presumably a barrier across the `nranks` entries of `ranks`,
 * with `rank` being the caller's index within that list -- confirm. */
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);

/* NOTE(review): presumably an all-gather of `size` bytes per rank restricted to
 * the `nranks` entries of `ranks`, result in `allData` -- confirm. */
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);

/* NOTE(review): presumably broadcasts `size` bytes of `bcastData` from `root`
 * to the `nranks` entries of `ranks`; whether `root` indexes `ranks` or is a
 * global rank is not visible here -- confirm. */
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);

/* NOTE(review): presumably releases the bootstrap state in `commState` on the
 * normal shutdown path. */
ncclResult_t bootstrapClose(void* commState);

/* NOTE(review): presumably tears down the bootstrap state in `commState` on the
 * abort/error path; how it differs from bootstrapClose is not visible here. */
ncclResult_t bootstrapAbort(void* commState);
|
|
#endif
|