Merge pull request #782 from ROCmSoftwarePlatform/2.18.3

Sync up with NCCL 2.18.3
Αυτή η υποβολή περιλαμβάνεται σε:
Wenkai Du
2023-06-29 15:04:16 -07:00
υποβλήθηκε από GitHub
γονέας 9bba4a2f2a abd0615351
υποβολή ce6a2ffac8
83 αρχεία άλλαξαν με 5798 προσθήκες και 4090 διαγραφές
+3
Προβολή Αρχείου
@@ -13,6 +13,9 @@
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_POWER(x, y) \
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
+78 -1
Προβολή Αρχείου
@@ -12,6 +12,7 @@
#include "checks.h"
#include "align.h"
#include "utils.h"
#include "p2p.h"
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
@@ -87,6 +88,77 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
#define MAX_ALLOC_TRACK_NGPU 32
extern struct allocationTracker allocTracker[];
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
CUdevice currentDev;
CUmemAllocationProp prop = {};
CUmemAccessDesc accessDesc = {};
CUmemGenericAllocationHandle handle;
int cudaDev;
int flag = 0;
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
prop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &prop, 0));
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
/* Map the virtual address range to the physical allocation */
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
/* Now allow RW access to the newly mapped memory */
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = currentDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
if (handlep) *handlep = handle;
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
return result;
}
static inline ncclResult_t ncclCuMemFree(void *ptr) {
if (ptr == NULL) return ncclSuccess;
ncclResult_t result = ncclSuccess;
CUmemGenericAllocationHandle handle;
size_t size = 0;
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
return result;
}
#else
extern int ncclCuMemEnable();
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
static inline ncclResult_t ncclCuMemFree(void *ptr) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
#endif
template <typename T>
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
ncclResult_t result = ncclSuccess;
@@ -193,8 +265,13 @@ template <typename T>
ncclResult_t ncclCudaFree(T* ptr) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaFree(ptr), result, finish);
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
} else {
CUDACHECKGOTO(cudaFree(ptr), result, finish);
}
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
return result;
@@ -20,6 +20,7 @@ ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
+3 -1
Προβολή Αρχείου
@@ -9,7 +9,9 @@
#include "comm.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
+29 -29
Προβολή Αρχείου
@@ -18,11 +18,11 @@
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
#define CUDACHECKGOTO(cmd, RES, label) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
res = ncclUnhandledCudaError; \
RES = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
@@ -60,11 +60,11 @@
} \
} while(true)
#define SYSCHECKGOTO(statement, res, label) do { \
#define SYSCHECKGOTO(statement, RES, label) do { \
if ((statement) == -1) { \
/* Print the back trace*/ \
res = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
@@ -72,16 +72,16 @@
#define NEQCHECK(statement, value) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define NEQCHECKGOTO(statement, value, res, label) do { \
#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
res = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
@@ -89,57 +89,57 @@
#define EQCHECK(statement, value) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define EQCHECKGOTO(statement, value, res, label) do { \
#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
res = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return RES; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while (0);
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
ncclResult_t res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return ncclInternalError; \
} \
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
res = call; \
if (res != ncclSuccess && res != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
#define NCCLCHECKTHREAD(a, args) do { \
@@ -63,11 +63,12 @@ struct ncclDevRedOpFull {
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type))
#define DECL3(func, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef)
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef) \
DECL4(func, NVLS_TREE, devredop, type, undef)
#if defined(RCCL_BFLOAT16)
#define DECL2(func, devredop, undefForFloat) \
+55 -20
Προβολή Αρχείου
@@ -101,19 +101,51 @@ struct ncclCommCallback {
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};
struct ncclSharedResources {
int refCount;
struct ncclComm* owner; /* comm which creates this shared res. */
struct ncclChannelPeer* peers[MAXCHANNELS];
struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
/* P2P operation counter, one per channel */
uint64_t p2pOpCount[MAXCHANNELS];
/* Collective operation counter */
uint64_t collOpCount;
int tpNRanks;
int tpNLocalRanks;
int tpNChannels;
int tpP2pNChannels;
int tpP2pChunkSize;
uint64_t magic;
// top parent rank to localRank translation table
int* tpRankToLocalRank;
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
/* proxy related shared res */
struct ncclProxyState* proxyState;
};
struct ncclChannel {
struct ncclChannelPeer* peers;
struct ncclDevChannelPeer* devPeers;
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
uint64_t p2pOpCount;
/* comm split sharable resources */
struct ncclChannelPeer* collnetPeers;
struct ncclDevChannelPeer* collnetDevPeers;
struct ncclChannelPeer* nvlsPeers;
struct ncclDevChannelPeer* nvlsDevPeers;
};
struct ncclWorkList {
@@ -167,6 +199,10 @@ struct ncclComm {
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
struct ncclSharedResources* sharedRes;
/* map to top parent ranks. */
int* topParentRanks;
int* topParentLocalRanks;
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
@@ -180,15 +216,16 @@ struct ncclComm {
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
//int nvmlDev; // my nvml device index
int compCap; // compute capability of the GPU
int minCompCap; // min compute capability in the communicator
int minCompCap, maxCompCap; // min/max compute capability in the communicator
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int WarpSize;
int virtualId;
int cudaArch; // matches __CUDA_ARCH__ of device
int node;
@@ -207,12 +244,11 @@ struct ncclComm {
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
// Collective operation counter
uint64_t collOpCount;
// Channels for collectives
int nChannels;
int nvlsChannels;
int collNetChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
@@ -237,6 +273,8 @@ struct ncclComm {
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t *childAbortFlag;
uint32_t *abortFlagRefCount;
// Flags for enable P2P NET
uint32_t p2pNet;
@@ -268,21 +306,24 @@ struct ncclComm {
char intraPad2[64 - sizeof(uint64_t)];
uint64_t intraBarrierGate; // only used if this is intraComm0
struct ncclProxyState proxyState;
struct ncclProxyState* proxyState;
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
// Whether this communicator uses collNet
int collNetSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
int intraHighestTransportType;
int* collNetHeads;
int collNetHeadsNum;
/* sharable collNet proxy progress resource. */
struct ncclCollNetSharedRes* collNetSharedRes;
// NVLink SHARP (NVLS) support
int nvlsSupport;
void* nvlsResources;
/* sharable NVLS resource. */
struct ncclNvlsSharedRes* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
@@ -319,13 +360,7 @@ struct ncclComm {
volatile bool collTraceExit;
#endif
// communicator mode
int blocking;
// CGA cluster size
int cgaClusterSize;
int minCTAs, maxCTAs;
// network interface name
char *netName;
ncclConfig_t config;
// initState is to more conveniently reclaim resources when errors happen.
ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called
@@ -11,6 +11,9 @@
#include <cuda_runtime.h>
#include "checks.h"
// Is cuMem API usage enabled
extern int ncclCuMemEnable();
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
#else
@@ -85,6 +88,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
+15 -6
Προβολή Αρχείου
@@ -21,12 +21,13 @@
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -112,10 +113,10 @@ struct ncclConnInfo {
};
struct ncclProxyConnector {
int rank;
int localRank;
int tpRank;
int tpLocalRank;
int sameProcess;
struct ncclProxyConnection* connection;
struct ncclComm* comm;
};
struct ncclConnector {
@@ -124,7 +125,6 @@ struct ncclConnector {
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
struct ncclComm *comm;
};
struct ncclRing {
@@ -141,6 +141,9 @@ struct ncclRing {
};
// The root of each tree only has one node down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have to two nodes down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
@@ -161,18 +164,24 @@ struct ncclDirect {
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_NVLS_ARITY 8
#define NCCL_MAX_NVLS_TREE_ARITY 3
struct ncclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int up[NCCL_MAX_NVLS_ARITY];
int down;
int treeUp;
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#define NCCL_MAX_CONNS 3
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
int refCount;
};
struct ncclDevComm;
@@ -362,7 +371,7 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
#endif
struct alignas(16) ncclDevChannel {
struct ncclDevChannelPeer *peers;
struct ncclDevChannelPeer** peers;
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collnetChain;
+1 -1
Προβολή Αρχείου
@@ -298,7 +298,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
CUDACHECK(cudaFree(md->gdrDevMem));
NCCLCHECK(ncclCudaFree(md->gdrDevMem));
free(md);
return ncclSuccess;
+9 -8
Προβολή Αρχείου
@@ -59,9 +59,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
#define NCCL_TOPO_CPU_TYPE_ROME 4
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
#define NCCL_TOPO_MAX_NODES 256
@@ -72,6 +74,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
@@ -108,18 +111,16 @@ struct ncclTopoRanks {
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc);
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
+20 -1
Προβολή Αρχείου
@@ -36,6 +36,7 @@ struct ncclAsyncJob {
void(*destructor)(void*);
ncclGroupJobState_t state;
volatile uint32_t *abortFlag; /* point to comm abortFlag */
volatile uint32_t *childAbortFlag; /* point to child abortFlag */
ncclComm_t comm;
};
@@ -67,6 +68,24 @@ extern __thread ncclResult_t ncclGroupError;
extern __thread struct ncclComm* ncclGroupCommHead;
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
extern __thread int ncclGroupBlocking;
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
extern __thread struct ncclGroupJob ncclGroupJobMain;
static inline void groupResetJobState() {
ncclGroupBlocking = -1;
ncclGroupJobMainPtr = NULL;
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
return;
}
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
ncclResult_t ret = ncclSuccess;
if (job) {
ret = ncclAsyncJobComplete(&job->base);
groupResetJobState();
}
return ret;
}
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
if (ncclGroupDepth > 0) {
@@ -91,7 +110,7 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
ncclMemoryStackPush(&comm->memScoped);
}
ncclGroupBlocking = comm->blocking;
ncclGroupBlocking = comm->config.blocking;
}
// Add comm to this thread's group needing preconnect
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο Φόρτωση Διαφορών
@@ -0,0 +1,44 @@
#ifndef NCCL_IBV_SYMBOLS_H_
#define NCCL_IBV_SYMBOLS_H_
#ifdef NCCL_BUILD_RDMA_CORE
#include <infiniband/verbs.h>
#else
#include "ibvcore.h"
#endif
#include "nccl.h"
/* IB Verbs Function Pointers*/
struct ncclIbvSymbols {
int (*ibv_internal_fork_init)(void);
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
void (*ibv_internal_free_device_list)(struct ibv_device **list);
const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
int (*ibv_internal_close_device)(struct ibv_context *context);
int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
/* DMA-BUF support */
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
};
/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
#endif // NCCL_IBV_SYMBOLS_H_
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο Φόρτωση Διαφορών
+3 -2
Προβολή Αρχείου
@@ -26,6 +26,7 @@ typedef enum : uint8_t {
ncclPatternCollnetChain,
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
@@ -94,7 +95,6 @@ struct ncclCudaStreamList {
struct ncclCudaStreamList *next;
cudaStream_t stream;
};
struct ncclTasks {
struct Peer {
bool sendSeen, recvSeen;
@@ -104,7 +104,8 @@ struct ncclTasks {
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
size_t collBytesTotal;
struct Peer* peers/*[nRanks]*/;
int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
int *p2pSendOrder, *p2pRecvOrder;
int p2pOrderSteps;
int nTasksColl, nTasksP2p;
// The list of user streams aggregated over all tasks present.
-19
Προβολή Αρχείου
@@ -18,25 +18,6 @@ ncclResult_t ncclNetPluginInit();
ncclResult_t ncclNetInit(struct ncclComm* comm);
int ncclNetVersion(struct ncclComm* comm);
// Translation to external API
static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
// Test whether the current GPU support GPU Direct RDMA.
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
+21 -12
Προβολή Αρχείου
@@ -126,7 +126,7 @@
* Systems:
*
* \image html
* https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
* https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png
*
* Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
* ranges to your code that automatically use the name of the enclosing function
@@ -561,18 +561,27 @@
/* Temporary helper #defines, removed with #undef at end of header */
#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
#if defined(_MSC_VER) && _MSC_VER < 1914
/* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser
* that does not work with domain::get's specialization for domain::global,
* and would require extra conditions to make SFINAE work for the overloaded
* get() functions. This macro disables use of overloaded get() in order to
* work with VS2015 and versions of VS2017 below 15.7, without penalizing
* users of newer compilers. Building with this flag set to 0 means errors
* when defining tag structs (see documentation for domain, named_category,
* and registered_string) will have more complex compiler error messages
* instead of the clear static_assert messages from the get() overloads.
/* Some compilers do not correctly support SFINAE, which is used in this API
* to detect common usage errors and provide clearer error messages (by using
* static_assert) than the compiler would produce otherwise. These compilers
* will generate errors while compiling this file such as:
*
* error: name is not a member of nvtx3::v1::domain::global
*
* The following compiler versions are known to have this problem, and so are
* set by default to disable the SFINAE-based checks:
*
* - All MSVC versions prior to VS2017 Update 7 (15.7)
* - GCC 8.1-8.3 (the problem was fixed in GCC 8.4)
*
* If you find your compiler hits this problem, you can work around it by
* defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this
* header, or you can add a check for your compiler version to this #if.
* Also, please report the issue on the NVTX github page.
*/
#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
#if defined(_MSC_VER) && _MSC_VER < 1914 \
|| defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4
#define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0
#else
#define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1
@@ -1,30 +1,33 @@
/*
* Copyright 2021-2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
/*
* Helper array to get the alignment for each predefined C language type.
*/
typedef void* pointer_type;
#if __STDC_VERSION__ >= 201112L /* or CPP11 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
#include <uchar.h>
#include <stdalign.h>
#endif
/* `alignof` is available as of C11 or C++11 */
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
#define nvtx_alignof(type) alignof(type)
#define nvtx_alignof2(type,tname) alignof(type)
#else /* __STDC_VERSION__ >= 201112L */
#ifndef __cplusplus
#include <stddef.h>
#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
#define nvtx_alignof2(type,tname) nvtx_alignof(type)
#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */
#else /* __cplusplus */
#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
/* Create helper structs to determine type alignment. */
#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type
#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname
MKTYPEDEF(char);
MKTYPEDEF2(unsigned char, uchar);
@@ -54,22 +57,33 @@ MKTYPEDEF(size_t);
MKTYPEDEF(pointer_type);
MKTYPEDEF(wchar_t);
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
/* `char8_t` is available as of C++20 or C23 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
MKTYPEDEF(char8_t);
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
/* `char16_t` and `char32_t` are available as of C++11 or C11 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
MKTYPEDEF(char16_t);
MKTYPEDEF(char32_t);
#endif
/* C requires to include stddef.h to use `offsetof` */
#ifndef __cplusplus
#include <stddef.h>
#endif
#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d)
#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d)
#endif /* __STDC_VERSION__ >= 201112L */
#undef MKTYPEDEF
#undef MKTYPEDEF2
#endif /* __cplusplus */
#endif /* __STDC_VERSION__ >= 201112L */
/*
* Helper array to get the alignment for each predefined C/C++ language type.
* The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`.
*/
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
@@ -109,13 +123,14 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
/*** Special character types ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)},
#else
{0, 0},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0},
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
#else
@@ -125,4 +140,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
};
#undef nvtx_alignof
#undef nvtx_alignof2
#undef nvtx_alignof2
+17
Προβολή Αρχείου
@@ -9,4 +9,21 @@
#ifndef NCCL_P2P_H_
#define NCCL_P2P_H_
#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
typedef struct {
int data; // Currently only support an fd based descriptor
} ncclCuDesc;
typedef union {
// Legacy CUDA IPC
cudaIpcMemHandle_t devIpc;
// cuMem API support
ncclCuDesc cuDesc;
} ncclIpcDesc;
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
#endif
+34 -21
Προβολή Αρχείου
@@ -15,11 +15,13 @@
#include "ipcsocket.h"
#include <pthread.h>
#include "shm.h"
#include "p2p.h"
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
enum { proxyRecv=0, proxySend=1 };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
@@ -130,18 +132,11 @@ struct ncclProxySharedP2p {
int size;
char* cudaBuff;
char* hostBuff;
cudaIpcMemHandle_t ipc;
// CUDA IPC
ncclIpcDesc ipcDesc;
struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
};
struct ncclProxySharedCollNet {
int size;
char* cudaBuff;
char* hostBuff;
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
void* resources;
};
struct ncclProxyPeer {
struct ncclProxySharedP2p send;
struct ncclProxySharedP2p recv;
@@ -165,7 +160,6 @@ struct ncclProxyProgressState {
bool stop;
struct ncclProxyPeer** localPeers;
struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
struct ncclProxySharedCollNet collNet;
struct ncclProxyArgs* active;
struct ncclProxyArgs* pool;
struct ncclProxyPool* pools;
@@ -192,12 +186,27 @@ struct ncclProxyAsyncOp {
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
int tpRank;
int tpLocalRank;
ncclProxyAsyncOp* asyncOps;
int asyncOpCounter;
};
struct ncclProxyState {
int refCount;
int tpRank;
int tpnRanks;
int tpLocalnRanks;
int cudaDev;
int p2pnChannels;
int p2pChunkSize;
int nChannels;
int buffSizes[NCCL_NUM_PROTOCOLS];
bool allocP2pNetLLBuffers;
bool dmaBufSupport;
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
volatile uint32_t* abortFlag;
// Service thread
pthread_t thread;
struct ncclSocket* listenSock;
@@ -209,6 +218,7 @@ struct ncclProxyState {
struct ncclSocket* peerSocks;
struct ncclProxyOps* proxyOps;
void** sharedDevMems;
struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
// Progress thread
struct ncclProxyProgressState progressState;
@@ -228,13 +238,14 @@ enum proxyConnectState {
struct ncclProxyConnection {
int send, transport, shared;
int localRank;
int tpLocalRank, sameProcess;
struct ncclSocket* sock;
struct ncclTransportComm* tcomm;
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
void* transportResources;
proxyConnectState state;
struct ncclCollNetSharedRes* collNet;
};
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -250,7 +261,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* prox
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
enum ncclProxyMsgType {
ncclProxyMsgInit = 1,
ncclProxyMsgSharedInit = 2,
@@ -260,22 +271,24 @@ enum ncclProxyMsgType {
ncclProxyMsgClose = 6,
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8,
ncclProxyMsgConvertFd = 9 // cuMem API support
ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
};
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
enum { proxyRecv=0, proxySend=1 };
ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
#endif
+38 -10
Προβολή Αρχείου
@@ -36,7 +36,6 @@ struct ncclComm;
struct ncclPeerInfo {
int rank;
int cudaDev;
int netDev;
int gdrSupport;
bool hasFineGrain;
uint64_t hostHash;
@@ -45,7 +44,6 @@ struct ncclPeerInfo {
int64_t busId;
struct ncclComm* comm;
int cudaCompCap;
int virtualId;
};
#define CONNECT_SIZE 128
@@ -53,15 +51,46 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
#if CUDART_VERSION >= 12010
#define NVLS_HANDLE_SIZE 64
struct ncclNvlsSharedRes {
int refCount;
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
char shareableHandle[NVLS_HANDLE_SIZE];
int nChannels;
};
#endif /* CUDART_VERSION >= 12010 */
struct ncclCollNetSharedRes {
int refCount;
int size;
char* cudaBuff;
char* hostBuff;
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
void* resources;
int nChannels;
size_t buffSize;
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(struct ncclConnector*);
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels);
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
};
struct ncclTransport {
@@ -74,10 +103,9 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
#if CUDART_VERSION >= 12010
ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
#endif
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);