Merge pull request #782 from ROCmSoftwarePlatform/2.18.3
Sync up with NCCL 2.18.3
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -13,6 +13,9 @@
|
||||
#define ROUNDUP(x, y) \
|
||||
(DIVUP((x), (y))*(y))
|
||||
|
||||
#define ALIGN_POWER(x, y) \
|
||||
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
|
||||
|
||||
#define ALIGN_SIZE(size, align) \
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "checks.h"
|
||||
#include "align.h"
|
||||
#include "utils.h"
|
||||
#include "p2p.h"
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
@@ -87,6 +88,77 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
|
||||
#define MAX_ALLOC_TRACK_NGPU 32
|
||||
extern struct allocationTracker allocTracker[];
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
size_t granularity = 0;
|
||||
CUdevice currentDev;
|
||||
CUmemAllocationProp prop = {};
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
CUmemGenericAllocationHandle handle;
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(¤tDev, cudaDev));
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
|
||||
prop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
|
||||
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||
ALIGN_SIZE(size, granularity);
|
||||
/* Allocate the physical memory on the device */
|
||||
CUCHECK(cuMemCreate(&handle, size, &prop, 0));
|
||||
/* Reserve a virtual address range */
|
||||
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
|
||||
/* Map the virtual address range to the physical allocation */
|
||||
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
|
||||
/* Now allow RW access to the newly mapped memory */
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = currentDev;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
|
||||
if (handlep) *handlep = handle;
|
||||
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
if (ptr == NULL) return ncclSuccess;
|
||||
ncclResult_t result = ncclSuccess;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
size_t size = 0;
|
||||
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
|
||||
CUCHECK(cuMemRelease(handle));
|
||||
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
|
||||
TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
|
||||
CUCHECK(cuMemRelease(handle));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
|
||||
return result;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
extern int ncclCuMemEnable();
|
||||
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
return ncclInternalError;
|
||||
}
|
||||
static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
@@ -193,8 +265,13 @@ template <typename T>
|
||||
ncclResult_t ncclCudaFree(T* ptr) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaFree(ptr), result, finish);
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
|
||||
} else {
|
||||
CUDACHECKGOTO(cudaFree(ptr), result, finish);
|
||||
}
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
|
||||
@@ -20,6 +20,7 @@ ncclResult_t bootstrapNetInit();
|
||||
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
|
||||
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
|
||||
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
|
||||
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
|
||||
|
||||
@@ -9,7 +9,9 @@
|
||||
#include "comm.h"
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
|
||||
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
|
||||
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
|
||||
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
|
||||
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
|
||||
int peerNode = comm->rankToNode[peer];
|
||||
|
||||
+29
-29
@@ -18,11 +18,11 @@
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUDACHECKGOTO(cmd, res, label) do { \
|
||||
#define CUDACHECKGOTO(cmd, RES, label) do { \
|
||||
cudaError_t err = cmd; \
|
||||
if( err != cudaSuccess ) { \
|
||||
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
RES = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
} while(false)
|
||||
@@ -60,11 +60,11 @@
|
||||
} \
|
||||
} while(true)
|
||||
|
||||
#define SYSCHECKGOTO(statement, res, label) do { \
|
||||
#define SYSCHECKGOTO(statement, RES, label) do { \
|
||||
if ((statement) == -1) { \
|
||||
/* Print the back trace*/ \
|
||||
res = ncclSystemError; \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
RES = ncclSystemError; \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
@@ -72,16 +72,16 @@
|
||||
#define NEQCHECK(statement, value) do { \
|
||||
if ((statement) != value) { \
|
||||
/* Print the back trace*/ \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define NEQCHECKGOTO(statement, value, res, label) do { \
|
||||
#define NEQCHECKGOTO(statement, value, RES, label) do { \
|
||||
if ((statement) != value) { \
|
||||
/* Print the back trace*/ \
|
||||
res = ncclSystemError; \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
RES = ncclSystemError; \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
@@ -89,57 +89,57 @@
|
||||
#define EQCHECK(statement, value) do { \
|
||||
if ((statement) == value) { \
|
||||
/* Print the back trace*/ \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define EQCHECKGOTO(statement, value, res, label) do { \
|
||||
#define EQCHECKGOTO(statement, value, RES, label) do { \
|
||||
if ((statement) == value) { \
|
||||
/* Print the back trace*/ \
|
||||
res = ncclSystemError; \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
RES = ncclSystemError; \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
// Propagate errors up
|
||||
#define NCCLCHECK(call) do { \
|
||||
ncclResult_t res = call; \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
ncclResult_t RES = call; \
|
||||
if (RES != ncclSuccess && RES != ncclInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
return RES; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define NCCLCHECKGOTO(call, res, label) do { \
|
||||
res = call; \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
#define NCCLCHECKGOTO(call, RES, label) do { \
|
||||
RES = call; \
|
||||
if (RES != ncclSuccess && RES != ncclInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
|
||||
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
ncclResult_t res = call; \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
ncclResult_t RES = call; \
|
||||
if (RES != ncclSuccess && RES != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
return ncclInternalError; \
|
||||
} \
|
||||
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
|
||||
} while (!(cond));
|
||||
|
||||
#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
|
||||
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
|
||||
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
res = call; \
|
||||
if (res != ncclSuccess && res != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
RES = call; \
|
||||
if (RES != ncclSuccess && RES != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
goto label; \
|
||||
} \
|
||||
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
|
||||
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
|
||||
} while (!(cond));
|
||||
|
||||
#define NCCLCHECKTHREAD(a, args) do { \
|
||||
|
||||
@@ -63,11 +63,12 @@ struct ncclDevRedOpFull {
|
||||
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type))
|
||||
|
||||
#define DECL3(func, devredop, type, undef) \
|
||||
DECL4(func, RING, devredop, type, undef) \
|
||||
DECL4(func, TREE, devredop, type, undef) \
|
||||
DECL4(func, RING, devredop, type, undef) \
|
||||
DECL4(func, TREE, devredop, type, undef) \
|
||||
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
|
||||
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
|
||||
DECL4(func, NVLS, devredop, type, undef)
|
||||
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
|
||||
DECL4(func, NVLS, devredop, type, undef) \
|
||||
DECL4(func, NVLS_TREE, devredop, type, undef)
|
||||
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
#define DECL2(func, devredop, undefForFloat) \
|
||||
|
||||
+55
-20
@@ -101,19 +101,51 @@ struct ncclCommCallback {
|
||||
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
|
||||
};
|
||||
|
||||
struct ncclSharedResources {
|
||||
int refCount;
|
||||
struct ncclComm* owner; /* comm which creates this shared res. */
|
||||
struct ncclChannelPeer* peers[MAXCHANNELS];
|
||||
struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
|
||||
/* P2P operation counter, one per channel */
|
||||
uint64_t p2pOpCount[MAXCHANNELS];
|
||||
/* Collective operation counter */
|
||||
uint64_t collOpCount;
|
||||
int tpNRanks;
|
||||
int tpNLocalRanks;
|
||||
int tpNChannels;
|
||||
int tpP2pNChannels;
|
||||
int tpP2pChunkSize;
|
||||
uint64_t magic;
|
||||
|
||||
// top parent rank to localRank translation table
|
||||
int* tpRankToLocalRank;
|
||||
// Internal streams
|
||||
struct ncclStrongStream deviceStream, hostStream;
|
||||
|
||||
/* proxy related shared res */
|
||||
struct ncclProxyState* proxyState;
|
||||
};
|
||||
|
||||
struct ncclChannel {
|
||||
struct ncclChannelPeer* peers;
|
||||
struct ncclDevChannelPeer* devPeers;
|
||||
struct ncclChannelPeer** peers;
|
||||
struct ncclDevChannelPeer** devPeers;
|
||||
struct ncclRing ring;
|
||||
int* devRingUserRanks;
|
||||
struct ncclTree tree;
|
||||
|
||||
struct ncclTree collnetChain;
|
||||
struct ncclDirect collnetDirect;
|
||||
struct ncclTree binTree;
|
||||
struct ncclNvls nvls;
|
||||
|
||||
int id; // index of this channel
|
||||
uint32_t workFifoSent; // last used work index+1
|
||||
uint64_t p2pOpCount;
|
||||
|
||||
/* comm split sharable resources */
|
||||
struct ncclChannelPeer* collnetPeers;
|
||||
struct ncclDevChannelPeer* collnetDevPeers;
|
||||
struct ncclChannelPeer* nvlsPeers;
|
||||
struct ncclDevChannelPeer* nvlsDevPeers;
|
||||
};
|
||||
|
||||
struct ncclWorkList {
|
||||
@@ -167,6 +199,10 @@ struct ncclComm {
|
||||
// List of destructors to run when comm is destructed
|
||||
struct ncclDestructor* destructorHead;
|
||||
|
||||
struct ncclSharedResources* sharedRes;
|
||||
/* map to top parent ranks. */
|
||||
int* topParentRanks;
|
||||
int* topParentLocalRanks;
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
struct ncclPeerInfo* peerInfo;
|
||||
struct ncclTopoSystem* topo;
|
||||
@@ -180,15 +216,16 @@ struct ncclComm {
|
||||
|
||||
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
|
||||
uint64_t commHash;
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
int cudaDev; // my cuda device index
|
||||
//int nvmlDev; // my nvml device index
|
||||
int compCap; // compute capability of the GPU
|
||||
int minCompCap; // min compute capability in the communicator
|
||||
int minCompCap, maxCompCap; // min/max compute capability in the communicator
|
||||
int64_t busId; // my PCI bus ID in int format
|
||||
cpu_set_t cpuAffinity; // CPU affinity of the GPU
|
||||
int WarpSize;
|
||||
int virtualId;
|
||||
int cudaArch; // matches __CUDA_ARCH__ of device
|
||||
|
||||
int node;
|
||||
@@ -207,12 +244,11 @@ struct ncclComm {
|
||||
|
||||
// Counter for tracking CUDA launches (P2P and collectives included)
|
||||
uint64_t opCount;
|
||||
// Collective operation counter
|
||||
uint64_t collOpCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
int nvlsChannels;
|
||||
int collNetChannels;
|
||||
// Channels (per peer) for p2p
|
||||
int p2pnChannels;
|
||||
int p2pnChannelsPerPeer;
|
||||
@@ -237,6 +273,8 @@ struct ncclComm {
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
volatile uint32_t *childAbortFlag;
|
||||
uint32_t *abortFlagRefCount;
|
||||
|
||||
// Flags for enable P2P NET
|
||||
uint32_t p2pNet;
|
||||
@@ -268,21 +306,24 @@ struct ncclComm {
|
||||
char intraPad2[64 - sizeof(uint64_t)];
|
||||
uint64_t intraBarrierGate; // only used if this is intraComm0
|
||||
|
||||
struct ncclProxyState proxyState;
|
||||
|
||||
struct ncclProxyState* proxyState;
|
||||
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
|
||||
int intraHighestTransportType;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
/* sharable collNet proxy progress resource. */
|
||||
struct ncclCollNetSharedRes* collNetSharedRes;
|
||||
|
||||
// NVLink SHARP (NVLS) support
|
||||
int nvlsSupport;
|
||||
void* nvlsResources;
|
||||
/* sharable NVLS resource. */
|
||||
struct ncclNvlsSharedRes* nvlsResources;
|
||||
|
||||
size_t channelSize; // User requested work size (bytes) for channel partitions
|
||||
|
||||
// Internal streams
|
||||
struct ncclStrongStream deviceStream, hostStream;
|
||||
|
||||
// pools backed by comm->memPermanent
|
||||
struct ncclMemoryPool memPool_ncclProxyOp;
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
@@ -319,13 +360,7 @@ struct ncclComm {
|
||||
volatile bool collTraceExit;
|
||||
#endif
|
||||
|
||||
// communicator mode
|
||||
int blocking;
|
||||
// CGA cluster size
|
||||
int cgaClusterSize;
|
||||
int minCTAs, maxCTAs;
|
||||
// network interface name
|
||||
char *netName;
|
||||
ncclConfig_t config;
|
||||
// initState is to more conveniently reclaim resources when errors happen.
|
||||
ncclResult_t initState;
|
||||
// flag to indicate if ncclCommFinalize() is called
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include "checks.h"
|
||||
|
||||
// Is cuMem API usage enabled
|
||||
extern int ncclCuMemEnable();
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cudaTypedefs.h>
|
||||
#else
|
||||
@@ -85,6 +88,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
|
||||
#if CUDA_VERSION >= 11070
|
||||
|
||||
@@ -21,12 +21,13 @@
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
|
||||
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
|
||||
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
#define NCCL_ALGO_COLLNET_DIRECT 2
|
||||
#define NCCL_ALGO_COLLNET_CHAIN 3
|
||||
#define NCCL_ALGO_NVLS 4
|
||||
#define NCCL_ALGO_NVLS_TREE 5
|
||||
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
@@ -112,10 +113,10 @@ struct ncclConnInfo {
|
||||
};
|
||||
|
||||
struct ncclProxyConnector {
|
||||
int rank;
|
||||
int localRank;
|
||||
int tpRank;
|
||||
int tpLocalRank;
|
||||
int sameProcess;
|
||||
struct ncclProxyConnection* connection;
|
||||
struct ncclComm* comm;
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
@@ -124,7 +125,6 @@ struct ncclConnector {
|
||||
struct ncclTransportComm* transportComm;
|
||||
void* transportResources;
|
||||
struct ncclConnInfo conn;
|
||||
struct ncclComm *comm;
|
||||
};
|
||||
|
||||
struct ncclRing {
|
||||
@@ -141,6 +141,9 @@ struct ncclRing {
|
||||
};
|
||||
|
||||
|
||||
// The root of each tree only has one node down (+1 intra-node).
|
||||
#define NCCL_MAX_TREE_ARITY_TOP 2
|
||||
// Nodes inside the binary tree can have to two nodes down (+1 intra-node).
|
||||
#define NCCL_MAX_TREE_ARITY 3
|
||||
struct ncclTree {
|
||||
int depth;
|
||||
@@ -161,18 +164,24 @@ struct ncclDirect {
|
||||
|
||||
#define NCCL_CONN_IDX_P2P_NET 2
|
||||
#define NCCL_MAX_NVLS_ARITY 8
|
||||
#define NCCL_MAX_NVLS_TREE_ARITY 3
|
||||
struct ncclNvls {
|
||||
int out;
|
||||
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
|
||||
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
|
||||
int up[NCCL_MAX_NVLS_ARITY];
|
||||
int down;
|
||||
int treeUp;
|
||||
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
|
||||
int node;
|
||||
int nNodes;
|
||||
};
|
||||
|
||||
#define NCCL_MAX_CONNS 3
|
||||
struct ncclChannelPeer {
|
||||
struct ncclConnector send[NCCL_MAX_CONNS];
|
||||
struct ncclConnector recv[NCCL_MAX_CONNS];
|
||||
int refCount;
|
||||
};
|
||||
|
||||
struct ncclDevComm;
|
||||
@@ -362,7 +371,7 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
|
||||
#endif
|
||||
|
||||
struct alignas(16) ncclDevChannel {
|
||||
struct ncclDevChannelPeer *peers;
|
||||
struct ncclDevChannelPeer** peers;
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree collnetChain;
|
||||
|
||||
@@ -298,7 +298,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
||||
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
|
||||
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
|
||||
CUDACHECK(cudaFree(md->gdrDevMem));
|
||||
NCCLCHECK(ncclCudaFree(md->gdrDevMem));
|
||||
free(md);
|
||||
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -59,9 +59,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#define NCCL_TOPO_CPU_TYPE_ROME 4
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
|
||||
@@ -72,6 +74,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
|
||||
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
|
||||
#define NCCL_TOPO_PATTERN_RING 4 // Ring
|
||||
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
|
||||
struct ncclTopoGraph {
|
||||
// Input / output
|
||||
int id; // ring : 0, tree : 1, collnet : 2
|
||||
@@ -108,18 +111,16 @@ struct ncclTopoRanks {
|
||||
int treeToParent[MAXCHANNELS];
|
||||
int treeToChild0[MAXCHANNELS];
|
||||
int treeToChild1[MAXCHANNELS];
|
||||
int nvlsHeads[MAXCHANNELS];
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
|
||||
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc);
|
||||
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
|
||||
#include "info.h"
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ struct ncclAsyncJob {
|
||||
void(*destructor)(void*);
|
||||
ncclGroupJobState_t state;
|
||||
volatile uint32_t *abortFlag; /* point to comm abortFlag */
|
||||
volatile uint32_t *childAbortFlag; /* point to child abortFlag */
|
||||
ncclComm_t comm;
|
||||
};
|
||||
|
||||
@@ -67,6 +68,24 @@ extern __thread ncclResult_t ncclGroupError;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead;
|
||||
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
|
||||
extern __thread int ncclGroupBlocking;
|
||||
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
|
||||
extern __thread struct ncclGroupJob ncclGroupJobMain;
|
||||
|
||||
static inline void groupResetJobState() {
|
||||
ncclGroupBlocking = -1;
|
||||
ncclGroupJobMainPtr = NULL;
|
||||
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
|
||||
return;
|
||||
}
|
||||
|
||||
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (job) {
|
||||
ret = ncclAsyncJobComplete(&job->base);
|
||||
groupResetJobState();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
|
||||
if (ncclGroupDepth > 0) {
|
||||
@@ -91,7 +110,7 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
}
|
||||
|
||||
ncclGroupBlocking = comm->blocking;
|
||||
ncclGroupBlocking = comm->config.blocking;
|
||||
}
|
||||
|
||||
// Add comm to this thread's group needing preconnect
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -0,0 +1,44 @@
|
||||
#ifndef NCCL_IBV_SYMBOLS_H_
|
||||
#define NCCL_IBV_SYMBOLS_H_
|
||||
|
||||
#ifdef NCCL_BUILD_RDMA_CORE
|
||||
#include <infiniband/verbs.h>
|
||||
#else
|
||||
#include "ibvcore.h"
|
||||
#endif
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
/* IB Verbs Function Pointers*/
|
||||
struct ncclIbvSymbols {
|
||||
int (*ibv_internal_fork_init)(void);
|
||||
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
|
||||
void (*ibv_internal_free_device_list)(struct ibv_device **list);
|
||||
const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
|
||||
struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
|
||||
int (*ibv_internal_close_device)(struct ibv_context *context);
|
||||
int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
|
||||
void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
|
||||
int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
|
||||
int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
|
||||
int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
|
||||
int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
|
||||
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
|
||||
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
|
||||
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
|
||||
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
|
||||
/* DMA-BUF support */
|
||||
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
|
||||
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
|
||||
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
|
||||
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
|
||||
struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
|
||||
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
|
||||
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
|
||||
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
|
||||
};
|
||||
|
||||
/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
|
||||
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
|
||||
|
||||
#endif // NCCL_IBV_SYMBOLS_H_
|
||||
+7
-1031
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -26,6 +26,7 @@ typedef enum : uint8_t {
|
||||
ncclPatternCollnetChain,
|
||||
ncclPatternCollnetDirect,
|
||||
ncclPatternNvls,
|
||||
ncclPatternNvlsTree,
|
||||
ncclPatternSend,
|
||||
ncclPatternRecv
|
||||
} ncclPattern_t;
|
||||
@@ -94,7 +95,6 @@ struct ncclCudaStreamList {
|
||||
struct ncclCudaStreamList *next;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
|
||||
struct ncclTasks {
|
||||
struct Peer {
|
||||
bool sendSeen, recvSeen;
|
||||
@@ -104,7 +104,8 @@ struct ncclTasks {
|
||||
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
|
||||
size_t collBytesTotal;
|
||||
struct Peer* peers/*[nRanks]*/;
|
||||
int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
|
||||
int *p2pSendOrder, *p2pRecvOrder;
|
||||
int p2pOrderSteps;
|
||||
int nTasksColl, nTasksP2p;
|
||||
|
||||
// The list of user streams aggregated over all tasks present.
|
||||
|
||||
@@ -18,25 +18,6 @@ ncclResult_t ncclNetPluginInit();
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm);
|
||||
int ncclNetVersion(struct ncclComm* comm);
|
||||
|
||||
// Translation to external API
|
||||
static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
|
||||
static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
/* DMA-BUF support */
|
||||
static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
// Test whether the current GPU support GPU Direct RDMA.
|
||||
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@
|
||||
* Systems:
|
||||
*
|
||||
* \image html
|
||||
* https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
|
||||
* https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png
|
||||
*
|
||||
* Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
|
||||
* ranges to your code that automatically use the name of the enclosing function
|
||||
@@ -561,18 +561,27 @@
|
||||
|
||||
/* Temporary helper #defines, removed with #undef at end of header */
|
||||
|
||||
#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1914
|
||||
/* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser
|
||||
* that does not work with domain::get's specialization for domain::global,
|
||||
* and would require extra conditions to make SFINAE work for the overloaded
|
||||
* get() functions. This macro disables use of overloaded get() in order to
|
||||
* work with VS2015 and versions of VS2017 below 15.7, without penalizing
|
||||
* users of newer compilers. Building with this flag set to 0 means errors
|
||||
* when defining tag structs (see documentation for domain, named_category,
|
||||
* and registered_string) will have more complex compiler error messages
|
||||
* instead of the clear static_assert messages from the get() overloads.
|
||||
/* Some compilers do not correctly support SFINAE, which is used in this API
|
||||
* to detect common usage errors and provide clearer error messages (by using
|
||||
* static_assert) than the compiler would produce otherwise. These compilers
|
||||
* will generate errors while compiling this file such as:
|
||||
*
|
||||
* error: ‘name’ is not a member of ‘nvtx3::v1::domain::global’
|
||||
*
|
||||
* The following compiler versions are known to have this problem, and so are
|
||||
* set by default to disable the SFINAE-based checks:
|
||||
*
|
||||
* - All MSVC versions prior to VS2017 Update 7 (15.7)
|
||||
* - GCC 8.1-8.3 (the problem was fixed in GCC 8.4)
|
||||
*
|
||||
* If you find your compiler hits this problem, you can work around it by
|
||||
* defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this
|
||||
* header, or you can add a check for your compiler version to this #if.
|
||||
* Also, please report the issue on the NVTX github page.
|
||||
*/
|
||||
#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1914 \
|
||||
|| defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4
|
||||
#define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0
|
||||
#else
|
||||
#define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1
|
||||
|
||||
@@ -1,30 +1,33 @@
|
||||
/*
|
||||
* Copyright 2021-2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Helper array to get the alignment for each predefined C language type.
|
||||
*/
|
||||
|
||||
typedef void* pointer_type;
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L /* or CPP11 */
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
|
||||
#include <uchar.h>
|
||||
#include <stdalign.h>
|
||||
#endif
|
||||
|
||||
/* `alignof` is available as of C11 or C++11 */
|
||||
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
|
||||
|
||||
#define nvtx_alignof(type) alignof(type)
|
||||
#define nvtx_alignof2(type,tname) alignof(type)
|
||||
#else /* __STDC_VERSION__ >= 201112L */
|
||||
#ifndef __cplusplus
|
||||
|
||||
#include <stddef.h>
|
||||
#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
|
||||
#define nvtx_alignof2(type,tname) nvtx_alignof(type)
|
||||
#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */
|
||||
|
||||
#else /* __cplusplus */
|
||||
|
||||
#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
|
||||
#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
|
||||
#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
|
||||
#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
|
||||
/* Create helper structs to determine type alignment. */
|
||||
#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type
|
||||
#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname
|
||||
|
||||
MKTYPEDEF(char);
|
||||
MKTYPEDEF2(unsigned char, uchar);
|
||||
@@ -54,22 +57,33 @@ MKTYPEDEF(size_t);
|
||||
MKTYPEDEF(pointer_type);
|
||||
|
||||
MKTYPEDEF(wchar_t);
|
||||
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
|
||||
{sizeof(char8_t), nvtx_alignof(char8_t)},
|
||||
|
||||
/* `char8_t` is available as of C++20 or C23 */
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
|
||||
MKTYPEDEF(char8_t);
|
||||
#endif
|
||||
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
|
||||
|
||||
/* `char16_t` and `char32_t` are available as of C++11 or C11 */
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
|
||||
MKTYPEDEF(char16_t);
|
||||
MKTYPEDEF(char32_t);
|
||||
#endif
|
||||
|
||||
/* C requires to include stddef.h to use `offsetof` */
|
||||
#ifndef __cplusplus
|
||||
#include <stddef.h>
|
||||
#endif
|
||||
|
||||
#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d)
|
||||
#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d)
|
||||
|
||||
#endif /* __STDC_VERSION__ >= 201112L */
|
||||
|
||||
#undef MKTYPEDEF
|
||||
#undef MKTYPEDEF2
|
||||
|
||||
#endif /* __cplusplus */
|
||||
#endif /* __STDC_VERSION__ >= 201112L */
|
||||
|
||||
/*
|
||||
* Helper array to get the alignment for each predefined C/C++ language type.
|
||||
* The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`.
|
||||
*/
|
||||
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
|
||||
@@ -109,13 +123,14 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
|
||||
|
||||
/*** Special character types ***/
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
|
||||
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
|
||||
{sizeof(char8_t), nvtx_alignof(char8_t)},
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)},
|
||||
#else
|
||||
{0, 0},
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0},
|
||||
#endif
|
||||
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
|
||||
#else
|
||||
@@ -125,4 +140,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
|
||||
};
|
||||
|
||||
#undef nvtx_alignof
|
||||
#undef nvtx_alignof2
|
||||
#undef nvtx_alignof2
|
||||
|
||||
@@ -9,4 +9,21 @@
|
||||
#ifndef NCCL_P2P_H_
|
||||
#define NCCL_P2P_H_
|
||||
|
||||
#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
|
||||
|
||||
typedef struct {
|
||||
int data; // Currently only support an fd based descriptor
|
||||
} ncclCuDesc;
|
||||
|
||||
typedef union {
|
||||
// Legacy CUDA IPC
|
||||
cudaIpcMemHandle_t devIpc;
|
||||
// cuMem API support
|
||||
ncclCuDesc cuDesc;
|
||||
} ncclIpcDesc;
|
||||
|
||||
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
|
||||
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
|
||||
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
|
||||
|
||||
#endif
|
||||
|
||||
+34
-21
@@ -15,11 +15,13 @@
|
||||
#include "ipcsocket.h"
|
||||
#include <pthread.h>
|
||||
#include "shm.h"
|
||||
#include "p2p.h"
|
||||
|
||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
|
||||
struct ncclProxyArgs;
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
|
||||
|
||||
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
|
||||
@@ -130,18 +132,11 @@ struct ncclProxySharedP2p {
|
||||
int size;
|
||||
char* cudaBuff;
|
||||
char* hostBuff;
|
||||
cudaIpcMemHandle_t ipc;
|
||||
// CUDA IPC
|
||||
ncclIpcDesc ipcDesc;
|
||||
struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
|
||||
};
|
||||
|
||||
struct ncclProxySharedCollNet {
|
||||
int size;
|
||||
char* cudaBuff;
|
||||
char* hostBuff;
|
||||
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
|
||||
void* resources;
|
||||
};
|
||||
|
||||
struct ncclProxyPeer {
|
||||
struct ncclProxySharedP2p send;
|
||||
struct ncclProxySharedP2p recv;
|
||||
@@ -165,7 +160,6 @@ struct ncclProxyProgressState {
|
||||
bool stop;
|
||||
struct ncclProxyPeer** localPeers;
|
||||
struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
|
||||
struct ncclProxySharedCollNet collNet;
|
||||
struct ncclProxyArgs* active;
|
||||
struct ncclProxyArgs* pool;
|
||||
struct ncclProxyPool* pools;
|
||||
@@ -192,12 +186,27 @@ struct ncclProxyAsyncOp {
|
||||
|
||||
struct ncclProxyLocalPeer {
|
||||
struct ncclSocket sock;
|
||||
int localRank;
|
||||
int tpRank;
|
||||
int tpLocalRank;
|
||||
ncclProxyAsyncOp* asyncOps;
|
||||
int asyncOpCounter;
|
||||
};
|
||||
|
||||
struct ncclProxyState {
|
||||
int refCount;
|
||||
int tpRank;
|
||||
int tpnRanks;
|
||||
int tpLocalnRanks;
|
||||
int cudaDev;
|
||||
int p2pnChannels;
|
||||
int p2pChunkSize;
|
||||
int nChannels;
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
bool allocP2pNetLLBuffers;
|
||||
bool dmaBufSupport;
|
||||
ncclNet_t* ncclNet;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
volatile uint32_t* abortFlag;
|
||||
// Service thread
|
||||
pthread_t thread;
|
||||
struct ncclSocket* listenSock;
|
||||
@@ -209,6 +218,7 @@ struct ncclProxyState {
|
||||
struct ncclSocket* peerSocks;
|
||||
struct ncclProxyOps* proxyOps;
|
||||
void** sharedDevMems;
|
||||
struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
|
||||
|
||||
// Progress thread
|
||||
struct ncclProxyProgressState progressState;
|
||||
@@ -228,13 +238,14 @@ enum proxyConnectState {
|
||||
|
||||
struct ncclProxyConnection {
|
||||
int send, transport, shared;
|
||||
int localRank;
|
||||
int tpLocalRank, sameProcess;
|
||||
struct ncclSocket* sock;
|
||||
struct ncclTransportComm* tcomm;
|
||||
struct ncclProxyArgs *proxyAppend;
|
||||
struct ncclProxyArgs **proxyAppendPtr;
|
||||
void* transportResources;
|
||||
proxyConnectState state;
|
||||
struct ncclCollNetSharedRes* collNet;
|
||||
};
|
||||
|
||||
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
||||
@@ -250,7 +261,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* prox
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
|
||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
|
||||
enum ncclProxyMsgType {
|
||||
ncclProxyMsgInit = 1,
|
||||
ncclProxyMsgSharedInit = 2,
|
||||
@@ -260,22 +271,24 @@ enum ncclProxyMsgType {
|
||||
ncclProxyMsgClose = 6,
|
||||
ncclProxyMsgAbort = 7,
|
||||
ncclProxyMsgStop = 8,
|
||||
ncclProxyMsgConvertFd = 9 // cuMem API support
|
||||
ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
|
||||
};
|
||||
|
||||
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
|
||||
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
|
||||
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
|
||||
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
|
||||
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
|
||||
|
||||
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
|
||||
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
|
||||
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
|
||||
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
|
||||
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
|
||||
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
|
||||
|
||||
ncclResult_t ncclProxyStop(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
|
||||
ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -36,7 +36,6 @@ struct ncclComm;
|
||||
struct ncclPeerInfo {
|
||||
int rank;
|
||||
int cudaDev;
|
||||
int netDev;
|
||||
int gdrSupport;
|
||||
bool hasFineGrain;
|
||||
uint64_t hostHash;
|
||||
@@ -45,7 +44,6 @@ struct ncclPeerInfo {
|
||||
int64_t busId;
|
||||
struct ncclComm* comm;
|
||||
int cudaCompCap;
|
||||
int virtualId;
|
||||
};
|
||||
|
||||
#define CONNECT_SIZE 128
|
||||
@@ -53,15 +51,46 @@ struct ncclConnect {
|
||||
char data[CONNECT_SIZE];
|
||||
};
|
||||
|
||||
#if CUDART_VERSION >= 12010
|
||||
|
||||
#define NVLS_HANDLE_SIZE 64
|
||||
struct ncclNvlsSharedRes {
|
||||
int refCount;
|
||||
CUmulticastObjectProp properties;
|
||||
CUmemAccessDesc accessDesc;
|
||||
int dev;
|
||||
size_t size;
|
||||
size_t granularity;
|
||||
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
|
||||
char* mcBuff; // Multicast NVLS buffer address
|
||||
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
|
||||
char* ucBuff; // Unicast NVLS buffer address
|
||||
char shareableHandle[NVLS_HANDLE_SIZE];
|
||||
int nChannels;
|
||||
};
|
||||
|
||||
#endif /* CUDART_VERSION >= 12010 */
|
||||
|
||||
struct ncclCollNetSharedRes {
|
||||
int refCount;
|
||||
int size;
|
||||
char* cudaBuff;
|
||||
char* hostBuff;
|
||||
struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
|
||||
void* resources;
|
||||
int nChannels;
|
||||
size_t buffSize;
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
|
||||
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
|
||||
ncclResult_t (*free)(struct ncclConnector*);
|
||||
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
|
||||
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
|
||||
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
|
||||
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
|
||||
ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
|
||||
ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels);
|
||||
ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
|
||||
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
|
||||
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
|
||||
ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
|
||||
};
|
||||
|
||||
struct ncclTransport {
|
||||
@@ -74,10 +103,9 @@ struct ncclTransport {
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
|
||||
|
||||
#if CUDART_VERSION >= 12010
|
||||
ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
|
||||
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
|
||||
#endif
|
||||
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user