/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#ifndef NCCL_PROXY_H_
|
|
#define NCCL_PROXY_H_
|
|
|
|
#include "device.h"
|
|
#include "info.h"
|
|
#include "socket.h"
|
|
#include "ipcsocket.h"
|
|
#include "nccl_net.h"
|
|
#include <pthread.h>
|
|
#include "shmutils.h"
|
|
#include "p2p.h"
|
|
#include "collectives.h"
|
|
#include "proxy_trace/proxy_trace.h"
|
|
|
|
// Pattern identifiers with a one-byte underlying type (uint8_t).
// Values are implicit and sequential starting at 0, so the order of the
// enumerators is significant. The `pattern` fields of ncclProxyOp and
// ncclProxyArgs are annotated as carrying this type in a uint8_t.
typedef enum : uint8_t {
  ncclPatternRing,
  ncclPatternRingTwice,
  ncclPatternPipelineFrom,
  ncclPatternPipelineTo,
  ncclPatternTreeUp,
  ncclPatternTreeDown,
  ncclPatternTreeUpDown,
  ncclPatternCollnetChain,
  ncclPatternCollnetDirect,
  ncclPatternNvls,
  ncclPatternNvlsTree,
  ncclPatternPatUp,
  ncclPatternPatDown,
  ncclPatternSend,
  ncclPatternRecv,
  ncclPatternProfiler,
} ncclPattern_t;
|
|
|
|
// State of a proxy op; values are implicit (0, 1, 2).
// NOTE(review): meanings are inferred from the enumerator names only
// (none / ready / in progress) -- confirm against the users in proxy.cc.
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
|
// Direction selectors: receive is 0, send is 1.
enum { proxyRecv=0, proxySend=1 };
|
|
|
|
struct ncclProxyArgs;
|
|
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
|
|
|
|
// Capacity of the ncclProxyArgs::subs array; equal to MAXCHANNELS.
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
|
|
// Compile-time guard: twice the per-batch p2p work limit must fit within
// MAXCHANNELS (which is also the value of NCCL_PROXY_MAX_SUBS).
static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements");
|
|
|
|
// Extra per-op parameters, embedded as `specifics` in both ncclProxyOp and
// ncclProxyArgs. Only one variant (collnetDirect) is defined here.
union ncclProxyOpSpecifics {
  struct {
    size_t sizePerRank;
    int nNodes, node;
  } collnetDirect;
};
|
|
|
|
struct ncclProxyOp {
|
|
struct ncclProxyConnection* connection;
|
|
ssize_t nbytes;
|
|
uint64_t opCount;
|
|
int root:30;
|
|
uint32_t connIndex:2;
|
|
int next;
|
|
int nsteps;
|
|
size_t chunkSize;
|
|
size_t sliceSize;
|
|
size_t loopSize;
|
|
size_t loopOffset;
|
|
size_t channelSize;
|
|
uint8_t sliceSteps;
|
|
uint8_t chunkSteps;
|
|
uint8_t channelId;
|
|
uint8_t /*ncclDataType_t*/ dtype;
|
|
uint8_t /*ncclDevRedOp_t*/ redOp;
|
|
uint8_t /*ncclFunc_t*/ coll;
|
|
uint8_t /*ncclPattern_t*/ pattern;
|
|
uint8_t protocol;
|
|
uint8_t algorithm;
|
|
uint8_t reg;
|
|
// collnet/p2p/coll buffer reg handles
|
|
void* sendMhandle;
|
|
void* recvMhandle;
|
|
uint8_t* sendbuff;
|
|
uint8_t* recvbuff;
|
|
int isOneRPN;
|
|
RingAlgorithm *ringAlgo;
|
|
int nextRank;
|
|
int prevRank;
|
|
union ncclProxyOpSpecifics specifics;
|
|
|
|
// Profiler plugin
|
|
union {
|
|
struct ncclTaskColl* coll;
|
|
struct ncclTaskP2p* p2p;
|
|
} task;
|
|
|
|
// Profiler work counter increment flag. Set to 'true' if the profiler work counter for this channel needs increment.
|
|
// Always 'true' for collective operations. Grouped p2p operations are fused into one <send, recv> pair in the GPU kernel,
|
|
// meaning the GPU profiler code increments the work counter for the pair rather than the individual p2p. For this
|
|
// reason, the incWorkCounter flag is used to avoid incrementing the work counter twice in the host code. This is done
|
|
// by setting incWorkCounter to 'true' only for one of the p2ps in the pair during enqueue.
|
|
bool incWorkCounter;
|
|
int eActivationMask;
|
|
void* taskEventHandle;
|
|
int rank;
|
|
int peer;
|
|
pid_t pid;
|
|
void* profilerContext;
|
|
uint64_t workCounter;
|
|
|
|
struct ncclProxyOp *enqNext;
|
|
|
|
// Used to track total real bytes of this op
|
|
uint32_t totalBytes;
|
|
// Used to fetch/update the proxyOp in ProxyTrace map
|
|
facebook_rccl::ProxyTraceRecordKey traceKey;
|
|
facebook_rccl::ProxyTraceExtraInfo traceInfo;
|
|
};
|
|
|
|
// Forward declaration: ncclProxyEventHandle points back at its sub-args,
// and ncclProxySubArgs in turn embeds an array of these handles.
struct ncclProxySubArgs;

// Pairs an opaque step event handle with a pointer to a ncclProxySubArgs.
// Used as the element type of ncclProxySubArgs::pHandles.
struct ncclProxyEventHandle {
  void* stepEventHandle;
  struct ncclProxySubArgs* subArgPtr;
};
|
|
|
|
struct ncclProxySubArgs {
|
|
struct ncclProxyConnection* connection;
|
|
int reg;
|
|
// collnet handles
|
|
void* sendMhandle;
|
|
void* recvMhandle;
|
|
uint8_t* sendbuff;
|
|
uint8_t* recvbuff;
|
|
size_t offset;
|
|
ssize_t loopSize;
|
|
ssize_t loopOffset;
|
|
int channelId;
|
|
int nsteps;
|
|
ssize_t nbytes;
|
|
ssize_t chunkSize;
|
|
int peer;
|
|
int isOneRPN;
|
|
RingAlgorithm *ringAlgo;
|
|
int groupSize; // Number of consecutive sub operations sharing the same recvComm
|
|
uint64_t base;
|
|
uint64_t posted;
|
|
uint64_t received;
|
|
uint64_t flushed;
|
|
uint64_t transmitted;
|
|
uint64_t done;
|
|
uint64_t end;
|
|
int regBufferReady;
|
|
void* requests[NCCL_STEPS];
|
|
|
|
// Profiler plugin
|
|
int eActivationMask;
|
|
int rank;
|
|
pid_t pid;
|
|
void* profilerContext;
|
|
void* taskEventHandle;
|
|
void* opEventHandle;
|
|
void* kernelEventHandle;
|
|
struct ncclProxyEventHandle pHandles[NCCL_STEPS];
|
|
size_t transSize;
|
|
uint64_t workCounter;
|
|
|
|
void* recvRequestsCache[NCCL_STEPS];
|
|
int recvRequestsSubCount;
|
|
|
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
|
int npKitSizesFifo[NCCL_STEPS];
|
|
uint64_t timestamp[NCCL_STEPS];
|
|
#endif
|
|
|
|
// Used to fetch/update the proxyOp in ProxyTrace map
|
|
facebook_rccl::ProxyTraceRecordKey traceKey;
|
|
facebook_rccl::ProxyTraceExtraInfo traceInfo;
|
|
};
|
|
|
|
struct ncclProxyArgs {
|
|
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
|
|
proxyProgressFunc_t progress;
|
|
int nsubs;
|
|
int done;
|
|
int onePPN;
|
|
uint64_t opCount;
|
|
int sliceSteps;
|
|
int chunkSteps;
|
|
size_t chunkSize;
|
|
size_t totalSendSize;
|
|
size_t totalRecvSize;
|
|
size_t sendSizePerRound;
|
|
size_t recvSizePerRound;
|
|
uint8_t /*ncclDataType_t*/ dtype;
|
|
uint8_t /*ncclDevRedOp_t*/ redOp;
|
|
uint8_t /*ncclPattern_t*/ pattern;
|
|
uint8_t /*ncclFunc_t*/ coll;
|
|
uint8_t protocol;
|
|
uint8_t algorithm;
|
|
int state;
|
|
char* sharedBuff[NCCL_STEPS];
|
|
int sharedSize[NCCL_STEPS];
|
|
|
|
int idle;
|
|
uint64_t hdp_flushed;
|
|
|
|
// Element linking
|
|
struct ncclProxyArgs* next;
|
|
struct ncclProxyArgs* nextPeer;
|
|
struct ncclProxyArgs** proxyAppendPtr;
|
|
|
|
union ncclProxyOpSpecifics specifics;
|
|
|
|
int prevRank;
|
|
int nextRank;
|
|
int send;
|
|
int retry_total;
|
|
};
|
|
// Number of entries in ncclProxyProgressState::netComms.
#define NCCL_MAX_NETDEVS 128
|
|
|
|
// ProxyOps are used to communicate between main thread and service thread
// Make sure we have enough to store two full rounds of operations on all channels.
// Otherwise we'd be unable to post half of them to free new elements. Each
// p2p work contains a send and recv proxy op hence the 2x before it.
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH)
|
|
|
|
struct ncclProxyOpsPool {
|
|
struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
|
|
volatile int nextOps;
|
|
volatile int nextOpsEnd;
|
|
volatile int freeOps[NCCL_MAX_LOCAL_RANKS];
|
|
pthread_mutex_t mutex;
|
|
pthread_cond_t cond;
|
|
};
|
|
|
|
struct ncclProxyOps {
|
|
ncclProxyOpsPool* pool;
|
|
ncclShmHandle_t handle;
|
|
int count;
|
|
int freeOp;
|
|
int nextOps;
|
|
int nextOpsEnd;
|
|
};
|
|
|
|
struct ncclProxySharedP2p {
|
|
int refcount;
|
|
int64_t size;
|
|
char* cudaBuff;
|
|
char* hostBuff;
|
|
// CUDA IPC
|
|
ncclIpcDesc ipcDesc;
|
|
struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
|
|
};
|
|
|
|
struct ncclProxyPeer {
|
|
struct ncclProxySharedP2p send;
|
|
struct ncclProxySharedP2p recv;
|
|
};
|
|
|
|
struct ncclSharedNetComms {
|
|
int activeConnect[MAXCHANNELS];
|
|
int activeAccept[MAXCHANNELS];
|
|
void* sendComm[MAXCHANNELS];
|
|
void* recvComm[MAXCHANNELS];
|
|
int sendRefCount[MAXCHANNELS];
|
|
int recvRefCount[MAXCHANNELS];
|
|
};
|
|
|
|
struct ncclProxyPool;
|
|
struct ncclProxyProgressState {
|
|
// Used by main threads to send work to progress thread
|
|
struct ncclProxyOpsPool* opsPool;
|
|
ncclShmHandle_t handle;
|
|
char opsPoolShmSuffix[6];
|
|
|
|
pthread_t thread;
|
|
volatile int stop;
|
|
struct ncclProxyPeer** localPeers;
|
|
struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
|
|
struct ncclProxyArgs* active;
|
|
struct ncclProxyArgs* pool;
|
|
struct ncclProxyPool* pools;
|
|
int nextOps;
|
|
};
|
|
|
|
// Expected proxy response fifo
|
|
struct ncclExpectedProxyResponse {
|
|
void* opId;
|
|
int respSize;
|
|
bool done;
|
|
void* respBuff;
|
|
ncclResult_t res;
|
|
struct ncclExpectedProxyResponse* next;
|
|
};
|
|
|
|
// One asynchronous proxy request: its type, target connection, request and
// response buffers with their sizes, and the caller-supplied opId.
// Singly linked through `next`; ncclProxyLocalPeer::asyncOps points at these.
struct ncclProxyAsyncOp {
  int type;  // NOTE(review): presumably an enum ncclProxyMsgType value -- confirm
  struct ncclProxyConnection* connection;
  int reqSize, respSize;
  char *reqBuff, *respBuff;
  void* opId;
  struct ncclProxyAsyncOp* next;
};
|
|
|
|
struct ncclProxyLocalPeer {
|
|
struct ncclSocket sock;
|
|
int tpRank;
|
|
int tpLocalRank;
|
|
ncclProxyAsyncOp* asyncOps;
|
|
int asyncOpCounter;
|
|
};
|
|
|
|
// Common response header for all proxyOps
|
|
// We pack this into a struct to reduce the number of blocking send and recv calls
|
|
struct ncclProxyRpcResponseHeader {
|
|
void* opId;
|
|
ncclResult_t res;
|
|
int respSize;
|
|
};
|
|
|
|
// UDS support
// Message header: type, rank, request/response sizes and opId, followed by
// a fixed 16 x 8-byte inline payload.
struct ncclIpcHdr {
  int type;
  int rank;
  int reqSize;
  int respSize;
  void *opId;
  uint64_t data[16]; // 128-bytes
};
|
|
|
|
struct ncclProxyState {
|
|
int refCount;
|
|
int tpRank;
|
|
int tpnRanks;
|
|
int tpLocalnRanks;
|
|
int cudaDev;
|
|
int p2pnChannels;
|
|
int p2pChunkSize;
|
|
int nChannels;
|
|
int buffSizes[NCCL_NUM_PROTOCOLS];
|
|
bool allocP2pNetLLBuffers;
|
|
bool dmaBufSupport;
|
|
ncclNet_t* ncclNet;
|
|
ncclCollNet_t* ncclCollNet;
|
|
uint32_t* abortFlag;
|
|
bool directMode;
|
|
// Service threads
|
|
pthread_t thread;
|
|
pthread_t threadUDS;
|
|
struct ncclSocket* listenSock;
|
|
struct ncclIpcSocket ipcSock;
|
|
int stop;
|
|
CUcontext cudaCtx;
|
|
ncclResult_t asyncResult;
|
|
|
|
// Used by main thread
|
|
union ncclSocketAddress* peerAddresses;
|
|
struct ncclSocket* peerSocks;
|
|
struct ncclProxyOps* proxyOps;
|
|
void** sharedDevMems;
|
|
struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
|
|
uint64_t *peerAddressesUDS; // cuMem API support (UDS)
|
|
|
|
// Progress thread
|
|
struct ncclProxyProgressState progressState;
|
|
|
|
// Profiler plugin
|
|
void* profilerContext;
|
|
|
|
// Queue of expected responses from the proxy
|
|
struct ncclExpectedProxyResponse* expectedResponses;
|
|
|
|
// A handle to the proxy traces
|
|
std::unique_ptr<facebook_rccl::ProxyTrace> proxyTrace;
|
|
};
|
|
|
|
// Connection states, stored in ncclProxyConnection::state. Values are
// explicit and increase from 0 to 4; numConnStates is the number of states.
enum proxyConnectState {
  connUninitialized     = 0,
  connInitialized       = 1,
  connSharedInitialized = 2,
  connSetupDone         = 3,
  connConnected         = 4,
  numConnStates         = 5
};
|
|
|
|
struct ncclProxyConnection {
|
|
int send, transport, shared;
|
|
int tpLocalRank, sameProcess;
|
|
struct ncclSocket* sock;
|
|
struct ncclTransportComm* tcomm;
|
|
struct ncclProxyArgs *proxyAppend;
|
|
struct ncclProxyArgs **proxyAppendPtr;
|
|
void* transportResources;
|
|
ncclNetDeviceHandle_t* netDeviceHandle;
|
|
void* mhandles[NCCL_NUM_PROTOCOLS];
|
|
proxyConnectState state;
|
|
struct ncclCollNetSharedRes* collNet;
|
|
int needsProxyProgress;
|
|
};
|
|
|
|
// Pointer to a function taking a ncclProxyArgs* and returning ncclResult_t.
// NOTE(review): no use of this type is visible in this header.
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
|
|
|
// Proxy modes with explicit values 0..2.
// NOTE(review): no use of this enum is visible in this header.
enum proxyMode {
  proxyRing = 0,
  proxyFrom = 1,
  proxyTo   = 2
};
|
|
|
|
// Proxy entry points, all operating on a communicator and returning an
// ncclResult_t. Implementations are not part of this header.
// NOTE(review): the per-function notes below are inferred from names and
// parameter lists only -- confirm against proxy.cc.

// Takes a proxy op for `comm`; `justInquire` is an in/out flag whose exact
// meaning is not visible here.
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
// Receives the socket and the peer address tables (regular and UDS).
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
// Fills `proxyConn` for the given transport, direction (`send`) and proxy rank.
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
|
|
|
|
// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match
// Message types start at 1 and are numbered explicitly; value 0 is unused.
enum ncclProxyMsgType {
  ncclProxyMsgInit = 1,
  ncclProxyMsgSharedInit = 2,
  ncclProxyMsgSetup = 3,
  ncclProxyMsgConnect = 4,
  ncclProxyMsgStart = 5,
  ncclProxyMsgClose = 6,
  ncclProxyMsgAbort = 7,
  ncclProxyMsgStop = 8,
  ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
  ncclProxyMsgQueryFd = 10,
  ncclProxyMsgRegister = 11,
  ncclProxyMsgDeregister = 12
};
|
|
|
|
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
|
|
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
|
|
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
|
|
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
|
|
|
|
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
|
|
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
|
|
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
|
|
|
|
// UDS support
|
|
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
|
|
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd);
|
|
|
|
// Shutdown-side entry points for a communicator's proxy.
// NOTE(review): required call order and exact responsibilities are not
// visible in this header -- confirm in proxy.cc.
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
|
|
|
// MSCCL entry point taking a channel, op type, peer, proxy op and connection index.
// NOTE(review): presumably the MSCCL counterpart of ncclProxySaveOp; semantics
// of `type` and `connIndex` are not visible in this header -- confirm.
ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
|
|
#endif
|