/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_PROXY_H_
#define NCCL_PROXY_H_

#include "device.h"
#include "info.h"
#include "socket.h"
#include "ipcsocket.h"
#include "nccl_net.h"
#include <pthread.h>
#include <memory>
#include "shmutils.h"
#include "p2p.h"
#include "collectives.h"
#include "proxy_trace/proxy_trace.h"

// Communication patterns a proxy operation can follow. The underlying type is
// fixed to uint8_t, matching the one-byte `pattern` fields of ncclProxyOp and
// ncclProxyArgs. Enumerators are implicitly numbered from 0 in declaration
// order, so new values must be appended to keep existing ones stable.
typedef enum : uint8_t {
  ncclPatternRing,
  ncclPatternRingTwice,
  ncclPatternPipelineFrom,
  ncclPatternPipelineTo,
  ncclPatternTreeUp,
  ncclPatternTreeDown,
  ncclPatternTreeUpDown,
  ncclPatternCollnetChain,
  ncclPatternCollnetDirect,
  ncclPatternNvls,
  ncclPatternNvlsTree,
  ncclPatternPatUp,
  ncclPatternPatDown,
  ncclPatternSend,
  ncclPatternRecv,
  ncclPatternProfiler,
} ncclPattern_t;

// States of a proxy operation (none / ready / in progress, per the enumerator
// names). Values are implicitly 0, 1, 2.
enum ncclProxyOpState {
  ncclProxyOpNone,
  ncclProxyOpReady,
  ncclProxyOpProgress
};
// Direction constants: receive is 0, send is 1.
// NOTE(review): presumably used to index per-direction state - confirm against callers.
enum {
  proxyRecv = 0,
  proxySend = 1
};
2020-05-12 14:40:18 -07:00
struct ncclProxyArgs;
2023-04-03 05:32:07 -07:00
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
2020-05-12 14:40:18 -07:00
2021-04-12 16:00:11 -07:00
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
2024-06-11 01:28:01 -07:00
static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements");
2021-04-12 16:00:11 -07:00
2024-02-05 05:06:02 -08:00
// Extra per-operation parameters embedded in ncclProxyOp and ncclProxyArgs.
// Only one variant (collnetDirect) is defined here.
union ncclProxyOpSpecifics {
  struct {
    size_t sizePerRank;
    int nNodes;
    int node;
  } collnetDirect;
};
2022-01-07 06:39:55 -08:00
struct ncclProxyOp {
struct ncclProxyConnection* connection;
ssize_t nbytes;
uint64_t opCount;
int root:30;
uint32_t connIndex:2;
2022-01-07 06:39:55 -08:00
int next;
2024-02-05 05:06:02 -08:00
int nsteps;
2024-12-18 08:26:06 -08:00
size_t chunkSize;
size_t sliceSize;
size_t loopSize;
size_t loopOffset;
size_t channelSize;
2024-02-05 05:06:02 -08:00
uint8_t sliceSteps;
uint8_t chunkSteps;
uint8_t channelId;
2022-05-24 02:02:31 -07:00
uint8_t /*ncclDataType_t*/ dtype;
uint8_t /*ncclDevRedOp_t*/ redOp;
2024-02-05 05:06:02 -08:00
uint8_t /*ncclFunc_t*/ coll;
2022-05-24 02:02:31 -07:00
uint8_t /*ncclPattern_t*/ pattern;
2022-01-07 06:39:55 -08:00
uint8_t protocol;
2024-12-18 08:26:06 -08:00
uint8_t algorithm;
2024-02-05 05:06:02 -08:00
uint8_t reg;
2024-12-18 08:26:06 -08:00
// collnet/p2p/coll buffer reg handles
2024-03-26 06:08:55 -07:00
void* sendMhandle;
void* recvMhandle;
uint8_t* sendbuff;
uint8_t* recvbuff;
2024-12-18 08:26:06 -08:00
int isOneRPN;
RingAlgorithm *ringAlgo;
int nextRank;
int prevRank;
2024-02-05 05:06:02 -08:00
union ncclProxyOpSpecifics specifics;
2022-05-24 02:02:31 -07:00
2024-09-10 05:57:10 -07:00
// Profiler plugin
union {
struct ncclTaskColl* coll;
struct ncclTaskP2p* p2p;
} task;
2025-04-22 13:50:40 -07:00
// Profiler work counter increment flag. Set to 'true' if the profiler work counter for this channel needs increment.
// Always 'true' for collective operations. Grouped p2p operations are fused into one <send, recv> pair in the GPU kernel,
// meaning the GPU profiler code increments the work counter for the pair rather than the individual p2p. For this
// reason, the incWorkCounter flag is used to avoid incrementing the work counter twice in the host code. This is done
// by setting incWorkCounter to 'true' only for one of the p2ps in the pair during enqueue.
bool incWorkCounter;
2024-09-10 05:57:10 -07:00
int eActivationMask;
void* taskEventHandle;
int rank;
int peer;
pid_t pid;
void* profilerContext;
2025-03-12 13:46:21 -07:00
uint64_t workCounter;
2024-09-10 05:57:10 -07:00
2024-02-05 05:06:02 -08:00
struct ncclProxyOp *enqNext;
2025-06-25 21:01:34 -07:00
// Used to track total real bytes of this op
uint32_t totalBytes;
// Used to fetch/update the proxyOp in ProxyTrace map
facebook_rccl::ProxyTraceRecordKey traceKey;
facebook_rccl::ProxyTraceExtraInfo traceInfo;
2022-01-07 06:39:55 -08:00
};
2025-05-29 20:56:40 -07:00
// Forward declaration: ncclProxyEventHandle only needs a pointer to it.
struct ncclProxySubArgs;

// Pairs an opaque step event handle with a pointer to a sub-args entry
// (ncclProxySubArgs holds an array of these in `pHandles`).
struct ncclProxyEventHandle {
  void* stepEventHandle;
  struct ncclProxySubArgs* subArgPtr;
};
2021-04-12 16:00:11 -07:00
struct ncclProxySubArgs {
2022-01-07 06:39:55 -08:00
struct ncclProxyConnection* connection;
2024-02-05 05:06:02 -08:00
int reg;
2024-03-26 06:08:55 -07:00
// collnet handles
void* sendMhandle;
void* recvMhandle;
uint8_t* sendbuff;
uint8_t* recvbuff;
size_t offset;
2024-12-18 08:26:06 -08:00
ssize_t loopSize;
ssize_t loopOffset;
2022-01-07 06:39:55 -08:00
int channelId;
2020-05-12 14:40:18 -07:00
int nsteps;
2022-01-07 06:39:55 -08:00
ssize_t nbytes;
2024-12-18 08:26:06 -08:00
ssize_t chunkSize;
2022-01-07 06:39:55 -08:00
int peer;
2024-12-18 08:26:06 -08:00
int isOneRPN;
RingAlgorithm *ringAlgo;
2022-01-07 06:39:55 -08:00
int groupSize; // Number of consecutive sub operations sharing the same recvComm
2021-04-12 16:00:11 -07:00
uint64_t base;
2020-09-04 14:35:05 -07:00
uint64_t posted;
2021-04-12 16:00:11 -07:00
uint64_t received;
uint64_t flushed;
2020-09-04 14:35:05 -07:00
uint64_t transmitted;
uint64_t done;
2020-05-12 14:40:18 -07:00
uint64_t end;
2024-12-18 08:26:06 -08:00
int regBufferReady;
2020-05-12 14:40:18 -07:00
void* requests[NCCL_STEPS];
2024-09-10 05:57:10 -07:00
// Profiler plugin
int eActivationMask;
int rank;
2024-12-18 08:26:06 -08:00
pid_t pid;
void* profilerContext;
2024-09-10 05:57:10 -07:00
void* taskEventHandle;
void* opEventHandle;
2025-03-12 13:46:21 -07:00
void* kernelEventHandle;
2025-05-29 20:56:40 -07:00
struct ncclProxyEventHandle pHandles[NCCL_STEPS];
2024-09-10 05:57:10 -07:00
size_t transSize;
2025-03-12 13:46:21 -07:00
uint64_t workCounter;
2024-09-10 05:57:10 -07:00
2023-09-26 05:47:28 -07:00
void* recvRequestsCache[NCCL_STEPS];
int recvRequestsSubCount;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
int npKitSizesFifo[NCCL_STEPS];
uint64_t timestamp[NCCL_STEPS];
#endif
2025-06-25 21:01:34 -07:00
// Used to fetch/update the proxyOp in ProxyTrace map
facebook_rccl::ProxyTraceRecordKey traceKey;
facebook_rccl::ProxyTraceExtraInfo traceInfo;
2021-04-12 16:00:11 -07:00
};
struct ncclProxyArgs {
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
2022-01-07 06:39:55 -08:00
proxyProgressFunc_t progress;
2021-04-12 16:00:11 -07:00
int nsubs;
int done;
2024-12-18 08:26:06 -08:00
int onePPN;
2022-01-07 06:39:55 -08:00
uint64_t opCount;
2021-04-12 16:00:11 -07:00
int sliceSteps;
int chunkSteps;
2024-12-18 08:26:06 -08:00
size_t chunkSize;
2024-03-26 06:08:55 -07:00
size_t totalSendSize;
size_t totalRecvSize;
size_t sendSizePerRound;
size_t recvSizePerRound;
2022-05-24 02:02:31 -07:00
uint8_t /*ncclDataType_t*/ dtype;
uint8_t /*ncclDevRedOp_t*/ redOp;
uint8_t /*ncclPattern_t*/ pattern;
2024-02-05 05:06:02 -08:00
uint8_t /*ncclFunc_t*/ coll;
2022-01-07 06:39:55 -08:00
uint8_t protocol;
2024-12-18 08:26:06 -08:00
uint8_t algorithm;
2021-04-12 16:00:11 -07:00
int state;
char* sharedBuff[NCCL_STEPS];
int sharedSize[NCCL_STEPS];
2020-05-12 14:40:18 -07:00
int idle;
uint64_t hdp_flushed;
2020-05-12 14:40:18 -07:00
// Element linking
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
2020-09-04 14:35:05 -07:00
struct ncclProxyArgs** proxyAppendPtr;
2024-02-05 05:06:02 -08:00
union ncclProxyOpSpecifics specifics;
int prevRank;
int nextRank;
int send;
int retry_total;
2020-09-04 14:35:05 -07:00
};
2022-01-07 06:39:55 -08:00
// Size of ncclProxyProgressState::netComms[].
#define NCCL_MAX_NETDEVS 128

// ProxyOps are used to communicate between main thread and service thread
// Make sure we have enough to store two full rounds of operations on all channels.
// Otherwise we'd be unable to post half of them to free new elements. Each
// p2p work contains a send and recv proxy op hence the 2x before it.
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH)

// Pool of proxy ops: MAX_OPS_PER_PEER slots for each of NCCL_MAX_LOCAL_RANKS
// ranks, with list indices into `ops` and a mutex/condition-variable pair.
// NOTE(review): nextOps/nextOpsEnd/freeOps look like list heads/tails expressed
// as indices into ops[] - confirm against the implementation.
struct ncclProxyOpsPool {
  struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
  volatile int nextOps;
  volatile int nextOpsEnd;
  volatile int freeOps[NCCL_MAX_LOCAL_RANKS];
  pthread_mutex_t mutex;
  pthread_cond_t cond;
};
struct ncclProxyOps {
ncclProxyOpsPool* pool;
2022-11-29 04:27:46 -08:00
ncclShmHandle_t handle;
2022-01-07 06:39:55 -08:00
int count;
int freeOp;
int nextOps;
int nextOpsEnd;
};
struct ncclProxySharedP2p {
int refcount;
2024-05-15 16:58:28 -05:00
int64_t size;
2022-01-07 06:39:55 -08:00
char* cudaBuff;
char* hostBuff;
2023-04-03 05:32:07 -07:00
// CUDA IPC
ncclIpcDesc ipcDesc;
2022-01-07 06:39:55 -08:00
struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
};
2020-09-04 14:35:05 -07:00
2022-01-07 06:39:55 -08:00
// Shared p2p state for one peer, kept separately for each direction.
struct ncclProxyPeer {
  struct ncclProxySharedP2p send;
  struct ncclProxySharedP2p recv;
};
// Per-channel network comm handles with their reference counts and
// connect/accept activity flags. ncclProxyProgressState keeps one pointer to
// this structure per net device.
struct ncclSharedNetComms {
  int activeConnect[MAXCHANNELS];
  int activeAccept[MAXCHANNELS];
  void* sendComm[MAXCHANNELS];
  void* recvComm[MAXCHANNELS];
  int sendRefCount[MAXCHANNELS];
  int recvRefCount[MAXCHANNELS];
};
struct ncclProxyPool;
2022-01-07 06:39:55 -08:00
struct ncclProxyProgressState {
// Used by main threads to send work to progress thread
struct ncclProxyOpsPool* opsPool;
2022-11-29 04:27:46 -08:00
ncclShmHandle_t handle;
2022-01-07 06:39:55 -08:00
char opsPoolShmSuffix[6];
2021-04-12 16:00:11 -07:00
2022-01-07 06:39:55 -08:00
pthread_t thread;
2023-09-26 05:47:28 -07:00
volatile int stop;
2022-01-07 06:39:55 -08:00
struct ncclProxyPeer** localPeers;
struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
struct ncclProxyArgs* active;
struct ncclProxyArgs* pool;
2020-05-12 14:40:18 -07:00
struct ncclProxyPool* pools;
2022-01-07 06:39:55 -08:00
int nextOps;
};
2023-02-27 02:48:21 -08:00
// Expected proxy response fifo
// Singly linked through `next`; each node is identified by `opId` and carries
// the response buffer, its size, a completion flag and the result code.
struct ncclExpectedProxyResponse {
  void* opId;
  int respSize;
  bool done;
  void* respBuff;
  ncclResult_t res;
  struct ncclExpectedProxyResponse* next;
};
// One asynchronous proxy request: its message type, the connection it targets,
// request/response buffers with their sizes, and the caller-supplied opId.
// Nodes are chained through `next`.
struct ncclProxyAsyncOp {
  int type;
  struct ncclProxyConnection* connection;
  int reqSize;
  int respSize;
  char* reqBuff;
  char* respBuff;
  void* opId;
  struct ncclProxyAsyncOp* next;  // `struct` spelled out, as elsewhere in this header
};
// Per-local-peer state: the peer's socket, its tpRank/tpLocalRank, and the
// list of its pending async ops with a counter.
struct ncclProxyLocalPeer {
  struct ncclSocket sock;
  int tpRank;
  int tpLocalRank;
  struct ncclProxyAsyncOp* asyncOps;  // `struct` spelled out, as elsewhere in this header
  int asyncOpCounter;
};

// Common response header for all proxyOps
// We pack this into a struct to reduce the number of blocking send and recv calls
struct ncclProxyRpcResponseHeader {
  void* opId;
  ncclResult_t res;
  int respSize;
};

// UDS support
// Fixed-size message header: request type, rank, request/response sizes, the
// caller's opId, and a 16 x 8-byte inline payload.
struct ncclIpcHdr {
  int type;
  int rank;
  int reqSize;
  int respSize;
  void *opId;
  uint64_t data[16]; // 128-bytes
};
2022-01-07 06:39:55 -08:00
struct ncclProxyState {
2023-04-03 05:32:07 -07:00
int refCount;
int tpRank;
int tpnRanks;
int tpLocalnRanks;
int cudaDev;
int p2pnChannels;
int p2pChunkSize;
int nChannels;
int buffSizes[NCCL_NUM_PROTOCOLS];
bool allocP2pNetLLBuffers;
bool dmaBufSupport;
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
2024-06-11 01:28:01 -07:00
uint32_t* abortFlag;
2024-09-10 05:57:10 -07:00
bool directMode;
2024-02-05 05:06:02 -08:00
// Service threads
2022-01-07 06:39:55 -08:00
pthread_t thread;
2024-02-05 05:06:02 -08:00
pthread_t threadUDS;
2022-01-07 06:39:55 -08:00
struct ncclSocket* listenSock;
2024-02-05 05:06:02 -08:00
struct ncclIpcSocket ipcSock;
2023-11-13 10:26:55 -08:00
int stop;
2022-11-07 14:09:26 -08:00
CUcontext cudaCtx;
2023-09-26 05:47:28 -07:00
ncclResult_t asyncResult;
2022-01-07 06:39:55 -08:00
// Used by main thread
union ncclSocketAddress* peerAddresses;
struct ncclSocket* peerSocks;
struct ncclProxyOps* proxyOps;
void** sharedDevMems;
2023-04-03 05:32:07 -07:00
struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
2024-02-05 05:06:02 -08:00
uint64_t *peerAddressesUDS; // cuMem API support (UDS)
2022-01-07 06:39:55 -08:00
// Progress thread
struct ncclProxyProgressState progressState;
2023-02-27 02:48:21 -08:00
2024-09-10 05:57:10 -07:00
// Profiler plugin
void* profilerContext;
2023-02-27 02:48:21 -08:00
// Queue of expected responses from the proxy
struct ncclExpectedProxyResponse* expectedResponses;
2025-06-25 21:01:34 -07:00
// A handle to the proxy traces
std::unique_ptr<facebook_rccl::ProxyTrace> proxyTrace;
2022-01-07 06:39:55 -08:00
};
2022-11-29 04:27:46 -08:00
// Connection states, stored in ncclProxyConnection::state. Values are
// explicit; numConnStates is the count of real states.
enum proxyConnectState {
  connUninitialized     = 0,
  connInitialized       = 1,
  connSharedInitialized = 2,
  connSetupDone         = 3,
  connConnected         = 4,
  numConnStates         = 5
};
2022-01-07 06:39:55 -08:00
struct ncclProxyConnection {
int send, transport, shared;
2023-04-03 05:32:07 -07:00
int tpLocalRank, sameProcess;
2022-01-07 06:39:55 -08:00
struct ncclSocket* sock;
struct ncclTransportComm* tcomm;
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
void* transportResources;
2023-09-26 05:47:28 -07:00
ncclNetDeviceHandle_t* netDeviceHandle;
void* mhandles[NCCL_NUM_PROTOCOLS];
2022-11-29 04:27:46 -08:00
proxyConnectState state;
2023-04-03 05:32:07 -07:00
struct ncclCollNetSharedRes* collNet;
2023-09-26 05:47:28 -07:00
int needsProxyProgress;
2020-05-12 14:40:18 -07:00
};
// Function-pointer type: takes a ncclProxyArgs* and returns an ncclResult_t.
// NOTE(review): nothing else in this header uses it - confirm it is still needed.
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
// Proxy modes (ring / from / to), with explicit values 0..2.
enum proxyMode {
  proxyRing = 0,
  proxyFrom = 1,
  proxyTo   = 2
};
2022-05-24 02:02:31 -07:00
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
2020-05-12 14:40:18 -07:00
ncclResult_t ncclProxyStart(struct ncclComm* comm);
2024-02-05 05:06:02 -08:00
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
2020-05-12 14:40:18 -07:00
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
2023-04-03 05:32:07 -07:00
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
2025-01-27 03:30:22 -08:00
// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match
// Message types start at 1 and are numbered explicitly; keep the values
// contiguous so the string table mentioned above stays aligned.
enum ncclProxyMsgType {
  ncclProxyMsgInit = 1,
  ncclProxyMsgSharedInit = 2,
  ncclProxyMsgSetup = 3,
  ncclProxyMsgConnect = 4,
  ncclProxyMsgStart = 5,
  ncclProxyMsgClose = 6,
  ncclProxyMsgAbort = 7,
  ncclProxyMsgStop = 8,
  ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
  ncclProxyMsgQueryFd = 10,
  ncclProxyMsgRegister = 11,
  ncclProxyMsgDeregister = 12
};

// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);

// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);

// UDS support
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd);

// Shutdown / cleanup entry points.
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);

// MSCCL entry point taking a channel, an op type, a peer and a connection index.
ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);

#endif