/************************************************************************* * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef EVENT_H_ #define EVENT_H_ #include #include #include #include #include "err.h" #include "profiler.h" #include "queue.h" #include #define MAX_CHANNELS 128 // Match RCCL's MAXCHANNELS #define MAX_STEPS 1024 #define MAX_OPS 16 // Up to 64K ranks for PAT #define MAX_EVENTS_PER_REQ (8) struct proxyOp; struct proxyStep; struct netPlugin { uint64_t type; int pluginType; int pluginVer; uint8_t pluginEvent; union { struct { int device; int qpNum; int opcode; uint64_t wr_id; size_t length; } qp; struct { int fd; int op; size_t length; } sock; }; double startTs; double stopTs; struct proxyStep* parent; }; struct kernelCh { uint8_t type; uint8_t channelId; struct taskEventBase* parent; double startTs; double stopTs; uint64_t startGpuClk; uint64_t stopGpuClk; }; #define PROXY_STEP_SEND_GPU_WAIT 0 #define PROXY_STEP_SEND_PEER_WAIT 1 #define PROXY_STEP_SEND_WAIT 2 #define PROXY_STEP_RECV_WAIT 0 #define PROXY_STEP_RECV_FLUSH_WAIT 1 #define PROXY_STEP_RECV_GPU_WAIT 2 #define PROXY_STEP_MAX_STATES 3 struct proxyStep { uint64_t type; // type of event: network transfer int state; int step; // network transfer id in given channel int isSend; // send/recv channel operation double timestamp[PROXY_STEP_MAX_STATES]; double startTs; double stopTs; struct proxyOp* parent; struct netPlugin net[MAX_EVENTS_PER_REQ]; int nNetEvents; }; struct proxyOp { uint64_t type; // type of event: proxy operation uint8_t channelId; // channel id for this proxy operation pid_t pid; int rank; int peer; // peer rank for this proxy operation int nSteps; // total number of network transfers for this proxy operation int chunkSize; // chunk size for this proxy operation int isSend; // send/recv channel operation size_t transSize; // transfer data size for this proxy operation double startTs; double progrTs; // In progress state transition double stopTs; int stepCount; // last processed network operation for this proxy operation struct proxyStep step[MAX_STEPS]; // array of network transfer events struct taskEventBase* parent; // parent event p2p/collective }; struct group; struct context; struct proxyCtrl { uint64_t type; struct context* ctx; // profiler context double startTs; double stopTs; int state; int appended; // appended proxy operations }; // task level event base structure struct taskEventBase { uint64_t type; // event type: collective/p2p int rank; // rank of the operation in NCCL communicator const char* func; // ncclFunc* int refCount; // number of references for this operation void* parent; // parent API event struct taskEventBase* next; // next top level event double startTs; double stopTs; }; struct collective { struct taskEventBase base; // base structure for this event uint64_t seqNumber; // sequence number for this collective in communicator void const* sendBuff; void* recvBuff; size_t count; int root; const char* datatype; uint8_t nChannels; const char* algo; const char* proto; int nWarps; struct proxyOp op[MAX_CHANNELS][2*MAX_OPS]; int nProxyOps[MAX_CHANNELS]; struct kernelCh kernel[MAX_CHANNELS]; }; struct p2p { struct taskEventBase base; // base structure for this event uint8_t func; void const* buff; size_t count; const char* datatype; int peer; uint8_t nChannels; struct proxyOp op[MAX_CHANNELS]; struct kernelCh kernel[MAX_CHANNELS]; }; struct group { uint64_t type; struct context* ctx; // profiler context int groupId; int refCount; struct taskEventBase* eventHead; // queue head for task events struct taskEventBase* eventTail; // queue tail for task events double startTs; double stopTs; struct group* next; // next group event in queue }; struct collApi { uint64_t type; struct groupApi* parent; struct context* ctx; // profiler context int collApiId; int refCount; cudaStream_t stream; const char* func; size_t count; const char* datatype; int root; bool graphCaptured; struct taskEventBase* eventHead; // queue head for task events struct taskEventBase* eventTail; // queue tail for task events double startTs; double stopTs; struct collApi* next; }; struct p2pApi { uint64_t type; struct groupApi* parent; struct context* ctx; // profiler context int p2pApiId; int refCount; const char* func; cudaStream_t stream; size_t count; const char* datatype; bool graphCaptured; struct taskEventBase* eventHead; // queue head for task events struct taskEventBase* eventTail; // queue tail for task events double startTs; double stopTs; struct p2pApi* next; }; struct kernelLaunch { uint64_t type; struct groupApi* parent; cudaStream_t stream; int kernelLaunchId; double startTs; double stopTs; struct kernelLaunch* next; }; struct groupApi { uint64_t type; struct context* ctx; int groupApiId; int refCount; bool graphCaptured; int groupDepth; struct profilerQueue p2pApiEvents; struct profilerQueue collApiEvents; struct profilerQueue kernelLaunchEvents; double endOfncclGroupStartTs; double startOfncclGroupEndTs; double startTs; double stopTs; struct groupApi* next; }; // arrays for different event objects struct context { const char* commName; uint64_t commHash; int nranks; int rank; int groupApiPoolSize; int groupApiPoolBase; int groupApiPoolIndex; struct groupApi* groupApiPool; int collApiPoolSize; int collApiPoolBase; int collApiPoolIndex; struct collApi* collApiPool; int p2pApiPoolSize; int p2pApiPoolBase; int p2pApiPoolIndex; struct p2pApi* p2pApiPool; int kernelLaunchPoolSize; int kernelLaunchPoolBase; int kernelLaunchPoolIndex; struct kernelLaunch* kernelLaunchPool; int groupPoolSize; int groupPoolBase; int groupPoolIndex; struct group* groupPool; int collPoolSize; int collPoolBase; int collPoolIndex; struct collective* collPool; int p2pPoolSize; int p2pPoolBase; int p2pPoolIndex; struct p2p* p2pPool; int proxyCtrlPoolSize; int proxyCtrlPoolBase; int proxyCtrlPoolIndex; struct proxyCtrl* proxyCtrlPool; }; template inline int taskEventQueueEmpty(T *obj) { return obj->eventHead == NULL; } template inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) { event->next = NULL; if (obj->eventHead) obj->eventTail->next = event; else obj->eventHead = event; obj->eventTail = event; } template inline struct taskEventBase* taskEventQueueHead(T *obj) { return obj->eventHead; } template inline struct taskEventBase* taskEventQueueDequeue(T* obj) { struct taskEventBase* tmp = obj->eventHead; obj->eventHead = obj->eventHead->next; if (obj->eventHead == NULL) obj->eventTail = NULL; return tmp; } template inline void resetTaskEvents(T *obj, struct context* ctx) { while (!taskEventQueueEmpty(obj)) { struct taskEventBase* base = taskEventQueueDequeue(obj); if (base->type == ncclProfileColl) { struct collective* c = (struct collective *)base; // reset event proxyOps & proxySteps memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); // release collective events in the group and return them to the collective pool __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); } else if (base->type == ncclProfileP2p) { struct p2p* p = (struct p2p *)base; // reset event proxyOp and proxySteps memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS); // release p2p events in the group and return them to the p2p pool __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED); } } } #endif