/*************************************************************************
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef EVENT_H_
#define EVENT_H_

#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>
#include <cstring>
#include "err.h"
#include "profiler.h"
#include "queue.h"
#include <cuda_runtime.h>

#define MAX_CHANNELS                     128 // Match RCCL's MAXCHANNELS
#define MAX_STEPS                        1024
#define MAX_OPS                          16 // Up to 64K ranks for PAT
#define MAX_EVENTS_PER_REQ               (8)

struct proxyOp;
struct proxyStep;

struct netPlugin {
  uint64_t type;
  int pluginType;
  int pluginVer;
  uint8_t pluginEvent;
  union {
    struct {
      int device;
      int qpNum;
      int opcode;
      uint64_t wr_id;
      size_t length;
    } qp;
    struct {
      int fd;
      int op;
      size_t length;
    } sock;
  };
  double startTs;
  double stopTs;
  struct proxyStep* parent;
};

struct kernelCh {
  uint8_t type;
  uint8_t channelId;
  struct taskEventBase* parent;
  double startTs;
  double stopTs;
  uint64_t startGpuClk;
  uint64_t stopGpuClk;
};

#define PROXY_STEP_SEND_GPU_WAIT 0
#define PROXY_STEP_SEND_PEER_WAIT 1
#define PROXY_STEP_SEND_WAIT 2
#define PROXY_STEP_RECV_WAIT 0
#define PROXY_STEP_RECV_FLUSH_WAIT 1
#define PROXY_STEP_RECV_GPU_WAIT 2
#define PROXY_STEP_MAX_STATES 3

struct proxyStep {
  uint64_t type;                     // type of event: network transfer
  int state;
  int step;                         // network transfer id in given channel
  int isSend;                       // send/recv channel operation
  double timestamp[PROXY_STEP_MAX_STATES];
  double startTs;
  double stopTs;
  struct proxyOp* parent;
  struct netPlugin net[MAX_EVENTS_PER_REQ];
  int nNetEvents;
};

struct proxyOp {
  uint64_t type;                     // type of event: proxy operation
  uint8_t channelId;                // channel id for this proxy operation
  pid_t pid;
  int rank;
  int peer;                         // peer rank for this proxy operation
  int nSteps;                       // total number of network transfers for this proxy operation
  int chunkSize;                    // chunk size for this proxy operation
  int isSend;                       // send/recv channel operation
  size_t transSize;                 // transfer data size for this proxy operation
  double startTs;
  double progrTs;                   // In progress state transition
  double stopTs;
  int stepCount;                    // last processed network operation for this proxy operation
  struct proxyStep step[MAX_STEPS]; // array of network transfer events
  struct taskEventBase* parent;     // parent event p2p/collective
};

struct group;
struct context;

struct proxyCtrl {
  uint64_t type;
  struct context* ctx;              // profiler context
  double startTs;
  double stopTs;
  int state;
  int appended;                     // appended proxy operations
};

// task level event base structure
struct taskEventBase {
  uint64_t type;                     // event type: collective/p2p
  int rank;                         // rank of the operation in NCCL communicator
  const char* func;                 // ncclFunc*
  int refCount;                     // number of references for this operation
  void* parent;                     // parent API event
  struct taskEventBase* next;       // next top level event
  double startTs;
  double stopTs;
};

struct collective {
  struct taskEventBase base;        // base structure for this event
  uint64_t seqNumber;               // sequence number for this collective in communicator
  void const* sendBuff;
  void* recvBuff;
  size_t count;
  int root;
  const char* datatype;
  uint8_t nChannels;
  const char* algo;
  const char* proto;
  int nWarps;
  struct proxyOp op[MAX_CHANNELS][2*MAX_OPS];
  int nProxyOps[MAX_CHANNELS];
  struct kernelCh kernel[MAX_CHANNELS];
};

struct p2p {
  struct taskEventBase base;        // base structure for this event
  uint8_t func;
  void const* buff;
  size_t count;
  const char* datatype;
  int peer;
  uint8_t nChannels;
  struct proxyOp op[MAX_CHANNELS];
  struct kernelCh kernel[MAX_CHANNELS];
};

struct group {
  uint64_t type;
  struct context* ctx;              // profiler context
  int groupId;
  int refCount;
  struct taskEventBase* eventHead;  // queue head for task events
  struct taskEventBase* eventTail;  // queue tail for task events
  double startTs;
  double stopTs;
  struct group* next;               // next group event in queue
};

struct collApi {
  uint64_t type;
  struct groupApi* parent;
  struct context* ctx;              // profiler context
  int collApiId;
  int refCount;
  cudaStream_t stream;
  const char* func;
  size_t count;
  const char* datatype;
  int root;
  bool graphCaptured;
  struct taskEventBase* eventHead;  // queue head for task events
  struct taskEventBase* eventTail;  // queue tail for task events
  double startTs;
  double stopTs;
  struct collApi* next;
};

struct p2pApi {
  uint64_t type;
  struct groupApi* parent;
  struct context* ctx;              // profiler context
  int p2pApiId;
  int refCount;
  const char* func;
  cudaStream_t stream;
  size_t count;
  const char* datatype;
  bool graphCaptured;
  struct taskEventBase* eventHead;  // queue head for task events
  struct taskEventBase* eventTail;  // queue tail for task events
  double startTs;
  double stopTs;
  struct p2pApi* next;
};

struct kernelLaunch {
  uint64_t type;
  struct groupApi* parent;
  cudaStream_t stream;
  int kernelLaunchId;
  double startTs;
  double stopTs;
  struct kernelLaunch* next;
};

struct groupApi {
  uint64_t type;
  struct context* ctx;
  int groupApiId;
  int refCount;
  bool graphCaptured;
  int groupDepth;
  struct profilerQueue<struct p2pApi, &p2pApi::next> p2pApiEvents;
  struct profilerQueue<struct collApi, &collApi::next> collApiEvents;
  struct profilerQueue<struct kernelLaunch, &kernelLaunch::next> kernelLaunchEvents;
  double endOfncclGroupStartTs;
  double startOfncclGroupEndTs;
  double startTs;
  double stopTs;
  struct groupApi* next;
};

// arrays for different event objects
struct context {
  const char* commName;
  uint64_t commHash;
  int nranks;
  int rank;

  int groupApiPoolSize;
  int groupApiPoolBase;
  int groupApiPoolIndex;
  struct groupApi* groupApiPool;

  int collApiPoolSize;
  int collApiPoolBase;
  int collApiPoolIndex;
  struct collApi* collApiPool;

  int p2pApiPoolSize;
  int p2pApiPoolBase;
  int p2pApiPoolIndex;
  struct p2pApi* p2pApiPool;

  int kernelLaunchPoolSize;
  int kernelLaunchPoolBase;
  int kernelLaunchPoolIndex;
  struct kernelLaunch* kernelLaunchPool;

  int groupPoolSize;
  int groupPoolBase;
  int groupPoolIndex;
  struct group* groupPool;

  int collPoolSize;
  int collPoolBase;
  int collPoolIndex;
  struct collective* collPool;

  int p2pPoolSize;
  int p2pPoolBase;
  int p2pPoolIndex;
  struct p2p* p2pPool;

  int proxyCtrlPoolSize;
  int proxyCtrlPoolBase;
  int proxyCtrlPoolIndex;
  struct proxyCtrl* proxyCtrlPool;
};

template <typename T>
inline int taskEventQueueEmpty(T *obj) {
  return obj->eventHead == NULL;
}

template <typename T>
inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) {
  event->next = NULL;
  if (obj->eventHead) obj->eventTail->next = event;
  else obj->eventHead = event;
  obj->eventTail = event;
}

template <typename T>
inline struct taskEventBase* taskEventQueueHead(T *obj) {
    return obj->eventHead;
}

template <typename T>
inline struct taskEventBase* taskEventQueueDequeue(T* obj) {
  struct taskEventBase* tmp = obj->eventHead;
  obj->eventHead = obj->eventHead->next;
  if (obj->eventHead == NULL) obj->eventTail = NULL;
  return tmp;
}

template <typename T>
inline void resetTaskEvents(T *obj, struct context* ctx) {
  while (!taskEventQueueEmpty(obj)) {
    struct taskEventBase* base = taskEventQueueDequeue(obj);
    if (base->type == ncclProfileColl) {
      struct collective* c = (struct collective *)base;
      // reset event proxyOps & proxySteps
      memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
      // release collective events in the group and return them to the collective pool
      __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
    } else if (base->type == ncclProfileP2p) {
      struct p2p* p = (struct p2p *)base;
      // reset event proxyOp and proxySteps
      memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
      // release p2p events in the group and return them to the p2p pool
      __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
    }
  }
}

#endif