src/include/devcomm.h

/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_DEVICE_H_
#define NCCL_DEVICE_H_

#include "nccl.h"
#include "align.h"
#include <stdint.h>

// Size of the ncclFuncStr name table. ncclFuncSendRecv is intentionally
// left out of this count for now.
#define NCCL_NUM_FUNCTIONS 5
// Identifiers for the supported operations, numbered consecutively from 0.
typedef enum {
  ncclFuncBroadcast     = 0,
  ncclFuncReduce        = 1,
  ncclFuncAllGather     = 2,
  ncclFuncReduceScatter = 3,
  ncclFuncAllReduce     = 4,
  ncclFuncSendRecv      = 5
} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];

// Algorithm identifiers. ncclAlgoStr has one entry per algorithm;
// presumably it is indexed by these values (TODO confirm at its definition).
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET 2
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];

// Protocol identifiers. Note the numeric order: LL = 0, LL128 = 1, Simple = 2.
// ncclProtoStr has one entry per protocol.
#define NCCL_NUM_PROTOCOLS 3 // LL/LL128/Simple
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];

// NOTE(review): the exact meaning of NCCL_MAX_OPS is not visible in this header.
#define NCCL_MAX_OPS 2048
// NCCL_STEPS must divide NCCL_LL_CLEAN_MASK (enforced by a static_assert
// further down in this header).
#define NCCL_STEPS 8

/* One 16-byte LL FIFO line: two 32-bit data words, each immediately followed
 * by its own 32-bit flag. The same storage can also be accessed as two
 * 64-bit words (v) or as a single int4 (i4).
 * NOTE(review): int4 is not declared in this header; presumably the CUDA
 * built-in vector type -- confirm. */
union ncclLLFifoLine {
  /* Flags have to be *after* data, because otherwise, an incomplete receive
     from the network may receive the flag but not the data.
     Note this is assuming that either we receive contiguous chunks of data
     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
  struct {
    uint32_t data1;
    uint32_t flag1;
    uint32_t data2;
    uint32_t flag2;
  };
  uint64_t v[2]; // Each element overlays one (data, flag) pair
  int4 i4;       // Whole-line view
};

// Thread-count and channel-count limits.
#define WARP_SIZE 32
#define MAXCHANNELS 32
#define NCCL_MAX_NTHREADS 640
#define NCCL_SIMPLE_MAX_NTHREADS 512
#define NCCL_LL_MAX_NTHREADS 512
#define NCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
// Test configuration: flags wrap modulo NCCL_LL_FLAG_MAX and a small clean
// mask is used.
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define NCCL_LL_FLAG_MAX   0x100
#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
#else
// Default configuration: the flag is the argument truncated to 32 bits.
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least NCCL_STEPS:
// the mask has to be a multiple of NCCL_STEPS.
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");

// LL128 line geometry: a line is NCCL_LL128_LINESIZE bytes viewed as 64-bit
// elements. One element per line is not counted as data (presumably it holds
// the flag -- confirm against the LL128 primitives).
#define NCCL_LL128_LINESIZE 128
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)

#define NCCL_LL128_MAX_NTHREADS 640
#define NCCL_LL128_ELEMS_PER_THREAD 120

// Receiving from up to 3 sources is more compute intensive than sending
// to 3 dests. Use 70% for reduce and 30% for bcast.
// Evaluates to the largest multiple of 32 that does not exceed 70% of nt
// (for non-negative nt). The argument is parenthesized so that expression
// arguments such as NCCL_LL128_SPLIT(a+b) expand with the intended precedence.
#define NCCL_LL128_SPLIT(nt) (((nt)*7/(10*32))*32)

// Shared-memory element budget: a fixed number of elements per thread at the
// maximum LL128 thread count.
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)

// Direct-communication flag bits.
// NOTE(review): presumably stored in ncclConnInfo::direct -- confirm at the
// sites that set that field.
#define NCCL_DIRECT_GPU 0x01
#define NCCL_DIRECT_NIC 0x10

// Connection state for one direction of a connection: one buffer per
// protocol, head/tail counters, optional direct and shared settings, the
// FIFOs exchanged with the proxy, and the current step.
struct ncclConnInfo {
  // Regular comm mechanism
  char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
  uint64_t *tail;     // Local for recv, remote for send
  uint64_t *head;     // Local for send, remote for recv

  int direct;         // Direct communication
  int shared;         // Buffers are shared
  void **ptrExchange; // Pointer exchange for direct communication

  int *sizesFifo;     // Sizes fifo from GPU to proxy
  void* *ptrsFifo;      // Buffer fifo from proxy to GPU

  uint64_t step;      // Keep where we are
  uint64_t llLastCleaning; // NOTE(review): presumably the step of the last LL cleanup -- confirm
};

// One send or recv connector: connection status, proxy bookkeeping pointers,
// the transport hooks and their private resources, the embedded connection
// info, and a pointer to the ncclComm.
// The pointed-to types (ncclProxyArgs, ncclTransportComm, ncclComm) are
// declared elsewhere; only pointers to them are stored here.
struct ncclConnector {
  int connected;                          // Non-zero once connected (presumably -- confirm)
  struct ncclProxyArgs *proxyAppend;
  struct ncclProxyArgs **proxyAppendPtr;
  struct ncclTransportComm* transportComm;
  void* transportResources;               // Opaque, transport-specific
  struct ncclConnInfo conn;
  struct ncclComm *comm;
};

// Ring description for one channel: the neighbouring ranks and the mapping
// from ring position to user rank.
struct ncclRing {
  // Shortcuts for userRanks[1] and userRanks[n-1]
  // NOTE(review): presumably next == userRanks[1] and prev == userRanks[n-1]
  // (the field order below is prev, next) -- confirm.
  int prev;
  int next;

  // Maps an internal nccl index to user-specified rank order. This is necessary
  // since we need to know how the user expects data to be ordered across
  // devices. Ordered from current device.
  int* userRanks;
  int* devUserRanks; // NOTE(review): presumably the device-side copy of userRanks -- confirm
};


// Maximum number of children a rank can have in the tree.
#define NCCL_MAX_TREE_ARITY 3
// Tree description for one channel: depth, one parent, and up to
// NCCL_MAX_TREE_ARITY children.
// NOTE(review): the encoding of "no parent" / "no child" is not visible here.
struct ncclTree {
  int depth;
  int up;
  int down[NCCL_MAX_TREE_ARITY];
};

// Maximum number of up/down peers in a direct topology.
#define NCCL_MAX_DIRECT_ARITY 7
// Direct topology description (used as ncclChannel::collTree): up to
// NCCL_MAX_DIRECT_ARITY peers in each of the up and down directions.
// NOTE(review): semantics of out/nHeads/headRank/shift are not visible in
// this header -- see the code that fills them in.
struct ncclDirect {
  int depth;
  int out;
  int nHeads;
  int headRank;
  int shift;
  int up[NCCL_MAX_DIRECT_ARITY];
  int down[NCCL_MAX_DIRECT_ARITY];
};

// Number of connectors kept per peer in each direction.
#define NCCL_MAX_CONNS 2
// Connectors to and from one peer: NCCL_MAX_CONNS send connectors and
// NCCL_MAX_CONNS recv connectors.
struct ncclPeer {
  struct ncclConnector send[NCCL_MAX_CONNS];
  struct ncclConnector recv[NCCL_MAX_CONNS];
};

// Forward declaration; the full definition appears later in this header.
struct ncclDevComm;

// Number of ncclWorkElem entries in one ncclWork.
#define NCCL_MAX_WORK_ELEMENTS 8
#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)

/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
/* One work element: a common header, the send/recv buffers, and an */
/* op-specific union (collective or point-to-point). A static_assert after */
/* struct ncclWork pins sizeof(struct ncclWorkElem) to 0x10*sizeof(int). */
struct ncclWorkElem {
  // Header
  struct ncclDevComm* comm;
  uint16_t nThreads;
  uint16_t funcIndex;
  uint16_t index;
  uint16_t active;

  const void * sendbuff;
  void * recvbuff;

  // Op-specific fields.
  union {
    // Collective operation parameters.
    struct {
      size_t count;
      size_t lastChunkSize;
      uint32_t root;
      uint8_t bid;
      uint8_t nChannels;
    } coll;
    // Point-to-point (send/recv) parameters.
    struct {
      size_t sendCount;
      size_t recvCount;
      int sendChunkSize;
      int recvChunkSize;
      int32_t delta;
      uint16_t nThreads;
    } p2p;
    // Padding member: keeps the union at least 4*sizeof(uint64_t) bytes.
    uint64_t align[4];
  };
};
// A batch of NCCL_MAX_WORK_ELEMENTS work elements.
struct ncclWork {
  struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
};
// The check is an exact size (0x10*sizeof(int)), not merely "any power of two".
static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");

// Per-channel state. The named fields share storage with data[0x80], which
// fixes the struct size at 0x80 ints (checked by the static_assert below);
// the fields must therefore never outgrow that array.
struct ncclChannel {
  union {
    struct {
      // Topologies this channel can use.
      struct ncclRing ring;
      struct ncclTree tree;
      struct ncclDirect collTree;

      int id;

      // Communication structures
      struct ncclPeer* peers;
      struct ncclPeer* devPeers; // NOTE(review): presumably the device-side counterpart of peers -- confirm

      // Operation list for aggregation
      struct ncclWork* workFifo;
      int workCount;
      uint64_t workFifoTail; // Only used by CPU
      uint16_t index;        // Only used by GPU

      // GDRCOPY support
      struct ncclWork* workFifoGdr;
      struct ncclWork* workFifoDev;
      void* gdrMemDesc;
    };
    int data[0x80]; // Size-fixing overlay
  };
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");

// Communicator data referenced from work elements (ncclWorkElem::comm):
// rank information, per-protocol buffer sizes, the abort flag, and the
// channel array.
struct ncclDevComm {
  int rank;
  int nRanks;
  int buffSizes[NCCL_NUM_PROTOCOLS]; // One size per protocol (NCCL_PROTO_*)

  // Flag to ask NCCL kernels to abort
  volatile uint32_t *abortFlag;

  // Channels, device side
  struct ncclChannel* channels;
};

#endif
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`/*************************************************************************`
2.9.6-1 2021-04-12 16:00:11 -07:00			`* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`*`
			`* See LICENSE.txt for license information`
			`************************************************************************/`

			`#ifndef NCCL_DEVICE_H_`
			`#define NCCL_DEVICE_H_`

			`#include "nccl.h"`
2.6.4-1 2020-01-16 16:02:42 -08:00			`#include "align.h"`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`#include <stdint.h>`

2.7.3-1 2020-05-12 14:40:18 -07:00			`#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now`
2.8.3-1 2020-09-04 14:35:05 -07:00			`typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;`
2.7.3-1 2020-05-12 14:40:18 -07:00			`extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];`

			`#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet`
			`#define NCCL_ALGO_TREE 0`
			`#define NCCL_ALGO_RING 1`
			`#define NCCL_ALGO_COLLNET 2`
			`extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];`

			`#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128`
			`#define NCCL_PROTO_LL 0`
			`#define NCCL_PROTO_LL128 1`
			`#define NCCL_PROTO_SIMPLE 2`
			`extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];`

NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`#define NCCL_MAX_OPS 2048`
			`#define NCCL_STEPS 8`

			`union ncclLLFifoLine {`
			`/* Flags have to be *after* data, because otherwise, an incomplete receive`
			`from the network may receive the flag but not the data.`
			`Note this is assuming that either we receive contiguous chunks of data`
			`(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */`
			`struct {`
			`uint32_t data1;`
			`uint32_t flag1;`
			`uint32_t data2;`
			`uint32_t flag2;`
			`};`
			`uint64_t v[2];`
			`int4 i4;`
			`};`

2.5.6-1 (#255 ) 2019-11-19 14:57:39 -08:00			`#define WARP_SIZE 32`
			`#define MAXCHANNELS 32`
2.8.3-1 2020-09-04 14:35:05 -07:00			`#define NCCL_MAX_NTHREADS 640`
			`#define NCCL_SIMPLE_MAX_NTHREADS 512`
			`#define NCCL_LL_MAX_NTHREADS 512`
2.5.6-1 (#255 ) 2019-11-19 14:57:39 -08:00			`#define NCCL_LL_LINES_PER_THREAD 8`
			`#ifdef TEST_LL_CLEANUP`
			`#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup`
			`#define NCCL_LL_FLAG_MAX 0x100`
			`#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`#else`
			`#define NCCL_LL_CLEAN_MASK 0x7ffffff8`
			`#define NCCL_LL_FLAG(a) ((uint32_t)(a))`
			`#endif`
			`// Make sure the clean mask will last for at least NCCL_NSTEPS`
			`static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");`

2.5.6-1 (#255 ) 2019-11-19 14:57:39 -08:00			`#define NCCL_LL128_LINESIZE 128`
			`#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))`
			`#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)`

			`#define NCCL_LL128_MAX_NTHREADS 640`
			`#define NCCL_LL128_ELEMS_PER_THREAD 120`

			`// Receiving from up to 3 sources is more compute intensive than sending`
			`// to 3 dests. Use 70% for reduce and 30% for bcast.`
			`#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)`

			`#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8`
			`#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)`

2.6.4-1 2020-01-16 16:02:42 -08:00			`#define NCCL_DIRECT_GPU 0x01`
			`#define NCCL_DIRECT_NIC 0x10`

NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`struct ncclConnInfo {`
			`// Regular comm mechanism`
2.7.3-1 2020-05-12 14:40:18 -07:00			`char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`uint64_t *tail; // Local for recv, remote for send`
			`uint64_t *head; // Local for send, remote for recv`

			`int direct; // Direct communication`
2.8.3-1 2020-09-04 14:35:05 -07:00			`int shared; // Buffers are shared`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`void **ptrExchange; // Pointer exchange for direct communication`

2.8.3-1 2020-09-04 14:35:05 -07:00			`int *sizesFifo; // Sizes fifo from GPU to proxy`
			`void* *ptrsFifo; // Buffer fifo from proxy to GPU`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
			`uint64_t step; // Keep where we are`
			`uint64_t llLastCleaning;`
			`};`

			`struct ncclConnector {`
			`int connected;`
			`struct ncclProxyArgs *proxyAppend;`
2.9.6-1 2021-04-12 16:00:11 -07:00			`struct ncclProxyArgs **proxyAppendPtr;`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`struct ncclTransportComm* transportComm;`
2.9.6-1 2021-04-12 16:00:11 -07:00			`void* transportResources;`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`struct ncclConnInfo conn;`
			`struct ncclComm *comm;`
			`};`

			`struct ncclRing {`
			`// Shortcuts for userRanks[1] and userRanks[n-1]`
			`int prev;`
			`int next;`

			`// Maps an internal nccl index to user-specified rank order. This is necessary`
			`// since we need to know how the user expects data to be ordered across`
			`// devices. Ordered from current device.`
			`int* userRanks;`
			`int* devUserRanks;`
			`};`


			`#define NCCL_MAX_TREE_ARITY 3`
			`struct ncclTree {`
			`int depth;`
			`int up;`
			`int down[NCCL_MAX_TREE_ARITY];`
			`};`

2.9.6-1 2021-04-12 16:00:11 -07:00			`#define NCCL_MAX_DIRECT_ARITY 7`
			`struct ncclDirect {`
			`int depth;`
			`int out;`
			`int nHeads;`
			`int headRank;`
			`int shift;`
			`int up[NCCL_MAX_DIRECT_ARITY];`
			`int down[NCCL_MAX_DIRECT_ARITY];`
			`};`

			`#define NCCL_MAX_CONNS 2`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`struct ncclPeer {`
2.9.6-1 2021-04-12 16:00:11 -07:00			`struct ncclConnector send[NCCL_MAX_CONNS];`
			`struct ncclConnector recv[NCCL_MAX_CONNS];`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`};`

			`struct ncclDevComm;`

2.8.3-1 2020-09-04 14:35:05 -07:00			`#define NCCL_MAX_WORK_ELEMENTS 8`
			`#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)`

			`/* ncclWork is to be a power of two, currently 8x64 bytes, */`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`/* to make sure reads to host from the CUDA kernel are aligned. */`
2.8.3-1 2020-09-04 14:35:05 -07:00			`/* Make sure to adjust padding at the end of ncclWorkElem. */`
			`struct ncclWorkElem {`
			`// Header`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`struct ncclDevComm* comm;`
2.8.3-1 2020-09-04 14:35:05 -07:00			`uint16_t nThreads;`
			`uint16_t funcIndex;`
			`uint16_t index;`
			`uint16_t active;`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
2.7.3-1 2020-05-12 14:40:18 -07:00			`const void * sendbuff;`
			`void * recvbuff;`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
2.8.3-1 2020-09-04 14:35:05 -07:00			`// Op-specific fields.`
2.7.3-1 2020-05-12 14:40:18 -07:00			`union {`
			`struct {`
			`size_t count;`
			`size_t lastChunkSize;`
2.8.3-1 2020-09-04 14:35:05 -07:00			`uint32_t root;`
			`uint8_t bid;`
			`uint8_t nChannels;`
2.7.3-1 2020-05-12 14:40:18 -07:00			`} coll;`
			`struct {`
			`size_t sendCount;`
			`size_t recvCount;`
2.9.6-1 2021-04-12 16:00:11 -07:00			`int sendChunkSize;`
			`int recvChunkSize;`
2.8.3-1 2020-09-04 14:35:05 -07:00			`int32_t delta;`
			`uint16_t nThreads;`
2.7.3-1 2020-05-12 14:40:18 -07:00			`} p2p;`
2.8.3-1 2020-09-04 14:35:05 -07:00			`uint64_t align[4];`
2.7.3-1 2020-05-12 14:40:18 -07:00			`};`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`};`
2.8.3-1 2020-09-04 14:35:05 -07:00			`struct ncclWork {`
			`struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`};`
2.8.3-1 2020-09-04 14:35:05 -07:00			`static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
			`struct ncclChannel {`
			`union {`
			`struct {`
			`struct ncclRing ring;`
2.8.3-1 2020-09-04 14:35:05 -07:00			`struct ncclTree tree;`
2.9.6-1 2021-04-12 16:00:11 -07:00			`struct ncclDirect collTree;`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
			`int id;`

			`// Communication structures`
			`struct ncclPeer* peers;`
			`struct ncclPeer* devPeers;`

			`// Operation list for aggregation`
2.8.3-1 2020-09-04 14:35:05 -07:00			`struct ncclWork* workFifo;`
			`int workCount;`
			`uint64_t workFifoTail; // Only used by CPU`
2.9.6-1 2021-04-12 16:00:11 -07:00			`uint16_t index; // Only used by GPU`

			`// GDRCOPY support`
			`struct ncclWork* workFifoGdr;`
			`struct ncclWork* workFifoDev;`
			`void* gdrMemDesc;`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`};`
			`int data[0x80];`
			`};`
			`};`
			`static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");`

			`struct ncclDevComm {`
			`int rank;`
			`int nRanks;`
2.7.3-1 2020-05-12 14:40:18 -07:00			`int buffSizes[NCCL_NUM_PROTOCOLS];`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
			`// Flag to ask NCCL kernels to abort`
			`volatile uint32_t *abortFlag;`

			`// Channels, device side`
			`struct ncclChannel* channels;`
			`};`

			`#endif`