2019-03-14 19:39:20 -07:00
|
|
|
/*************************************************************************
|
2022-01-07 06:39:55 -08:00
|
|
|
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
2022-04-18 11:14:51 -07:00
|
|
|
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
2019-03-14 19:39:20 -07:00
|
|
|
*
|
|
|
|
|
* See LICENSE.txt for license information
|
|
|
|
|
************************************************************************/
|
|
|
|
|
|
|
|
|
|
#ifndef NCCL_DEVICE_H_
|
|
|
|
|
#define NCCL_DEVICE_H_
|
|
|
|
|
|
|
|
|
|
#include "nccl.h"
|
2019-11-15 10:39:48 -08:00
|
|
|
#include "rccl_bfloat16.h"
|
2020-01-16 16:02:42 -08:00
|
|
|
#include "align.h"
|
2022-06-21 05:30:19 +08:00
|
|
|
#if defined(ENABLE_NPKIT)
|
|
|
|
|
#include "npkit/npkit_struct.h"
|
|
|
|
|
#endif
|
2019-03-14 19:39:20 -07:00
|
|
|
#include <stdint.h>
|
|
|
|
|
|
2021-01-28 09:45:01 -07:00
|
|
|
|
2022-02-21 13:09:47 +08:00
|
|
|
#define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
|
2022-04-18 11:14:51 -07:00
|
|
|
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
|
2022-02-21 13:09:47 +08:00
|
|
|
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
|
2020-05-12 14:40:18 -07:00
|
|
|
|
2023-02-27 02:48:21 -08:00
|
|
|
#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
|
2020-05-12 14:40:18 -07:00
|
|
|
#define NCCL_ALGO_TREE 0
|
|
|
|
|
#define NCCL_ALGO_RING 1
|
2022-08-18 02:53:17 -07:00
|
|
|
#define NCCL_ALGO_COLLNET_DIRECT 2
|
|
|
|
|
#define NCCL_ALGO_COLLNET_CHAIN 3
|
2023-02-27 02:48:21 -08:00
|
|
|
#define NCCL_ALGO_NVLS 4
|
2020-05-12 14:40:18 -07:00
|
|
|
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
|
|
|
|
|
|
|
|
|
|
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
|
|
|
|
#define NCCL_PROTO_LL 0
|
|
|
|
|
#define NCCL_PROTO_LL128 1
|
|
|
|
|
#define NCCL_PROTO_SIMPLE 2
|
|
|
|
|
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
|
|
|
|
|
|
2019-03-14 19:39:20 -07:00
|
|
|
#define NCCL_MAX_OPS 2048
|
|
|
|
|
#define NCCL_STEPS 8
|
|
|
|
|
|
|
|
|
|
union ncclLLFifoLine {
|
|
|
|
|
/* Flags have to be *after* data, because otherwise, an incomplete receive
|
|
|
|
|
from the network may receive the flag but not the data.
|
|
|
|
|
Note this is assuming that either we receive contiguous chunks of data
|
|
|
|
|
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
|
|
|
|
|
struct {
|
|
|
|
|
uint32_t data1;
|
|
|
|
|
uint32_t flag1;
|
|
|
|
|
uint32_t data2;
|
|
|
|
|
uint32_t flag2;
|
|
|
|
|
};
|
|
|
|
|
uint64_t v[2];
|
|
|
|
|
int4 i4;
|
|
|
|
|
};
|
|
|
|
|
|
2023-06-06 16:45:50 -06:00
|
|
|
#define WARP_SIZE warpSize
|
2019-11-19 14:57:39 -08:00
|
|
|
#define MAXCHANNELS 32
|
2019-11-21 13:41:10 -08:00
|
|
|
#define NCCL_MAX_NTHREADS 256
|
2020-12-01 11:33:47 -05:00
|
|
|
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
|
2019-11-19 14:57:39 -08:00
|
|
|
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
|
|
|
|
|
#define NCCL_LL_LINES_PER_THREAD 8
|
|
|
|
|
#ifdef TEST_LL_CLEANUP
|
|
|
|
|
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
|
|
|
|
|
#define NCCL_LL_FLAG_MAX 0x100
|
|
|
|
|
#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
|
2019-03-14 19:39:20 -07:00
|
|
|
#else
|
|
|
|
|
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
|
|
|
|
|
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
|
|
|
|
|
#endif
|
|
|
|
|
// Make sure the clean mask will last for at least NCCL_NSTEPS
|
|
|
|
|
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
|
|
|
|
|
|
2022-09-08 14:45:27 -07:00
|
|
|
#define NCCL_LL128_LINESIZE 64
|
2019-11-19 14:57:39 -08:00
|
|
|
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
|
|
|
|
|
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
|
|
|
|
|
|
2019-11-21 13:41:10 -08:00
|
|
|
#define NCCL_LL128_MAX_NTHREADS 256
|
2022-09-08 14:45:27 -07:00
|
|
|
#define NCCL_LL128_ELEMS_PER_THREAD 28
|
2019-11-19 14:57:39 -08:00
|
|
|
|
2022-09-08 14:45:27 -07:00
|
|
|
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
|
2019-11-19 14:57:39 -08:00
|
|
|
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
|
|
|
|
|
2021-09-08 13:56:25 -07:00
|
|
|
#define NCCL_DIRECT_WRITE 0x01
|
|
|
|
|
#define NCCL_DIRECT_READ 0x02
|
|
|
|
|
#define NCCL_DIRECT_NIC 0x04
|
|
|
|
|
#define NCCL_IPC_WRITE 0x08
|
|
|
|
|
#define NCCL_IPC_READ 0x10
|
2023-02-27 02:48:21 -08:00
|
|
|
#define NCCL_NVLS_MIN_POLL 0x20
|
2020-01-16 16:02:42 -08:00
|
|
|
|
2019-03-14 19:39:20 -07:00
|
|
|
struct ncclConnInfo {
|
|
|
|
|
// Regular comm mechanism
|
2020-05-12 14:40:18 -07:00
|
|
|
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
|
2019-03-14 19:39:20 -07:00
|
|
|
uint64_t *tail; // Local for recv, remote for send
|
|
|
|
|
uint64_t *head; // Local for send, remote for recv
|
|
|
|
|
|
2023-02-27 02:48:21 -08:00
|
|
|
int flags; // Direct communication / other flags
|
2020-09-04 14:35:05 -07:00
|
|
|
int shared; // Buffers are shared
|
2019-03-14 19:39:20 -07:00
|
|
|
void **ptrExchange; // Pointer exchange for direct communication
|
2021-09-08 13:56:25 -07:00
|
|
|
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
|
2019-03-14 19:39:20 -07:00
|
|
|
|
2020-09-04 14:35:05 -07:00
|
|
|
int *sizesFifo; // Sizes fifo from GPU to proxy
|
2022-01-07 06:39:55 -08:00
|
|
|
int *offsFifo; // Buffer fifo from proxy to GPU
|
2019-03-14 19:39:20 -07:00
|
|
|
|
|
|
|
|
uint64_t step; // Keep where we are
|
|
|
|
|
uint64_t llLastCleaning;
|
2019-07-05 15:43:00 -07:00
|
|
|
|
|
|
|
|
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
|
|
|
|
|
// allows software to explicitly initiate a flush read to HDP memory. See more
|
|
|
|
|
// descriptions in primitives.h.
|
|
|
|
|
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
2021-02-03 02:48:30 +00:00
|
|
|
uint32_t* curr_hdp_reg; // Current GPU's HDP register
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclProxyConnector {
|
|
|
|
|
int rank;
|
|
|
|
|
int localRank;
|
|
|
|
|
struct ncclProxyConnection* connection;
|
|
|
|
|
struct ncclComm* comm;
|
|
|
|
|
};
|
|
|
|
|
|
2019-03-14 19:39:20 -07:00
|
|
|
struct ncclConnector {
|
|
|
|
|
int connected;
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclProxyConnector proxyConn;
|
2019-03-14 19:39:20 -07:00
|
|
|
struct ncclTransportComm* transportComm;
|
2021-04-12 16:00:11 -07:00
|
|
|
void* transportResources;
|
2019-03-14 19:39:20 -07:00
|
|
|
struct ncclConnInfo conn;
|
|
|
|
|
struct ncclComm *comm;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ncclRing {
|
|
|
|
|
// Shortcuts for userRanks[1] and userRanks[n-1]
|
|
|
|
|
int prev;
|
|
|
|
|
int next;
|
|
|
|
|
|
|
|
|
|
// Maps an internal nccl index to user-specified rank order. This is necessary
|
|
|
|
|
// since we need to know how the user expects data to be ordered across
|
|
|
|
|
// devices. Ordered from current device.
|
|
|
|
|
int* userRanks;
|
2021-07-08 14:12:04 -07:00
|
|
|
|
|
|
|
|
int index; // This rank's index in the ring
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define NCCL_MAX_TREE_ARITY 3
|
|
|
|
|
struct ncclTree {
|
|
|
|
|
int depth;
|
|
|
|
|
int up;
|
|
|
|
|
int down[NCCL_MAX_TREE_ARITY];
|
|
|
|
|
};
|
|
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
#define NCCL_MAX_DIRECT_ARITY 7
|
|
|
|
|
struct ncclDirect {
|
|
|
|
|
int depth;
|
|
|
|
|
int out;
|
2023-02-27 02:48:21 -08:00
|
|
|
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
|
|
|
|
|
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
|
|
|
|
|
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
|
2021-04-12 16:00:11 -07:00
|
|
|
int up[NCCL_MAX_DIRECT_ARITY];
|
|
|
|
|
int down[NCCL_MAX_DIRECT_ARITY];
|
|
|
|
|
};
|
|
|
|
|
|
2021-04-30 16:57:36 -07:00
|
|
|
#define NCCL_CONN_IDX_P2P_NET 2
|
2023-02-27 02:48:21 -08:00
|
|
|
#define NCCL_MAX_NVLS_ARITY 8
|
|
|
|
|
struct ncclNvls {
|
|
|
|
|
int out;
|
|
|
|
|
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
|
|
|
|
|
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
|
|
|
|
|
int up[NCCL_MAX_NVLS_ARITY];
|
|
|
|
|
int down;
|
|
|
|
|
};
|
|
|
|
|
|
2021-04-30 16:57:36 -07:00
|
|
|
#define NCCL_MAX_CONNS 3
|
2022-05-24 02:02:31 -07:00
|
|
|
struct ncclChannelPeer {
|
2021-04-12 16:00:11 -07:00
|
|
|
struct ncclConnector send[NCCL_MAX_CONNS];
|
|
|
|
|
struct ncclConnector recv[NCCL_MAX_CONNS];
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ncclDevComm;
|
|
|
|
|
|
2019-07-05 15:43:00 -07:00
|
|
|
#pragma pack(push) /* push current alignment to stack */
|
2021-11-11 14:21:51 -08:00
|
|
|
#pragma pack(8) /* set alignment to 8 bytes boundary */
|
2020-09-04 14:35:05 -07:00
|
|
|
/* ncclWork is to be a power of two, currently 8x64 bytes, */
|
2019-03-14 19:39:20 -07:00
|
|
|
/* to make sure reads to host from the CUDA kernel are aligned. */
|
2020-09-04 14:35:05 -07:00
|
|
|
/* Make sure to adjust padding at the end of ncclWorkElem. */
|
2022-04-18 11:14:51 -07:00
|
|
|
#define NCCL_WORK_SIZE 256
|
2022-01-07 06:39:55 -08:00
|
|
|
|
2022-05-24 02:02:31 -07:00
|
|
|
enum ncclWorkType : uint8_t {
|
2022-01-07 06:39:55 -08:00
|
|
|
ncclWorkTypeUnused=0,
|
|
|
|
|
ncclWorkTypeColl=1,
|
|
|
|
|
ncclWorkTypeP2p=2,
|
|
|
|
|
ncclWorkTypeRegColl=3
|
|
|
|
|
};
|
2022-05-24 02:02:31 -07:00
|
|
|
enum ncclWorkP2PType : uint8_t {
|
|
|
|
|
ncclWorkP2pTypeUnused=0,
|
|
|
|
|
ncclWorkP2pTypeSend,
|
|
|
|
|
ncclWorkP2pTypeRecv
|
2022-01-07 06:39:55 -08:00
|
|
|
};
|
|
|
|
|
|
2022-05-24 02:02:31 -07:00
|
|
|
struct ncclWorkHeader {
|
|
|
|
|
union {
|
|
|
|
|
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
|
|
|
|
|
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
|
|
|
|
|
};
|
2020-09-04 14:35:05 -07:00
|
|
|
uint16_t funcIndex;
|
2022-05-24 02:02:31 -07:00
|
|
|
uint8_t isLast:1; // last work for this kernel
|
|
|
|
|
uint8_t inFifo:1; // is this work in the fifo
|
|
|
|
|
enum ncclWorkType type;
|
2022-01-07 06:39:55 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ncclWorkElem {
|
2022-05-24 02:02:31 -07:00
|
|
|
union {
|
|
|
|
|
uint8_t flagBits;
|
|
|
|
|
struct {
|
2022-09-14 00:10:19 +00:00
|
|
|
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, nWarps:5;
|
2022-05-24 02:02:31 -07:00
|
|
|
};
|
|
|
|
|
};
|
2021-09-08 13:56:25 -07:00
|
|
|
uint8_t direct;
|
2022-09-09 01:20:52 +00:00
|
|
|
uint8_t bid;
|
|
|
|
|
uint8_t nChannels;
|
|
|
|
|
struct {
|
2022-09-14 00:10:19 +00:00
|
|
|
uint32_t root:28;
|
|
|
|
|
uint32_t pad_0:2;
|
2022-09-09 01:20:52 +00:00
|
|
|
uint32_t connIndex:2;
|
|
|
|
|
};
|
2019-03-14 19:39:20 -07:00
|
|
|
|
2020-05-12 14:40:18 -07:00
|
|
|
const void * sendbuff;
|
|
|
|
|
void * recvbuff;
|
2019-03-14 19:39:20 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
size_t count;
|
2020-05-12 14:40:18 -07:00
|
|
|
union {
|
2022-04-18 11:14:51 -07:00
|
|
|
size_t lastChunkSize;
|
|
|
|
|
// Pivot A2A kernel computes chunk size itself.
|
|
|
|
|
// Instead, it needs the number of bidirectional rings.
|
|
|
|
|
size_t pivotA2ANumBiRings;
|
2020-05-12 14:40:18 -07:00
|
|
|
};
|
2022-01-07 06:39:55 -08:00
|
|
|
uint64_t redOpArg;
|
2022-04-18 11:14:51 -07:00
|
|
|
uint64_t opCount;
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
2022-05-24 02:02:31 -07:00
|
|
|
|
2022-09-09 01:20:52 +00:00
|
|
|
static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
|
|
|
|
|
#define NCCL_MAX_WORK_ELEMENTS 1
|
2022-01-07 06:39:55 -08:00
|
|
|
|
|
|
|
|
struct ncclWorkElemP2p {
|
2022-09-09 01:20:52 +00:00
|
|
|
struct {
|
2022-10-20 15:40:03 +00:00
|
|
|
int32_t peer:28;
|
2022-09-09 01:20:52 +00:00
|
|
|
uint32_t connIndex:2;
|
2022-10-20 15:40:03 +00:00
|
|
|
int32_t proto:2;
|
2022-09-09 01:20:52 +00:00
|
|
|
};
|
|
|
|
|
union {
|
|
|
|
|
uint16_t flagBits;
|
|
|
|
|
struct {
|
|
|
|
|
enum ncclWorkP2PType p2pType:4;
|
|
|
|
|
uint16_t nWarps:4;
|
|
|
|
|
uint16_t warpStart:4;
|
|
|
|
|
uint16_t ngroups:4;
|
|
|
|
|
};
|
|
|
|
|
};
|
|
|
|
|
uint16_t opCount;
|
2022-05-24 02:02:31 -07:00
|
|
|
// Important not to use any fields with greater than 4-byte alignment since
|
|
|
|
|
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
|
|
|
|
|
// there were 8-byte fields.
|
|
|
|
|
//void* buff;
|
|
|
|
|
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
|
|
|
|
|
//size_t count;
|
|
|
|
|
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
|
2022-01-07 06:39:55 -08:00
|
|
|
int chunkSize;
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
2022-05-24 02:02:31 -07:00
|
|
|
|
2022-09-09 01:20:52 +00:00
|
|
|
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) == 8, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 8");
|
|
|
|
|
#define NCCL_MAX_WORK_ELEMENTS_P2P 2
|
2021-09-08 13:56:25 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclWorkElemReg {
|
2021-09-08 13:56:25 -07:00
|
|
|
struct ncclWorkElem elem;
|
|
|
|
|
void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
|
|
|
|
|
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
|
|
|
|
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
|
|
|
|
};
|
2022-01-07 06:39:55 -08:00
|
|
|
|
2022-05-24 02:02:31 -07:00
|
|
|
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
|
2022-09-09 01:20:52 +00:00
|
|
|
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 1");
|
2022-05-24 02:02:31 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Number of named barriers supported by CUDA
|
2022-04-18 11:14:51 -07:00
|
|
|
#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
2021-09-08 13:56:25 -07:00
|
|
|
|
2020-09-04 14:35:05 -07:00
|
|
|
struct ncclWork {
|
2022-05-24 02:02:31 -07:00
|
|
|
struct ncclWorkHeader header;
|
2021-09-08 13:56:25 -07:00
|
|
|
union {
|
2022-05-24 02:02:31 -07:00
|
|
|
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
|
2021-09-08 13:56:25 -07:00
|
|
|
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
|
|
|
|
|
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
|
2021-09-08 13:56:25 -07:00
|
|
|
};
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
2022-05-24 02:02:31 -07:00
|
|
|
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
|
|
|
|
|
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
|
|
|
|
|
|
|
|
|
|
struct ncclDevChannelPeer {
|
|
|
|
|
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
|
|
|
|
|
// instead of the full ncclConnector.
|
|
|
|
|
struct ncclConnInfo send[NCCL_MAX_CONNS];
|
|
|
|
|
struct ncclConnInfo recv[NCCL_MAX_CONNS];
|
2022-01-07 06:39:55 -08:00
|
|
|
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
2019-07-05 15:43:00 -07:00
|
|
|
#pragma pack(pop) /* restore original alignment from stack */
|
2019-03-14 19:39:20 -07:00
|
|
|
|
2019-07-05 15:43:00 -07:00
|
|
|
#ifdef ENABLE_PROFILING
|
2022-08-18 15:34:46 -07:00
|
|
|
#define PROFILE_NUM_ITEMS 31
|
|
|
|
|
#define PROFILE_NUM_LAUNCHES 1024
|
2021-09-08 14:20:32 -05:00
|
|
|
|
|
|
|
|
struct ncclProf {
|
2022-08-18 15:34:46 -07:00
|
|
|
uint32_t count;
|
|
|
|
|
uint32_t seq; // only entry from first launch is used
|
|
|
|
|
struct {
|
|
|
|
|
uint64_t line:16;
|
|
|
|
|
uint64_t timeStamp:48;
|
|
|
|
|
} elem[PROFILE_NUM_ITEMS];
|
2021-09-08 14:20:32 -05:00
|
|
|
};
|
2022-08-18 15:34:46 -07:00
|
|
|
static_assert(sizeof(struct ncclProf) == 256, "ncclProf must have size of 256");
|
2019-07-05 15:43:00 -07:00
|
|
|
#endif
|
|
|
|
|
|
2019-11-26 16:33:13 -08:00
|
|
|
#ifdef ENABLE_COLLTRACE
|
|
|
|
|
typedef enum {
|
2022-04-18 11:14:51 -07:00
|
|
|
ncclCollTraceNotReady = 0,
|
|
|
|
|
ncclCollTraceKernelLaunchType = 1,
|
|
|
|
|
ncclCollTraceKernelEndType = 2,
|
|
|
|
|
ncclCollTraceCollLaunchType = 3,
|
|
|
|
|
ncclCollTraceAbortType = 4,
|
|
|
|
|
ncclCollTraceDataType = 5,
|
|
|
|
|
ncclCollTraceCollElemType = (1<<4),
|
|
|
|
|
ncclCollTraceP2pElemType = (1<<5),
|
2019-11-26 16:33:13 -08:00
|
|
|
} ncclCollTraceDataType_t;
|
|
|
|
|
|
|
|
|
|
struct ncclCollTrace {
|
|
|
|
|
uint8_t type;
|
|
|
|
|
uint8_t bid;
|
|
|
|
|
int16_t funcIndex;
|
|
|
|
|
uint32_t data_0;
|
|
|
|
|
uint64_t timeStamp;
|
2022-04-18 11:14:51 -07:00
|
|
|
union {
|
|
|
|
|
uint64_t opCount;
|
|
|
|
|
uint32_t p2pOpCount[2];
|
|
|
|
|
};
|
2020-12-22 13:28:21 -05:00
|
|
|
union {
|
|
|
|
|
uint64_t data_1;
|
|
|
|
|
struct {
|
2022-04-18 11:14:51 -07:00
|
|
|
uint8_t nWarps;
|
2020-12-22 13:28:21 -05:00
|
|
|
uint8_t bid;
|
|
|
|
|
uint8_t nChannels;
|
|
|
|
|
} coll;
|
|
|
|
|
struct {
|
2022-04-18 11:14:51 -07:00
|
|
|
int16_t peer;
|
|
|
|
|
uint8_t ngroups:4;
|
|
|
|
|
uint8_t connIndex:4;
|
|
|
|
|
uint8_t warpStart:4;
|
|
|
|
|
uint8_t nWarps:4;
|
|
|
|
|
} p2p[2];
|
2020-12-22 13:28:21 -05:00
|
|
|
};
|
2019-11-26 16:33:13 -08:00
|
|
|
};
|
|
|
|
|
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
|
|
|
|
|
|
2020-12-01 11:33:47 -05:00
|
|
|
#define COLLTRACE_NUM_ITEMS 8192
|
2019-11-26 16:33:13 -08:00
|
|
|
#endif
|
|
|
|
|
|
2022-05-24 02:02:31 -07:00
|
|
|
struct alignas(16) ncclDevChannel {
|
|
|
|
|
struct ncclDevChannelPeer *peers;
|
|
|
|
|
struct ncclRing ring;
|
|
|
|
|
struct ncclTree tree;
|
2022-08-18 02:53:17 -07:00
|
|
|
struct ncclTree collnetChain;
|
|
|
|
|
struct ncclDirect collnetDirect;
|
2022-09-13 17:19:04 -05:00
|
|
|
struct ncclTree binTree;
|
2023-02-27 02:48:21 -08:00
|
|
|
struct ncclNvls nvls;
|
2022-05-24 02:02:31 -07:00
|
|
|
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ncclDevComm {
|
|
|
|
|
int rank;
|
|
|
|
|
int nRanks;
|
2020-05-12 14:40:18 -07:00
|
|
|
int buffSizes[NCCL_NUM_PROTOCOLS];
|
2019-03-14 19:39:20 -07:00
|
|
|
|
2022-05-24 02:02:31 -07:00
|
|
|
// Operation list for aggregation
|
|
|
|
|
int workFifoDepth;
|
|
|
|
|
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
|
|
|
|
|
|
2019-03-14 19:39:20 -07:00
|
|
|
// Flag to ask NCCL kernels to abort
|
2022-05-24 02:02:31 -07:00
|
|
|
volatile uint32_t* abortFlag;
|
2019-03-14 19:39:20 -07:00
|
|
|
|
|
|
|
|
// Channels, device side
|
2022-05-24 02:02:31 -07:00
|
|
|
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
|
2019-07-05 15:43:00 -07:00
|
|
|
|
2022-06-21 05:30:19 +08:00
|
|
|
#if defined(ENABLE_NPKIT)
|
|
|
|
|
NpKitEventCollectContext* npKitEventCollectContexts;
|
2023-05-24 22:41:05 +08:00
|
|
|
uint64_t* cpuTimestamp;
|
2022-06-21 05:30:19 +08:00
|
|
|
#endif
|
|
|
|
|
|
2019-11-26 16:33:13 -08:00
|
|
|
#ifdef ENABLE_COLLTRACE
|
|
|
|
|
struct ncclCollTrace* collTrace;
|
2022-09-09 01:20:52 +00:00
|
|
|
volatile uint32_t *collTraceTail;
|
2019-11-26 16:33:13 -08:00
|
|
|
pthread_t collTraceThread;
|
2022-09-09 01:20:52 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#ifdef ENABLE_PROFILING
|
|
|
|
|
struct ncclProf* devProf;
|
2019-11-26 16:33:13 -08:00
|
|
|
#endif
|
2019-03-14 19:39:20 -07:00
|
|
|
};
|
|
|
|
|
|
2022-05-24 02:02:31 -07:00
|
|
|
struct alignas(16) ncclDevCommAndChannels {
|
|
|
|
|
struct ncclDevComm comm;
|
|
|
|
|
struct ncclDevChannel channels[MAXCHANNELS];
|
2021-07-08 14:12:04 -07:00
|
|
|
};
|
|
|
|
|
|
2023-02-27 02:48:21 -08:00
|
|
|
#ifdef __CUDA_ARCH__
|
|
|
|
|
#define NCCL_CUDA_ARCH __CUDA_ARCH__
|
|
|
|
|
#else
|
|
|
|
|
#define NCCL_CUDA_ARCH 0
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
|
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
|
|
|
|
|
template<typename T, typename ...Ts>
|
|
|
|
|
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) {
|
|
|
|
|
return min_constexpr<T>((a < b ? a : b), c...);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
|
__host__ __device__ constexpr T max_constexpr(T a) { return a; }
|
|
|
|
|
template<typename T, typename ...Ts>
|
|
|
|
|
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
|
|
|
|
|
return max_constexpr<T>((a > b ? a : b), c...);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Calculate the unroll factor given:
|
|
|
|
|
// * bytePerPack: number of bytes accessed per instruction
|
|
|
|
|
// * insns: max permissible unroll value
|
|
|
|
|
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
|
|
|
|
|
__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
|
|
|
|
|
return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Note that all unroll value logic should depend on a given cudaArch argument
|
|
|
|
|
// and not __CUDA_ARCH__ since these need to be host-side executable where the
|
|
|
|
|
// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device
|
|
|
|
|
// side code can elide passing the arch for brevity.
|
|
|
|
|
|
|
|
|
|
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
|
|
|
|
|
// Our collective unroll should move to the same bytes&insns model as NVLS.
|
|
|
|
|
return cudaArch >= 800 ? 8 : 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
|
|
|
|
|
__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; }
|
|
|
|
|
|
|
|
|
|
__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) {
|
|
|
|
|
return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The amount of dynamic shmem per warp
|
|
|
|
|
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
|
|
|
|
|
return (max_constexpr<int>(
|
|
|
|
|
/*LL */0,
|
|
|
|
|
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
|
|
|
|
|
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16,
|
|
|
|
|
// NVLS needs an extra 16B to read unaligned data.
|
|
|
|
|
/*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
|
|
|
|
|
) + 15) & -16; // pad to 16 bytes
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The amount of dynamic shmem per block
|
|
|
|
|
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
|
|
|
|
|
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
|
|
|
|
|
}
|
|
|
|
|
|
2019-03-14 19:39:20 -07:00
|
|
|
#endif
|