Files
rocm-systems/src/include/comm.h
T

255 строки
6.9 KiB
C

/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#include "transport.h"
#include "p2p.h"
// [RCCL]
//#include "clique/CliqueManager.h"
// [/RCCL]
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define HIPRT_CB
#else
#include "collectives.h"
#if CUDART_VERSION < 9000
// Fallback declaration of cudaLaunchParams. It is only compiled on non-HIP
// builds whose CUDART_VERSION is below 9000 (see the enclosing #if / #else).
// NOTE(review): presumably mirrors the struct of the same name that newer CUDA
// runtimes provide themselves -- confirm field order against the runtime header.
// The field names suggest the pieces of a kernel launch: function, grid and
// block dimensions, argument array, shared-memory size and stream.
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#endif
// Granularity (in bytes) to which the fields of ncclSendMem / ncclRecvMem are padded.
#define CACHE_LINE_SIZE 64
// Size (in bytes) of the pad member that sets the minimum size of ncclSendMem / ncclRecvMem.
#define MEM_ALIGN 4096
// 2 MiB (2 * 1024 * 1024). Not referenced in the visible part of this header.
// NOTE(review): presumably a minimum size related to CUDA IPC allocations -- confirm at the use site.
#define CUDA_IPC_MIN 2097152UL
// Channels / LL tuning
// NOTE(review): one threshold per protocol (LL, LL128, Simple); the consumers
// are outside this header -- presumably they seed ncclComm::threadThresholds.
#define NCCL_LL_THREAD_THRESHOLD 8
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
// Memory block holding the `head` counter plus exchange slots and an offsets
// FIFO. The anonymous union with pad3 makes the object at least MEM_ALIGN
// bytes large.
// NOTE(review): who writes and who reads each field is not visible in this
// header -- confirm against the transport code.
struct ncclSendMem {
union {
struct {
uint64_t head;
// Pads `head` out to CACHE_LINE_SIZE bytes.
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
uint64_t redOpArgExchange[2];
// Pads ptrExchange + redOpArgExchange out to CACHE_LINE_SIZE bytes.
char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)];
// One entry per step; NCCL_STEPS is defined outside this header.
int offsFifo[NCCL_STEPS];
};
// Forces sizeof(ncclSendMem) to be at least MEM_ALIGN.
char pad3[MEM_ALIGN];
};
};
// Memory block holding the `tail` counter, the sizes/offsets FIFOs and a flush
// word. The anonymous union with pad4 makes the object at least MEM_ALIGN
// bytes large.
// NOTE(review): who writes and who reads each field is not visible in this
// header -- confirm against the transport code.
struct ncclRecvMem {
union {
struct {
uint64_t tail;
// Pads `tail` out to CACHE_LINE_SIZE bytes.
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
// One entry per step in each FIFO; NCCL_STEPS is defined outside this header.
int sizesFifo[NCCL_STEPS];
int offsFifo[NCCL_STEPS];
int flush; // For GDRCopy-based flush
};
// Forces sizeof(ncclRecvMem) to be at least MEM_ALIGN.
char pad4[MEM_ALIGN];
};
};
// Function-pointer type taking (void**, size_t*, void*) and returning
// hipError_t. ncclComm::pfnCuMemGetAddressRange has this type.
typedef hipError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
// Values stored in ncclGraphHelperResources::threadState.
enum helperThreadState {ThreadStart, ThreadStop};
// Number of entries in ncclGraphHelperResources::ipcBases.
#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
// State attached to a communicator through ncclComm::graphHelperResources:
// a back-pointer to the communicator, a mutex/condition-variable pair with a
// thread state, and a fixed-size pool of pointers with head/tail indices.
struct ncclGraphHelperResources {
ncclComm* comm;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
enum helperThreadState threadState; // ThreadStart or ThreadStop
// NOTE(review): ipcBases together with ipcTail / ipcHead looks like a
// circular queue of NCCL_IPC_POOL_SIZE entries -- confirm against the code
// that produces and consumes it.
void* ipcBases[NCCL_IPC_POOL_SIZE];
int ipcTail;
int ipcHead;
};
// One slot describing a user-created reduction op; ncclComm::userRedOps points
// to slots of this type.
// NOTE(review): freeNext together with ncclComm::userRedOpFreeHead suggests an
// index-linked free list -- confirm against the allocation code.
struct ncclUserRedOp {
int freeNext; // -1=allocated, otherwise index of next free entry in array
ncclDataType_t datatype;
ncclDevRedOpFull opFull;
};
// Per-node rank information; ncclComm::nodeRanks points to these and is
// described there as "localRanks and localRanktoRank for all nodes".
struct ncclNodeRanks {
int localRanks;
// NOTE(review): presumably indexed by local rank and yielding the rank in
// the communicator -- confirm.
int* localRankToRank;
};
// Communicator state. Groups the per-channel data, peer/topology pointers,
// rank and node layout, launch streams and events, operation counters,
// tuning tables, intra-process launch bookkeeping, proxy state, queued async
// and p2p operations, graph-capture bookkeeping and user-created reduction
// ops. The device-side counterpart is referenced through devComm, with a
// host copy kept in hostDevComm.
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
int connect[NCCL_MAX_CONNS];
uint32_t* connectSend;
uint32_t* connectRecv;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int WarpSize;
// Node layout.
// NOTE(review): the three pointer tables below are presumably indexed as
// their names indicate (by rank / by local rank) -- confirm at the
// initialisation site.
int node;
int nNodes;
int localRank;
int localRanks;
int maxLocalRanks;
int* rankToNode;
int* rankToLocalRank;
int* localRankToRank;
// localRanks and localRanktoRank for all nodes
struct ncclNodeRanks* nodeRanks;
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
hipStream_t userStream;
bool userStreamSet;
hipEvent_t doneEvent;
hipEvent_t intDoneEvent;
bool checkPointers;
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
// Collective operation counter
uint64_t collOpCount;
// P2P operation counter
uint64_t p2pOpCount;
// Channels for collectives
int nChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
int p2pChannels[MAXCHANNELS];
// Buffer sizes
int buffSizes[NCCL_NUM_PROTOCOLS];
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// Tuning tables indexed by [function][algorithm][protocol]; units are not
// stated in this header.
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
hipStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
// Flags for enable P2P NET
uint32_t p2pNet;
uint32_t useIntraNet;
// Device side of the communicator
struct ncclDevComm *devComm;
// Host copy of the devComm (to free CUDA allocs)
struct ncclDevComm hostDevComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
hipLaunchParams * intraParams;
hipLaunchParams *myParams;
pthread_t* intraThreads;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclWorkElem args;
void* argsptrs[2];
struct ncclProxyState proxyState;
// Whether this communicator uses collNet
int collNetSupport;
int intraHighestTransportType;
// Store info of async operations
struct ncclInfo* asyncOps;
int asyncOpCount;
size_t asyncTotalSize;
ssize_t channelSize;
int lastChannel;
enum { ROUND_ROBIN, SHORTEST_QUEUE } asyncAllocMode;
// Lists of async p2p operations queued under group semantics
ncclP2Plist** p2pSends;
ncclP2Plist** p2pRecvs;
int p2pSendCount;
int p2pRecvCount;
// [RCCL]
//CliqueManager* cliqueManager; // CliqueManager handles pointer collection / distribution for clique-based kernels
//int rootPid; // Process ID of root
// [/RCCL]
// Store info for cudaGraph
int usingCudaGraph; // Only use it during capture time, not launch time
struct ncclQueueInfo* enqueueInfo;
int nQueueInfoCreated;
int nQueueInfoDestroyed;
hipGraphNode_t lastSetupNode;
unsigned long long lastCudaGraphId;
int driverVersion;
pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange;
pthread_t graphHelperThread;
struct ncclGraphHelperResources* graphHelperResources;
int disableGraphHelper;
int graphRegister;
// user-created reduction ops
// NOTE(review): userRedOpFreeHead presumably indexes the first free slot of
// userRedOps (see ncclUserRedOp::freeNext) -- confirm.
int userRedOpCapacity, userRedOpFreeHead;
ncclUserRedOp *userRedOps;
};
// Obfuscates non-builtin ncclRedOp_t values with a hash derived from the
// address of the owning communicator. The intent is to catch bugs: integer
// handles tied to this communicator should not collide with handles that
// belong to other communicators.
//
// comm: communicator whose address seeds the hash.
// op:   reduction-op handle to mangle (or un-mangle).
//
// Built-in ops (values below ncclNumOps) are returned unchanged, as is any op
// whose mangled value would land in the built-in range. Calling the function
// a second time with the same comm undoes the first call.
static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) {
  const int builtinLimit = int(ncclNumOps);
  const int opValue = int(op);

  // Built-in values pass straight through.
  if (opValue < builtinLimit) {
    return op;
  }

  // Reduce the communicator address to a 32-bit hash: fold the high half into
  // the low half, multiply by a 64-bit hash constant and keep the top 32 bits.
  uint64_t hash = reinterpret_cast<uint64_t>(comm);
  hash ^= hash >> 32;
  hash *= 0x9e3779b97f4a7c13u;
  hash >>= 32;
  // Restrict the hash to the handle range; ncclMaxRedOp is a power of 2 minus 1.
  hash &= int(ncclMaxRedOp);

  const int mangled = int(hash) ^ opValue;

  // A result inside the built-in range would collide with a preserved value,
  // so the preimage of every built-in value is left untouched as well.
  if (mangled < builtinLimit) {
    return op;
  }
  return ncclRedOp_t(mangled);
}
#endif