a46ea10583
Add support for CUDA graphs. Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439. Fix bootstrap issue caused by connection reordering. Fix CPU locking block. Improve CollNet algorithm. Improve performance on DGX A100 for communicators with only one GPU per node.
58 строки
1.2 KiB
C
58 строки
1.2 KiB
C
/*************************************************************************
 * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#ifndef NCCL_INFO_H_
|
|
#define NCCL_INFO_H_
|
|
|
|
#include "nccl.h"
|
|
#include "devcomm.h"
|
|
|
|
/*
 * Pattern selector stored in ncclInfo::pattern.
 *
 * The numeric values are spelled out explicitly; they are the same values
 * the compiler assigns implicitly (0, 1, 2, ...), so the declaration order
 * is part of the contract and must not be changed.
 *
 * NOTE(review): judging by the names, the enumerators describe ring,
 * pipeline, tree and CollNet-tree traversal shapes -- confirm against the
 * code that consumes this enum.
 */
typedef enum {
  ncclPatternRing           = 0,
  ncclPatternRingTwice      = 1,
  ncclPatternPipelineFrom   = 2,
  ncclPatternPipelineTo     = 3,
  ncclPatternTreeUp         = 4,
  ncclPatternTreeDown       = 5,
  ncclPatternTreeUpDown     = 6,
  ncclPatternCollTreeUpDown = 7
} ncclPattern_t;
|
|
|
|
// Used to pass NCCL call information between functions
|
|
struct ncclInfo {
|
|
ncclFunc_t coll;
|
|
const char* opName;
|
|
// NCCL Coll Args
|
|
const void* sendbuff;
|
|
void* recvbuff;
|
|
size_t count;
|
|
ncclDataType_t datatype;
|
|
ncclRedOp_t op;
|
|
int root;
|
|
ncclComm_t comm;
|
|
cudaStream_t stream;
|
|
// Algorithm details
|
|
int chunkSteps;
|
|
int sliceSteps;
|
|
// Computed later
|
|
int algorithm;
|
|
int protocol;
|
|
ncclPattern_t pattern;
|
|
int nChannels;
|
|
int nThreads;
|
|
size_t nBytes;
|
|
int nstepsPerLoop;
|
|
int nchunksPerLoop;
|
|
ssize_t sendbytes;
|
|
ssize_t recvbytes;
|
|
int recvChunkSize;
|
|
int sendChunkSize;
|
|
uint32_t delta;
|
|
int channelId;
|
|
};
|
|
|
|
#endif
|