Files
rocm-systems/src/include/info.h
T
Sylvain Jeaugey b6475625fb 2.20.3-1
Add support for alternating rings, allow for cross-nic rings without
cross-rail communication.
Add support for user buffer registration for network send/recv.
Optimize aggregated operations to better utilize all channels.
Add flattening for BCM PCI gen5 switches.
Add support for inter-node NVLink communication
Add support for port fusion in NET/IB.
Add support for ReduceScatter and AllGather using Collnet.
Update net API to v8.
Fix hang during A2A connection.
2024-02-13 04:22:38 -08:00

161 lines
4.5 KiB
C

/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INFO_H_
#define NCCL_INFO_H_
#include "nccl.h"
#include "device.h"
#include "collectives.h"
#include "core.h"
#include "utils.h"
#include "strongstream.h"
#define NCCL_MAX_LOCAL_RANKS 64
// Pattern selector, stored in ncclInfo::pattern.
// The underlying type is fixed to uint8_t. Enumerator values are written out
// explicitly; they are the same sequential values (0..12) the compiler would
// assign implicitly, so the numeric encoding is unchanged.
typedef enum : uint8_t {
  ncclPatternRing          = 0,
  ncclPatternRingTwice     = 1,
  ncclPatternPipelineFrom  = 2,
  ncclPatternPipelineTo    = 3,
  ncclPatternTreeUp        = 4,
  ncclPatternTreeDown      = 5,
  ncclPatternTreeUpDown    = 6,
  ncclPatternCollnetChain  = 7,
  ncclPatternCollnetDirect = 8,
  ncclPatternNvls          = 9,
  ncclPatternNvlsTree      = 10,
  ncclPatternSend          = 11,
  ncclPatternRecv          = 12
} ncclPattern_t;
// Buffer registration kind, stored in ncclInfo::regBufType.
// Values are sequential starting at 0, so the trailing NCCL_REG_BUFFER_NUM
// equals the number of kinds listed before it (3).
enum ncclRegBufferType {
  NCCL_REGULAR_BUFFER = 0,
  NCCL_IPC_REG_BUFFER,
  NCCL_NVLS_REG_BUFFER,
  NCCL_REG_BUFFER_NUM  // count sentinel, not a buffer kind
};
// Used to pass NCCL call information between functions
// Instances can be chained through `next`; ncclTasks keeps several
// ncclIntruQueue<ncclInfo, &ncclInfo::next> queues of them.
struct ncclInfo {
// Operation selector; ncclInfoSetDerived() branches on this value.
ncclFunc_t coll;
// NOTE(review): presumably a human-readable operation name for logging -- confirm.
const char* opName;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
// Element count as supplied. ncclInfoSetDerived() overwrites it with a byte
// count (and sets datatype to ncclInt8) for AllGather and Broadcast.
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
int root; // peer for p2p operations
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclDevRedOpFull opFull;
ncclPattern_t pattern;
// Set by ncclInfoSetDerived(): count * ncclTypeSize(datatype), additionally
// multiplied by nRanks for AllGather and ReduceScatter.
size_t nBytes;
size_t aggnBytes;
// Set by ncclInfoSetDerived(): count * ncclTypeSize(datatype), never scaled by nRanks.
size_t workBytes;
// Set by ncclInfoSetDerived() (buffer sizes for NVLS buffer registration):
// AllGather:      sendbuffSize = workBytes, recvbuffSize = workBytes * nRanks
// ReduceScatter:  recvbuffSize = workBytes, sendbuffSize = workBytes * nRanks
// anything else:  both equal workBytes
size_t sendbuffSize;
size_t recvbuffSize;
int stepSize;
int chunkCount;
int chunkSize;
int channelId;
int workFuncIndex;
ncclRegBufferType regBufType;
// Fixed-size pointer tables with NCCL_MAX_LOCAL_RANKS entries each.
// NOTE(review): presumably one registered buffer per local rank -- confirm.
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
// Need to initialize
int nThreads;
int nChannels;
int algorithm;
int protocol;
bool userTuned;
// Intrusive link used as the queue hook by the ncclIntruQueue members of ncclTasks.
struct ncclInfo *next;
};
// Fills in the size fields of `info` that follow from coll/count/datatype.
//
//   info   - call descriptor; reads coll, count and datatype, writes nBytes,
//            workBytes, sendbuffSize and recvbuffSize, and for AllGather and
//            Broadcast also rewrites count/datatype to a byte-typed form.
//   nRanks - multiplier applied to the per-rank sizes of AllGather and
//            ReduceScatter.
//
// Always returns ncclSuccess.
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
  const bool isAllGather = (info->coll == ncclFuncAllGather);
  const bool isReduceScatter = (info->coll == ncclFuncReduceScatter);

  // Size in bytes of `count` elements, captured before count/datatype may be
  // rewritten below.
  const size_t payloadBytes = info->count * ncclTypeSize(info->datatype);
  info->workBytes = payloadBytes;
  info->nBytes = payloadBytes;

  // AllGather and Broadcast are re-expressed as a count of ncclInt8 elements.
  if (isAllGather || info->coll == ncclFuncBroadcast) {
    info->count = payloadBytes;
    info->datatype = ncclInt8;
  }

  // count is per rank for these two collectives
  if (isAllGather || isReduceScatter) {
    info->nBytes *= nRanks;
  }

  /* compute buffer size for NVLS buffer registration */
  if (isAllGather) {
    info->sendbuffSize = payloadBytes;
    info->recvbuffSize = payloadBytes * nRanks;
  } else if (isReduceScatter) {
    info->recvbuffSize = payloadBytes;
    info->sendbuffSize = payloadBytes * nRanks;
  } else {
    info->sendbuffSize = payloadBytes;
    info->recvbuffSize = payloadBytes;
  }

  return ncclSuccess;
}
// Record describing one collective task. Records can be chained through
// `next`, and each one embeds a complete ncclInfo by value.
struct ncclTaskColl {
  struct ncclTaskColl* next;  // link to the following task record
  ncclFunc_t func;
  const void* sendbuff;
  void* recvbuff;
  size_t count;
  int root;
  ncclDataType_t datatype;
  ncclDevRedOpFull op;
  int chunkSteps;
  int sliceSteps;
  struct ncclInfo info;
};
// One point-to-point task. `next` is the intrusive link used by the
// ncclIntruQueue<ncclTaskP2p, &ncclTaskP2p::next> queues.
struct ncclTaskP2p {
  struct ncclTaskP2p* next;
  void* buff;
  size_t bytes;
  // Stateful chunk index: when a p2p is "cut" across two plans, this records
  // where it left off.
  int chunk;
};
// Singly linked list node holding one cudaStream_t; ncclTasks::streams points
// at a list of these.
struct ncclCudaStreamList {
  ncclCudaStreamList* next;
  cudaStream_t stream;
};
// Aggregate of pending work: queues of ncclInfo records, per-peer queues of
// ncclTaskP2p records, and the user streams / graph associated with them.
struct ncclTasks {
// Per-peer state: one send queue and one receive queue of ncclTaskP2p,
// linked through ncclTaskP2p::next.
struct Peer {
// NOTE(review): presumably set once a send/recv for this peer has been observed -- confirm.
bool sendSeen, recvSeen;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
};
// All four queues below hold ncclInfo records linked through ncclInfo::next.
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collQueue;
// Queue for user-tuned executed collectives
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collTunedQueue;
// Queue for continuous bytes distribution (CBD) collectives
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collCBDQueue;
// Queue for collnet
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collnetQueue;
// NOTE(review): presumably the sum of ncclInfo::workBytes over queued tasks -- confirm.
size_t workBytesTotal;
int usableChannels;
bool sorted;
// Pointer to an array of Peer; the inline annotation gives its length as nRanks.
struct Peer* peers/*[nRanks]*/;
// NOTE(review): presumably arrays of p2pOrderSteps entries -- confirm.
int *p2pSendOrder, *p2pRecvOrder;
int p2pOrderSteps;
// NOTE(review): presumably counts of queued collective / p2p tasks -- confirm.
int nTasksColl, nTasksP2p;
// The list of user streams aggregated over all tasks present.
struct ncclCudaStreamList* streams;
// The most recent user stream. Ignored if streams==nullptr
cudaStream_t streamRecent;
// The graph capturing all user streams or invalid if none. Thus we restrict the
// user that all streams must be captured in the same graph or not captured
// at all. Technically we could probably relax this, but that would mean
// collecting a different `ncclTasks` per graph and one for non-graph.
struct ncclCudaGraph capturingGraph;
};
#endif