178b6b7590
Rework core for NVIDIA Trusted Computing * Compress work structs so that they are shared between channels * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo. * Rework the task preprocessing phase. * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations. * Rename src/include/align.h to src/include/bitops.h Add lazy connection establishment for collective operations * Move buffer allocation and connection establishment to the first collective operation using that algorithm. * Accelerate init time and reduce memory usage. * Avoid allocating NVLS buffers if all calls are registered. * Compute algo/proto in ncclLaunchCollTasksInfo early on. * Connect peers in ncclCollPreconnectFunc if not connected already. * Also move shared buffer creation to the first send/recv call. Accelerate intra-node NVLink detection * Make each rank only detect NVLinks attached to its GPU. * Fuse XMLs to reconstruct the full NVLink topology Add init profiling to report time spent in different init phases. * Report timings of bootstrap, allgather, search, connect, etc. * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS. Add support for PCI p2p on split PCI switches * Detect split PCI switches through a kernel module exposing switch information. * Update the topology XML and graph to add those inter-switch connections. Add cost estimation API * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take. Net/IB: Add separate traffic class for fifo messages * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC. Merges PR #1194 Net/IB: Add support for IB router * Use flid instead of lid if subnets do not match * Warn if flid is 0 Optimizations and fixes for device network offload (unpack) * Double the default number of channels * Cache netDeviceType * Fix save/increment head logic to enable Tree support. 
Support ncclGroupStart/End for ncclCommAbort/Destroy * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process. Improve Tuner API * Provide to the plugin the original cost table so that the plugin can leave unknown or disabled algo/proto combinations untouched. * Remove nvlsSupport and collnetSupport. Do not print version to stdout when using a debug file * Also print version from all processes with INFO debug level. Fixes issue #1271 Fix clang warnings in NVTX headers * Update NVTX headers to the latest version Fixes issue #1270 Disable port fusion in heterogeneous systems * Do not fuse ports if a mix of multi-port and single port are detected. Fix NVLS graphs search for dual NICs. * Fix NVLS graph search when we have more than one NIC per GPU. Fix crash with collnetDirect * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search. Fix hang when nodes have different CPU types * Add the CPU type to the rank peer info. * Align all ranks on the CPU type after the first allgather. * Only use the aligned CPU type for all tuning operations. Fixes issue #1136 Fixes issue #1184 Fix performance of registered send/recv operations * Allow for single full size operations * Add INFO to confirm the registration of send/recv buffers. Move all sync ops to finalize stage * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called. Improve error reporting during SHM segment creation Improve support of various compilers Merges PR #1177 Merges PR #1228 Allow net and tuner plugins to be statically linked * Search for ncclNet or ncclTuner symbols in the main binary. Merges PR #979 Plugin examples includes cleanup * Harmonize err.h and common.h usage. * Add mixed plugin with both net and tuner.
145 строки
4.7 KiB
C
145 строки
4.7 KiB
C
/*************************************************************************
|
|
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_GROUP_H_
|
|
#define NCCL_GROUP_H_
|
|
|
|
#include "nccl.h"
|
|
#include "comm.h"
|
|
|
|
// Group bookkeeping entry points.
// The first four are defined inline later in this header; ncclGroupJobAbort
// and ncclGroupJobComplete are only declared here.

// Records a failing `ret` as the group error while inside a group; returns `ret`.
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);

// Links `comm` into the calling thread's list of grouped communicators.
void ncclGroupCommJoin(struct ncclComm *comm);

// Links `comm` into the calling thread's list of communicators needing preconnect.
void ncclGroupCommPreconnect(struct ncclComm *comm);

// Marks `comm` as no longer being part of a group.
ncclResult_t ncclGroupCommLeave(struct ncclComm *comm);

ncclResult_t ncclGroupJobAbort(struct ncclGroupJob *groupJob);

ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob);
|
|
|
|
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
|
|
|
|
// Takes an init routine `func` together with the same five arguments that
// ncclInitFunc_t expects.
// NOTE(review): presumably `func` is invoked asynchronously with them — the
// definition is not in this header; confirm there.
ncclResult_t ncclAsyncInit(
  ncclInitFunc_t func,
  ncclComm_t* newcomm,
  int ndev,
  ncclUniqueId commId,
  int myrank,
  int cudaDev
);
|
|
|
|
// Values held by ncclAsyncJob::state.
enum ncclGroupJobState {
  ncclGroupJobRunning = 0,
  ncclGroupJobDone = 1,
  ncclGroupJobJoined = 2
};
typedef enum ncclGroupJobState ncclGroupJobState_t;
|
|
|
|
struct ncclAsyncJob {
|
|
struct ncclAsyncJob* next;
|
|
pthread_t thread;
|
|
ncclResult_t result;
|
|
ncclResult_t(*func)(struct ncclAsyncJob*);
|
|
void(*undo)(struct ncclAsyncJob*);
|
|
void(*destructor)(void*);
|
|
ncclGroupJobState_t state;
|
|
uint32_t* abortFlag; /* point to comm abortFlag */
|
|
uint32_t* abortFlagDev; /* point to comm abortFlagDev */
|
|
uint32_t* childAbortFlag; /* point to child abortFlag */
|
|
uint32_t* childAbortFlagDev; /* point to child abortFlagDev */
|
|
ncclComm_t comm;
|
|
int destroyFlag;
|
|
};
|
|
|
|
// Takes a job descriptor plus its three callbacks and a communicator.
// The callback parameter types match the func/undo/destructor members of
// struct ncclAsyncJob.
// NOTE(review): launch semantics live in the definition, which is not in this
// header — confirm there.
ncclResult_t ncclAsyncLaunch(struct ncclAsyncJob *job,
                             ncclResult_t (*func)(struct ncclAsyncJob *),
                             void (*undo)(struct ncclAsyncJob *),
                             void (*destructor)(void *),
                             ncclComm_t comm);
|
|
|
|
struct ncclGroupJob {
|
|
struct ncclAsyncJob base;
|
|
struct ncclComm **groupCommHeadPtr;
|
|
struct ncclComm **groupCommPreconnectHeadPtr;
|
|
ncclResult_t *groupErrorPtr;
|
|
bool *abortFlagPtr;
|
|
int *groupBlockingPtr;
|
|
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
|
|
bool initialized;
|
|
};
|
|
|
|
// Opens one more nesting level of the current group (defined inline below).
ncclResult_t ncclGroupStartInternal();

// `simInfo` is optional and defaults to NULL.
// NOTE(review): presumably receives simulation output when non-NULL — confirm
// in the definition.
ncclResult_t ncclGroupEndInternal(ncclSimInfo_t *simInfo = NULL);

// Called by groupJobComplete() on the `base` member of a group job.
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob *job);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
|
|
extern __thread ncclResult_t ncclGroupError;
|
|
extern __thread struct ncclComm* ncclGroupCommHead;
|
|
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
|
|
extern __thread int ncclGroupBlocking;
|
|
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
|
|
extern __thread struct ncclGroupJob ncclGroupJobMain;
|
|
|
|
// Returns this thread's group-job bookkeeping to its idle values:
// ncclGroupJobMain is zero-filled, ncclGroupJobMainPtr is cleared, and
// ncclGroupBlocking goes back to -1.
static inline void groupResetJobState() {
  memset(&ncclGroupJobMain, 0, sizeof(ncclGroupJobMain));
  ncclGroupJobMainPtr = NULL;
  ncclGroupBlocking = -1;
}
|
|
|
|
// Completes `job` through ncclAsyncJobComplete() and then resets this thread's
// group-job state.
// Returns ncclSuccess without doing anything when `job` is null; otherwise
// returns whatever ncclAsyncJobComplete() reported.
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
  if (job == nullptr) return ncclSuccess;

  ncclResult_t const status = ncclAsyncJobComplete(&job->base);
  // The reset happens regardless of the completion status.
  groupResetJobState();
  return status;
}
|
|
|
|
// Enters one more level of group nesting on the calling thread.
// Always returns ncclSuccess.
inline ncclResult_t ncclGroupStartInternal() {
  ncclGroupDepth += 1;
  return ncclSuccess;
}
|
|
|
|
// Passes `ret` through unchanged. While the calling thread is inside a group
// (ncclGroupDepth > 0), any result other than ncclSuccess or ncclInProgress is
// also stored in ncclGroupError.
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
  bool const insideGroup = ncclGroupDepth > 0;
  bool const isFailure = !(ret == ncclSuccess || ret == ncclInProgress);
  if (insideGroup && isFailure) ncclGroupError = ret;
  return ret;
}
|
|
|
|
// Add comm to this thread's group
|
|
inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
|
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
|
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
|
|
// the users program order yet insures siblings occur consecutively. This
|
|
// is required by doLaunches() in "group.cc".
|
|
struct ncclComm** pp = &ncclGroupCommHead;
|
|
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
|
|
pp = &(*pp)->groupNext;
|
|
comm->groupNext = *pp;
|
|
*pp = comm;
|
|
// Comms gets a new memory stack scope upon joining. Each task batched for
|
|
// this comm is allocated there.
|
|
ncclMemoryStackPush(&comm->memScoped);
|
|
// Initialize planner
|
|
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
|
memset(&comm->planner, 0, sizeof(comm->planner));
|
|
comm->planner.peers = tmp;
|
|
}
|
|
|
|
ncclGroupBlocking = comm->config.blocking;
|
|
}
|
|
|
|
// Add comm to this thread's group needing preconnect
|
|
inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
|
|
if (comm->preconnectNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
|
comm->preconnectNext = ncclGroupCommPreconnectHead;
|
|
ncclGroupCommPreconnectHead = comm;
|
|
}
|
|
}
|
|
|
|
// Comm has left group
|
|
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
|
|
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
|
ncclMemoryStackPop(&comm->memScoped);
|
|
return ncclSuccess;
|
|
}
|
|
|
|
#endif
|