2.9.6-1
Add support for CUDA graphs. Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439. Fix bootstrap issue caused by connection reordering. Fix CPU locking block. Improve CollNet algorithm. Improve performance on DGX A100 for communicators with only one GPU per node.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -38,8 +38,13 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
|
||||
// Need async stream for P2P pre-connect + CUDA Graph
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
CUDACHECK(cudaStreamDestroy(stream));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -14,8 +14,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
|
||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
|
||||
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr);
|
||||
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
|
||||
ncclResult_t bootstrapClose(void* commState);
|
||||
|
||||
+10
-1
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -27,6 +27,15 @@
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Report failure but clear error and continue
|
||||
#define CUDACHECKIGNORE(cmd) do { \
|
||||
cudaError_t err = cmd; \
|
||||
if( err != cudaSuccess ) { \
|
||||
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
(void) cudaGetLastError(); \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#include <errno.h>
|
||||
// Check system calls
|
||||
#define SYSCHECK(call, name) do { \
|
||||
|
||||
+13
-5
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -77,16 +77,17 @@ struct ncclComm {
|
||||
int nNodes;
|
||||
int localRanks;
|
||||
|
||||
enum { GROUP, PARALLEL } launchMode;
|
||||
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
|
||||
cudaStream_t userStream;
|
||||
bool userStreamSet;
|
||||
cudaEvent_t doneEvent;
|
||||
cudaEvent_t intDoneEvent;
|
||||
bool checkPointers;
|
||||
|
||||
// Counter to make sure collectives match (needed for bcast/reduce
|
||||
// where syncs are not symmetric).
|
||||
// Counter for tracking CUDA launches (P2P and collectives included)
|
||||
uint64_t opCount;
|
||||
uint64_t lastOpCount;
|
||||
// Collective operation counter
|
||||
uint64_t collOpCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
@@ -145,12 +146,19 @@ struct ncclComm {
|
||||
struct ncclInfo* asyncOps;
|
||||
int asyncOpCount;
|
||||
size_t asyncTotalSize;
|
||||
int lastChannel;
|
||||
|
||||
//list of async p2p operation queued in a group semantics
|
||||
struct ncclP2Plist* p2pSends;
|
||||
struct ncclP2Plist* p2pRecvs;
|
||||
int p2pSendCount;
|
||||
int p2pRecvCount;
|
||||
|
||||
// Store info for cudaGraph
|
||||
int usingCudaGraph; // Only use it during capture time, not launch time
|
||||
struct ncclQueueInfo* enqueueInfo;
|
||||
cudaGraphNode_t lastSetupNode;
|
||||
unsigned long long lastCudaGraphId;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -99,8 +99,9 @@ struct ncclConnInfo {
|
||||
struct ncclConnector {
|
||||
int connected;
|
||||
struct ncclProxyArgs *proxyAppend;
|
||||
struct ncclProxyArgs **proxyAppendPtr;
|
||||
struct ncclTransportComm* transportComm;
|
||||
void* transportResources; // Host-side resources
|
||||
void* transportResources;
|
||||
struct ncclConnInfo conn;
|
||||
struct ncclComm *comm;
|
||||
};
|
||||
@@ -125,9 +126,21 @@ struct ncclTree {
|
||||
int down[NCCL_MAX_TREE_ARITY];
|
||||
};
|
||||
|
||||
#define NCCL_MAX_DIRECT_ARITY 7
|
||||
struct ncclDirect {
|
||||
int depth;
|
||||
int out;
|
||||
int nHeads;
|
||||
int headRank;
|
||||
int shift;
|
||||
int up[NCCL_MAX_DIRECT_ARITY];
|
||||
int down[NCCL_MAX_DIRECT_ARITY];
|
||||
};
|
||||
|
||||
#define NCCL_MAX_CONNS 2
|
||||
struct ncclPeer {
|
||||
struct ncclConnector send;
|
||||
struct ncclConnector recv;
|
||||
struct ncclConnector send[NCCL_MAX_CONNS];
|
||||
struct ncclConnector recv[NCCL_MAX_CONNS];
|
||||
};
|
||||
|
||||
struct ncclDevComm;
|
||||
@@ -161,6 +174,8 @@ struct ncclWorkElem {
|
||||
struct {
|
||||
size_t sendCount;
|
||||
size_t recvCount;
|
||||
int sendChunkSize;
|
||||
int recvChunkSize;
|
||||
int32_t delta;
|
||||
uint16_t nThreads;
|
||||
} p2p;
|
||||
@@ -177,7 +192,7 @@ struct ncclChannel {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree collTree;
|
||||
struct ncclDirect collTree;
|
||||
|
||||
int id;
|
||||
|
||||
@@ -189,6 +204,12 @@ struct ncclChannel {
|
||||
struct ncclWork* workFifo;
|
||||
int workCount;
|
||||
uint64_t workFifoTail; // Only used by CPU
|
||||
uint16_t index; // Only used by GPU
|
||||
|
||||
// GDRCOPY support
|
||||
struct ncclWork* workFifoGdr;
|
||||
struct ncclWork* workFifoDev;
|
||||
void* gdrMemDesc;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,15 +11,82 @@
|
||||
#include "group.h"
|
||||
#include "collectives.h"
|
||||
|
||||
size_t ncclKernMaxLocalSize();
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
|
||||
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
|
||||
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
|
||||
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
|
||||
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchReset(ncclComm_t comm);
|
||||
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
|
||||
template<int USING_CUDA_GRAPH>
|
||||
void CUDART_CB ncclEnqueueHostSetup(void* arg);
|
||||
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
|
||||
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
|
||||
|
||||
// Enqueue information (for kernel and proxy) for each operation
|
||||
struct ncclQueueElem {
|
||||
struct ncclWorkElem work;
|
||||
struct ncclProxyArgs proxyArgs;
|
||||
struct ncclQueueElem* next;
|
||||
};
|
||||
|
||||
// Store enqueue elements in a list
|
||||
struct ncclQueueElemList {
|
||||
struct ncclQueueElem* head;
|
||||
struct ncclQueueElem* tail;
|
||||
};
|
||||
|
||||
// Structure passed to CUDA graph
|
||||
struct ncclQueueInfo {
|
||||
ncclComm_t comm;
|
||||
int maxChannels; // Dynamic version of gridDim
|
||||
ncclResult_t ret; // Return value of host setup call
|
||||
struct ncclQueueElemList elemList;
|
||||
};
|
||||
|
||||
// Get next element from enqueue list
|
||||
static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) {
|
||||
if (eqInfo == NULL) return ncclInternalError;
|
||||
struct ncclQueueElemList* list = &eqInfo->elemList;
|
||||
if (list->tail != NULL) {
|
||||
*elemOut = list->tail;
|
||||
memset(*elemOut, 0, sizeof(struct ncclWorkElem) + sizeof(struct ncclProxyArgs));
|
||||
} else {
|
||||
NCCLCHECK(ncclCalloc(&list->tail, 1));
|
||||
*elemOut = list->tail;
|
||||
list->head = list->tail;
|
||||
}
|
||||
if (list->tail->next == NULL) {
|
||||
NCCLCHECK(ncclCalloc(&list->tail->next, 1));
|
||||
}
|
||||
list->tail = list->tail->next;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Reset element queue
|
||||
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
|
||||
if (eqInfo == NULL) return ncclInternalError;
|
||||
eqInfo->maxChannels = 0;
|
||||
eqInfo->ret = ncclSuccess;
|
||||
eqInfo->elemList.tail = eqInfo->elemList.head;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Destroy enqueue info space
|
||||
// used by both CUDA graph and non CUDA graph
|
||||
static void ncclDestroyQueueInfo(void* ptr) {
|
||||
if (ptr == NULL) return;
|
||||
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
|
||||
struct ncclQueueElem* head = eqInfo->elemList.head;
|
||||
while (head != NULL) {
|
||||
struct ncclQueueElem* temp = head;
|
||||
head = head->next;
|
||||
free(temp);
|
||||
}
|
||||
free(eqInfo);
|
||||
}
|
||||
#endif // End include guard
|
||||
|
||||
@@ -0,0 +1,251 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_GDRWRAP_H_
|
||||
#define NCCL_GDRWRAP_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include <stdint.h> // for standard [u]intX_t types
|
||||
#include <stdio.h>
|
||||
|
||||
// These can be used if the GDR library isn't thread safe
|
||||
#include <pthread.h>
|
||||
extern pthread_mutex_t gdrLock;
|
||||
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
|
||||
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
|
||||
#define GDRLOCKCALL(cmd, ret) do { \
|
||||
GDRLOCK(); \
|
||||
ret = cmd; \
|
||||
GDRUNLOCK(); \
|
||||
} while(false)
|
||||
|
||||
#define GDRCHECK(cmd) do { \
|
||||
int e; \
|
||||
/* GDRLOCKCALL(cmd, e); */ \
|
||||
e = cmd; \
|
||||
if( e != 0 ) { \
|
||||
WARN("GDRCOPY failure %d", e); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// This is required as the GDR memory is mapped WC
|
||||
#if !defined(__NVCC__)
|
||||
#if defined(__PPC__)
|
||||
static inline void wc_store_fence(void) { asm volatile("sync") ; }
|
||||
#elif defined(__x86_64__)
|
||||
#include <immintrin.h>
|
||||
static inline void wc_store_fence(void) { _mm_sfence(); }
|
||||
#elif defined(__aarch64__)
|
||||
#ifdef __cplusplus
|
||||
#include <atomic>
|
||||
static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
|
||||
#else
|
||||
#include <stdatomic.h>
|
||||
static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//#define GDR_DIRECT 1
|
||||
#ifdef GDR_DIRECT
|
||||
// Call the GDR API library code directly rather than via
|
||||
// dlopen() wrappers
|
||||
#include <gdrapi.h>
|
||||
|
||||
static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
|
||||
static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
|
||||
static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
|
||||
static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
|
||||
GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
|
||||
GDRCHECK(gdr_unpin_buffer(g, handle));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
|
||||
GDRCHECK(gdr_get_info(g, handle, info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
|
||||
GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
|
||||
GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static void wrap_gdr_runtime_get_version(int *major, int *minor) {
|
||||
gdr_runtime_get_version(major, minor);
|
||||
return ncclSuccess;
|
||||
}
|
||||
static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
|
||||
gdr_driver_get_version(g, major, minor);
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
|
||||
GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
|
||||
GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#else
|
||||
// Dynamically handle dependency the GDR API library
|
||||
|
||||
/* Extracted from gdrapi.h (v2.1 Nov 2020) */
|
||||
|
||||
#define GPU_PAGE_SHIFT 16
|
||||
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
|
||||
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
|
||||
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
|
||||
|
||||
struct gdr;
|
||||
typedef struct gdr *gdr_t;
|
||||
|
||||
typedef struct gdr_mh_s {
|
||||
unsigned long h;
|
||||
} gdr_mh_t;
|
||||
|
||||
struct gdr_info {
|
||||
uint64_t va;
|
||||
uint64_t mapped_size;
|
||||
uint32_t page_size;
|
||||
uint64_t tm_cycles;
|
||||
uint32_t cycles_per_ms;
|
||||
unsigned mapped:1;
|
||||
unsigned wc_mapping:1;
|
||||
};
|
||||
typedef struct gdr_info gdr_info_t;
|
||||
|
||||
/* End of gdrapi.h */
|
||||
|
||||
ncclResult_t wrap_gdr_symbols(void);
|
||||
|
||||
gdr_t wrap_gdr_open(void);
|
||||
ncclResult_t wrap_gdr_close(gdr_t g);
|
||||
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
|
||||
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
|
||||
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
|
||||
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
|
||||
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
|
||||
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
|
||||
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
|
||||
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
|
||||
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
|
||||
|
||||
#endif // GDR_DIRECT
|
||||
|
||||
// Global GDR driver handle
|
||||
extern gdr_t ncclGdrCopy;
|
||||
|
||||
#include "alloc.h"
|
||||
|
||||
typedef struct gdr_mem_desc {
|
||||
void *gdrDevMem;
|
||||
void *gdrMap;
|
||||
size_t gdrOffset;
|
||||
size_t gdrMapSize;
|
||||
gdr_mh_t gdrMh;
|
||||
} gdr_mem_desc_t;
|
||||
|
||||
static gdr_t ncclGdrInit() {
|
||||
int libMajor, libMinor, drvMajor, drvMinor;
|
||||
gdr_t handle = NULL;
|
||||
// Dynamically load the GDRAPI library symbols
|
||||
if (wrap_gdr_symbols() == ncclSuccess) {
|
||||
handle = wrap_gdr_open();
|
||||
|
||||
if (handle != NULL) {
|
||||
ncclResult_t res;
|
||||
|
||||
// Query the version of libgdrapi
|
||||
NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
|
||||
|
||||
// Query the version of gdrdrv driver
|
||||
NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
|
||||
|
||||
// Only support GDRAPI 2.1 and later
|
||||
if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
|
||||
goto error;
|
||||
}
|
||||
else
|
||||
INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
|
||||
}
|
||||
}
|
||||
return handle;
|
||||
error:
|
||||
if (handle != NULL) (void) wrap_gdr_close(handle);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
|
||||
gdr_info_t info;
|
||||
size_t mapSize;
|
||||
gdr_mh_t mh;
|
||||
char *devMem;
|
||||
void *gdrMap;
|
||||
|
||||
mapSize = sizeof(T)*nelem;
|
||||
|
||||
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
|
||||
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
|
||||
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
|
||||
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
|
||||
size_t align = alignedAddr - (uint64_t)devMem;
|
||||
|
||||
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
|
||||
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
|
||||
|
||||
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
|
||||
//TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);
|
||||
|
||||
NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info));
|
||||
|
||||
// Will offset ever be non zero ?
|
||||
ssize_t off = info.va - alignedAddr;
|
||||
|
||||
gdr_mem_desc_t* md;
|
||||
NCCLCHECK(ncclCalloc(&md, 1));
|
||||
md->gdrDevMem = devMem;
|
||||
md->gdrMap = gdrMap;
|
||||
md->gdrMapSize = mapSize;
|
||||
md->gdrOffset = off+align;
|
||||
md->gdrMh = mh;
|
||||
*gdrHandle = md;
|
||||
|
||||
*ptr = (T *)((char *)gdrMap+off);
|
||||
if (devPtr) *devPtr = (T *)(devMem+off+align);
|
||||
|
||||
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
|
||||
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
||||
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
||||
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
|
||||
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
|
||||
CUDACHECK(cudaFree(md->gdrDevMem));
|
||||
free(md);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#endif // End include guard
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -28,7 +28,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
|
||||
@@ -91,13 +91,11 @@ struct ncclTopoRanks {
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings);
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph);
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
|
||||
#include "info.h"
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -1089,7 +1089,7 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
|
||||
static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
|
||||
int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
||||
if (ret != IBV_SUCCESS) {
|
||||
WARN("ibv_post_send() failed with error %s", strerror(ret));
|
||||
WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -18,8 +18,7 @@ typedef enum {
|
||||
ncclPatternTreeUp,
|
||||
ncclPatternTreeDown,
|
||||
ncclPatternTreeUpDown,
|
||||
ncclPatternCollTreeUp,
|
||||
ncclPatternCollTreeDown
|
||||
ncclPatternCollTreeUpDown
|
||||
} ncclPattern_t;
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
@@ -49,6 +48,8 @@ struct ncclInfo {
|
||||
int nchunksPerLoop;
|
||||
ssize_t sendbytes;
|
||||
ssize_t recvbytes;
|
||||
int recvChunkSize;
|
||||
int sendChunkSize;
|
||||
uint32_t delta;
|
||||
int channelId;
|
||||
};
|
||||
|
||||
+10
-5
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -47,11 +47,12 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
ncclNetHandle_t handle;
|
||||
void* gpuPtr = NULL;
|
||||
void* mHandle = NULL;
|
||||
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
|
||||
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
|
||||
NCCLCHECK(ncclNetAccept(lComm, &rComm));
|
||||
CUDACHECK(cudaMalloc(&gpuPtr, GPU_BUF_SIZE));
|
||||
ncclResult_t ret;
|
||||
ncclDebugNoWarn = NCCL_NET;
|
||||
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
|
||||
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
|
||||
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
|
||||
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
|
||||
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
|
||||
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
|
||||
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
|
||||
@@ -60,9 +61,13 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
}
|
||||
ncclDebugNoWarn = 0;
|
||||
CUDACHECK(cudaFree(gpuPtr));
|
||||
cleanup4:
|
||||
NCCLCHECK(ncclNetCloseRecv(rComm));
|
||||
cleanup3:
|
||||
NCCLCHECK(ncclNetCloseSend(sComm));
|
||||
cleanup2:
|
||||
NCCLCHECK(ncclNetCloseListen(lComm));
|
||||
cleanup1:
|
||||
break;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
+54
-29
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -14,47 +14,66 @@ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }
|
||||
struct ncclProxyArgs;
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
struct ncclProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
|
||||
|
||||
struct ncclProxySubArgs {
|
||||
struct ncclChannel* channel;
|
||||
struct ncclConnector* connector;
|
||||
size_t sendbytes;
|
||||
size_t recvbytes;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int protocol;
|
||||
int segment; // Only for profiling
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
ssize_t sendbytes;
|
||||
ssize_t recvbytes;
|
||||
int sendChunkSize;
|
||||
int recvChunkSize;
|
||||
int delta;
|
||||
|
||||
// Internal state
|
||||
uint64_t base;
|
||||
uint64_t posted;
|
||||
uint64_t received; // Only used by recv proxy to wait for flush.
|
||||
uint64_t received;
|
||||
uint64_t flushed;
|
||||
uint64_t transmitted;
|
||||
uint64_t done;
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
};
|
||||
|
||||
struct ncclProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
|
||||
int nsubs;
|
||||
int done;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int chunkSize;
|
||||
uint64_t opCount;
|
||||
uint64_t commOpCount;
|
||||
int protocol;
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
ncclPattern_t pattern;
|
||||
int root;
|
||||
int state;
|
||||
char* sharedBuff[NCCL_STEPS];
|
||||
int sharedSize[NCCL_STEPS];
|
||||
|
||||
int idle;
|
||||
|
||||
// Element linking
|
||||
pthread_mutex_t mutex;
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
struct ncclProxyArgs* nextGroup;
|
||||
struct ncclProxyArgs** proxyAppendPtr;
|
||||
};
|
||||
|
||||
struct ncclProxySharedBuffers {
|
||||
int nslots;
|
||||
int slotSize;
|
||||
char* cudaBuff[2*MAXCHANNELS];
|
||||
int* cudaUsed[2*MAXCHANNELS];
|
||||
char* hostBuff[2*MAXCHANNELS];
|
||||
int* hostUsed[2*MAXCHANNELS];
|
||||
int size;
|
||||
char* cudaBuff;
|
||||
char* hostBuff;
|
||||
struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
|
||||
// Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
|
||||
struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
|
||||
void* collNetResources;
|
||||
};
|
||||
|
||||
struct ncclProxyPool;
|
||||
@@ -63,11 +82,16 @@ struct ncclProxyState {
|
||||
pthread_mutex_t opsMutex;
|
||||
pthread_mutex_t poolMutex;
|
||||
bool stop;
|
||||
struct ncclProxySharedBuffers* sharedBuffs;
|
||||
struct ncclProxyArgs* ops;
|
||||
struct ncclProxyArgs* nextOps;
|
||||
struct ncclProxySharedBuffers sharedBuffs;
|
||||
struct ncclProxyArgs* ops; // Running operations, used by proxy thread
|
||||
struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex
|
||||
struct ncclProxyArgs* postedOpsEnd;
|
||||
struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled)
|
||||
struct ncclProxyArgs* nextOpsEnd;
|
||||
struct ncclProxyArgs* pool;
|
||||
struct ncclProxyArgs* pool; // Free operations for main thread
|
||||
struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread
|
||||
struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex
|
||||
|
||||
struct ncclProxyPool* pools;
|
||||
};
|
||||
|
||||
@@ -79,15 +103,16 @@ enum proxyMode {
|
||||
proxyTo = 2
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
|
||||
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
||||
|
||||
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
|
||||
ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -41,7 +41,7 @@ struct ncclConnect {
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
|
||||
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
|
||||
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
|
||||
ncclResult_t (*free)(void*);
|
||||
ncclResult_t (*proxy)(struct ncclProxyArgs*);
|
||||
@@ -54,7 +54,10 @@ struct ncclTransport {
|
||||
struct ncclTransportComm recv;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex);
|
||||
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user