Merge remote-tracking branch 'nccl/master' into develop

Bu işleme şunda yer alıyor:
Wenkai Du
2021-04-30 16:57:36 -07:00
işleme a4ea1fed5b
80 değiştirilmiş dosya ile 3136 ekleme ve 1675 silme
+8 -3
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -52,11 +52,16 @@ extern struct allocationTracker allocTracker[];
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
// Need async stream for P2P pre-connect + CUDA Graph
hipStream_t stream;
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
if (isFineGrain)
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
CUDACHECK(hipStreamSynchronize(stream));
CUDACHECK(hipStreamDestroy(stream));
int dev;
CUDACHECK(hipGetDevice(&dev));
if (dev < MAX_ALLOC_TRACK_NGPU) {
+3 -3
Dosyayı Görüntüle
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,8 +14,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
ncclResult_t bootstrapClose(void* commState);
+11 -2
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,6 +28,15 @@
} \
} while(false)
// Report failure but clear error and continue
#define CUDACHECKIGNORE(cmd) do { \
hipError_t err = cmd; \
if( err != hipSuccess ) { \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, hipGetErrorString(err)); \
(void) hipGetLastError(); \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
+2 -1
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -81,4 +81,5 @@ DECL_ALL
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define SENDRECV_SLICEFACTOR 1
#endif
+16 -8
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -15,6 +15,9 @@
// [/RCCL]
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
typedef void *cudaGraph_t;
typedef void *cudaGraphNode_t;
#define HIPRT_CB
#else
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
@@ -84,16 +87,17 @@ struct ncclComm {
int nNodes;
int localRanks;
enum { GROUP, PARALLEL } launchMode;
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
hipStream_t userStream;
bool userStreamSet;
hipEvent_t doneEvent;
hipEvent_t intDoneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
uint64_t lastOpCount;
// Collective operation counter
uint64_t collOpCount;
// Channels for collectives
int nChannels;
@@ -101,8 +105,6 @@ struct ncclComm {
int p2pnChannels;
int p2pnChannelsPerPeer;
int p2pChannels[MAXCHANNELS];
//Channels for collnet
int collNetnChannels;
// Buffer sizes
int buffSizes[NCCL_NUM_PROTOCOLS];
@@ -157,6 +159,7 @@ struct ncclComm {
struct ncclInfo* asyncOps;
int asyncOpCount;
size_t asyncTotalSize;
int lastChannel;
//list of async p2p operation queued in a group semantics
struct ncclP2Plist* p2pSends;
@@ -169,6 +172,11 @@ struct ncclComm {
int rootPid; // Process ID of root
// [/RCCL]
// Store info for cudaGraph
int usingCudaGraph; // Only use it during capture time, not launch time
struct ncclQueueInfo* enqueueInfo;
cudaGraphNode_t lastSetupNode;
unsigned long long lastCudaGraphId;
};
#endif
+1 -1
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+33 -9
Dosyayı Görüntüle
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -125,8 +125,9 @@ struct ncclConnInfo {
struct ncclConnector {
int connected;
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
void* transportResources;
struct ncclConnInfo conn;
struct ncclComm *comm;
};
@@ -151,11 +152,23 @@ struct ncclTree {
int down[NCCL_MAX_TREE_ARITY];
};
#define NCCL_MAX_DIRECT_ARITY 7
struct ncclDirect {
int depth;
int out;
int nHeads;
int headRank;
int shift;
int up[NCCL_MAX_DIRECT_ARITY];
int down[NCCL_MAX_DIRECT_ARITY];
};
#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2)
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_CONNS 3
struct ncclPeer {
struct ncclConnector send;
struct ncclConnector recv;
struct ncclConnector p2pSend;
struct ncclConnector p2pRecv;
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
};
struct ncclDevComm;
@@ -179,7 +192,6 @@ struct ncclWorkElem {
const void * sendbuff;
void * recvbuff;
uint64_t opCount;
// Op-specific fields.
union {
struct {
@@ -192,9 +204,15 @@ struct ncclWorkElem {
struct {
size_t sendCount;
size_t recvCount;
int sendChunkSize;
int recvChunkSize;
int32_t delta;
uint16_t nThreads;
} p2p;
struct {
uint16_t padding[15];
uint16_t opCount;
} op;
// [RCCL] Clique-based arguments
// NOTE: Follows same field structure as coll
// because nChannels is accessed from "coll" struct.
@@ -206,7 +224,7 @@ struct ncclWorkElem {
uint8_t nChannels;
} clique;
// [/RCCL]
uint64_t align[3];
uint64_t align[4];
};
};
struct ncclWork {
@@ -219,7 +237,7 @@ struct ncclChannel {
struct {
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collTree;
struct ncclDirect collTree;
int id;
@@ -241,6 +259,12 @@ struct ncclChannel {
float bw_cumulative;
int bw_count;
#endif
uint16_t index; // Only used by GPU
// GDRCOPY support
struct ncclWork* workFifoGdr;
struct ncclWork* workFifoDev;
void* gdrMemDesc;
};
int data[0x80];
};
+74 -7
Dosyayı Görüntüle
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,15 +11,82 @@
#include "group.h"
#include "collectives.h"
size_t ncclKernMaxLocalSize();
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
ncclResult_t ncclLaunchReset(ncclComm_t comm);
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
template<int USING_CUDA_GRAPH>
void HIPRT_CB ncclEnqueueHostSetup(void* arg);
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
// Enqueue information (for kernel and proxy) for each operation
struct ncclQueueElem {
struct ncclWorkElem work;
struct ncclProxyArgs proxyArgs;
struct ncclQueueElem* next;
};
// Store enqueue elements in a list
struct ncclQueueElemList {
struct ncclQueueElem* head;
struct ncclQueueElem* tail;
};
// Structure passed to CUDA graph
struct ncclQueueInfo {
ncclComm_t comm;
int maxChannels; // Dynamic version of gridDim
ncclResult_t ret; // Return value of host setup call
struct ncclQueueElemList elemList;
};
// Get next element from enqueue list
static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) {
if (eqInfo == NULL) return ncclInternalError;
struct ncclQueueElemList* list = &eqInfo->elemList;
if (list->tail != NULL) {
*elemOut = list->tail;
memset(*elemOut, 0, sizeof(struct ncclWorkElem) + sizeof(struct ncclProxyArgs));
} else {
NCCLCHECK(ncclCalloc(&list->tail, 1));
*elemOut = list->tail;
list->head = list->tail;
}
if (list->tail->next == NULL) {
NCCLCHECK(ncclCalloc(&list->tail->next, 1));
}
list->tail = list->tail->next;
return ncclSuccess;
}
// Reset element queue
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
if (eqInfo == NULL) return ncclInternalError;
eqInfo->maxChannels = 0;
eqInfo->ret = ncclSuccess;
eqInfo->elemList.tail = eqInfo->elemList.head;
return ncclSuccess;
}
// Destroy enqueue info space
// used by both CUDA graph and non CUDA graph
static void ncclDestroyQueueInfo(void* ptr) {
if (ptr == NULL) return;
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
struct ncclQueueElem* head = eqInfo->elemList.head;
while (head != NULL) {
struct ncclQueueElem* temp = head;
head = head->next;
free(temp);
}
free(eqInfo);
}
#endif // End include guard
+272
Dosyayı Görüntüle
@@ -0,0 +1,272 @@
/*************************************************************************
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GDRWRAP_H_
#define NCCL_GDRWRAP_H_
#include "nccl.h"
#include <stdint.h> // for standard [u]intX_t types
#include <stdio.h>
// These can be used if the GDR library isn't thread safe
#include <pthread.h>
extern pthread_mutex_t gdrLock;
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
#define GDRLOCKCALL(cmd, ret) do { \
GDRLOCK(); \
ret = cmd; \
GDRUNLOCK(); \
} while(false)
#define GDRCHECK(cmd) do { \
int e; \
/* GDRLOCKCALL(cmd, e); */ \
e = cmd; \
if( e != 0 ) { \
WARN("GDRCOPY failure %d", e); \
return ncclSystemError; \
} \
} while(false)
// This is required as the GDR memory is mapped WC
#if !defined(__NVCC__)
#if defined(__PPC__)
static inline void wc_store_fence(void) { asm volatile("sync") ; }
#elif defined(__x86_64__)
#include <immintrin.h>
static inline void wc_store_fence(void) { _mm_sfence(); }
#elif defined(__aarch64__)
#ifdef __cplusplus
#include <atomic>
static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
#else
#include <stdatomic.h>
static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
#endif
#endif
#endif
//#define GDR_DIRECT 1
#ifdef GDR_DIRECT
// Call the GDR API library code directly rather than via
// dlopen() wrappers
#include <gdrapi.h>
static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
GDRCHECK(gdr_unpin_buffer(g, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
GDRCHECK(gdr_get_info(g, handle, info));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size));
return ncclSuccess;
}
static void wrap_gdr_runtime_get_version(int *major, int *minor) {
gdr_runtime_get_version(major, minor);
return ncclSuccess;
}
static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
gdr_driver_get_version(g, major, minor);
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
return ncclSuccess;
}
#else
// Dynamically handle dependency the GDR API library
/* Extracted from gdrapi.h (v2.1 Nov 2020) */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
struct gdr;
typedef struct gdr *gdr_t;
typedef struct gdr_mh_s {
unsigned long h;
} gdr_mh_t;
struct gdr_info {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
};
typedef struct gdr_info gdr_info_t;
/* End of gdrapi.h */
ncclResult_t wrap_gdr_symbols(void);
gdr_t wrap_gdr_open(void);
ncclResult_t wrap_gdr_close(gdr_t g);
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
#endif // GDR_DIRECT
// Global GDR driver handle
extern gdr_t ncclGdrCopy;
#include "alloc.h"
typedef struct gdr_mem_desc {
void *gdrDevMem;
void *gdrMap;
size_t gdrOffset;
size_t gdrMapSize;
gdr_mh_t gdrMh;
} gdr_mem_desc_t;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
static gdr_t ncclGdrInit() {
return NULL;
}
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
return ncclSuccess;
}
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
return ncclSuccess;
}
#else
static gdr_t ncclGdrInit() {
int libMajor, libMinor, drvMajor, drvMinor;
gdr_t handle = NULL;
// Dynamically load the GDRAPI library symbols
if (wrap_gdr_symbols() == ncclSuccess) {
handle = wrap_gdr_open();
if (handle != NULL) {
ncclResult_t res;
// Query the version of libgdrapi
NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
// Query the version of gdrdrv driver
NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
// Only support GDRAPI 2.1 and later
if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
goto error;
}
else
INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
}
}
return handle;
error:
if (handle != NULL) (void) wrap_gdr_close(handle);
return NULL;
}
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
gdr_info_t info;
size_t mapSize;
gdr_mh_t mh;
char *devMem;
void *gdrMap;
mapSize = sizeof(T)*nelem;
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
size_t align = alignedAddr - (uint64_t)devMem;
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
//TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);
NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info));
// Will offset ever be non zero ?
ssize_t off = info.va - alignedAddr;
gdr_mem_desc_t* md;
NCCLCHECK(ncclCalloc(&md, 1));
md->gdrDevMem = devMem;
md->gdrMap = gdrMap;
md->gdrMapSize = mapSize;
md->gdrOffset = off+align;
md->gdrMh = mh;
*gdrHandle = md;
*ptr = (T *)((char *)gdrMap+off);
if (devPtr) *devPtr = (T *)(devMem+off+align);
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
return ncclSuccess;
}
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
CUDACHECK(hipFree(md->gdrDevMem));
free(md);
return ncclSuccess;
}
#endif
#endif // End include guard
+5 -6
Dosyayı Görüntüle
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,7 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
@@ -93,13 +94,11 @@ struct ncclTopoRanks {
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, int nc);
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
#include "info.h"
+1 -1
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+2 -2
Dosyayı Görüntüle
@@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -1089,7 +1089,7 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
if (ret != IBV_SUCCESS) {
WARN("ibv_post_send() failed with error %s", strerror(ret));
WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr);
return ncclSystemError;
}
return ncclSuccess;
+5 -4
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,8 +19,7 @@ typedef enum {
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown,
ncclPatternCollTreeUp,
ncclPatternCollTreeDown
ncclPatternCollTreeUpDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
@@ -50,6 +49,8 @@ struct ncclInfo {
int nchunksPerLoop;
ssize_t sendbytes;
ssize_t recvbytes;
int recvChunkSize;
int sendChunkSize;
uint32_t delta;
int channelId;
};
+10 -5
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -53,11 +53,12 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
NCCLCHECK(ncclNetAccept(lComm, &rComm));
CUDACHECK(hipExtMallocWithFlags(&gpuPtr, GPU_BUF_SIZE, hipDeviceMallocFinegrained));
ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
@@ -66,9 +67,13 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
}
ncclDebugNoWarn = 0;
CUDACHECK(hipFree(gpuPtr));
cleanup4:
NCCLCHECK(ncclNetCloseRecv(rComm));
cleanup3:
NCCLCHECK(ncclNetCloseSend(sComm));
cleanup2:
NCCLCHECK(ncclNetCloseListen(lComm));
cleanup1:
break;
}
return ncclSuccess;
+1
Dosyayı Görüntüle
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+56 -30
Dosyayı Görüntüle
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,48 +15,67 @@ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
struct ncclProxyArgs {
proxyProgressFunc_t progress;
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
struct ncclProxySubArgs {
struct ncclChannel* channel;
struct ncclConnector* connector;
size_t sendbytes;
size_t recvbytes;
int sliceSteps;
int chunkSteps;
int nsteps;
uint64_t opCount;
int protocol;
int segment; // Only for profiling
ncclDataType_t dtype;
ncclRedOp_t redOp;
int state; // add component before this line -- it is left out during initialization
ssize_t sendbytes;
ssize_t recvbytes;
int sendChunkSize;
int recvChunkSize;
int delta;
// Internal state
uint64_t base;
uint64_t posted;
uint64_t received; // Only used by recv proxy to wait for flush.
uint64_t received;
uint64_t flushed;
uint64_t transmitted;
uint64_t done;
uint64_t end;
uint64_t hdp_flushed;
void* requests[NCCL_STEPS];
};
struct ncclProxyArgs {
proxyProgressFunc_t progress;
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
int nsubs;
int done;
int sliceSteps;
int chunkSteps;
int chunkSize;
uint64_t opCount;
uint64_t commOpCount;
int protocol;
ncclDataType_t dtype;
ncclRedOp_t redOp;
ncclPattern_t pattern;
int root;
int state;
char* sharedBuff[NCCL_STEPS];
int sharedSize[NCCL_STEPS];
int idle;
uint64_t hdp_flushed;
// Element linking
pthread_mutex_t mutex;
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
struct ncclProxyArgs* nextGroup;
struct ncclProxyArgs** proxyAppendPtr;
};
struct ncclProxySharedBuffers {
int nslots;
int slotSize;
char* cudaBuff[2*MAXCHANNELS];
int* cudaUsed[2*MAXCHANNELS];
char* hostBuff[2*MAXCHANNELS];
int* hostUsed[2*MAXCHANNELS];
int size;
char* cudaBuff;
char* hostBuff;
struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
// Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
void* collNetResources;
};
struct ncclProxyPool;
@@ -64,11 +84,16 @@ struct ncclProxyState {
pthread_mutex_t opsMutex;
pthread_mutex_t poolMutex;
bool stop;
struct ncclProxySharedBuffers* sharedBuffs;
struct ncclProxyArgs* ops;
struct ncclProxyArgs* nextOps;
struct ncclProxySharedBuffers sharedBuffs;
struct ncclProxyArgs* ops; // Running operations, used by proxy thread
struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex
struct ncclProxyArgs* postedOpsEnd;
struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled)
struct ncclProxyArgs* nextOpsEnd;
struct ncclProxyArgs* pool;
struct ncclProxyArgs* pool; // Free operations for main thread
struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread
struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex
struct ncclProxyPool* pools;
};
@@ -80,15 +105,16 @@ enum proxyMode {
proxyTo = 2
};
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
#include <unistd.h>
+1 -1
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+1
Dosyayı Görüntüle
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+7 -4
Dosyayı Görüntüle
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -41,7 +41,7 @@ struct ncclConnect {
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -54,7 +54,10 @@ struct ncclTransport {
struct ncclTransportComm recv;
};
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex);
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
#endif
-2
Dosyayı Görüntüle
@@ -37,6 +37,4 @@ static long log2i(long n) {
return l;
}
int busIdToCudaDev(int64_t busId);
#endif