Merge remote-tracking branch 'nccl/master' into develop
Bu işleme şunda yer alıyor:
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -52,11 +52,16 @@ extern struct allocationTracker allocTracker[];
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
// Need async stream for P2P pre-connect + CUDA Graph
|
||||
hipStream_t stream;
|
||||
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
if (isFineGrain)
|
||||
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||
CUDACHECK(hipStreamSynchronize(stream));
|
||||
CUDACHECK(hipStreamDestroy(stream));
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -14,8 +14,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
|
||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
|
||||
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
|
||||
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
|
||||
ncclResult_t bootstrapClose(void* commState);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -28,6 +28,15 @@
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Report failure but clear error and continue
|
||||
#define CUDACHECKIGNORE(cmd) do { \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, hipGetErrorString(err)); \
|
||||
(void) hipGetLastError(); \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#include <errno.h>
|
||||
// Check system calls
|
||||
#define SYSCHECK(call, name) do { \
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -81,4 +81,5 @@ DECL_ALL
|
||||
#define REDUCE_SLICESTEPS 1
|
||||
#define REDUCE_CHUNKSTEPS 1
|
||||
#define SENDRECV_SLICEFACTOR 1
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -15,6 +15,9 @@
|
||||
// [/RCCL]
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
typedef void *cudaGraph_t;
|
||||
typedef void *cudaGraphNode_t;
|
||||
#define HIPRT_CB
|
||||
#else
|
||||
#if CUDART_VERSION < 9000
|
||||
struct cudaLaunchParams {
|
||||
@@ -84,16 +87,17 @@ struct ncclComm {
|
||||
int nNodes;
|
||||
int localRanks;
|
||||
|
||||
enum { GROUP, PARALLEL } launchMode;
|
||||
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
|
||||
hipStream_t userStream;
|
||||
bool userStreamSet;
|
||||
hipEvent_t doneEvent;
|
||||
hipEvent_t intDoneEvent;
|
||||
bool checkPointers;
|
||||
|
||||
// Counter to make sure collectives match (needed for bcast/reduce
|
||||
// where syncs are not symmetric).
|
||||
// Counter for tracking CUDA launches (P2P and collectives included)
|
||||
uint64_t opCount;
|
||||
uint64_t lastOpCount;
|
||||
// Collective operation counter
|
||||
uint64_t collOpCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
@@ -101,8 +105,6 @@ struct ncclComm {
|
||||
int p2pnChannels;
|
||||
int p2pnChannelsPerPeer;
|
||||
int p2pChannels[MAXCHANNELS];
|
||||
//Channels for collnet
|
||||
int collNetnChannels;
|
||||
|
||||
// Buffer sizes
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
@@ -157,6 +159,7 @@ struct ncclComm {
|
||||
struct ncclInfo* asyncOps;
|
||||
int asyncOpCount;
|
||||
size_t asyncTotalSize;
|
||||
int lastChannel;
|
||||
|
||||
//list of async p2p operation queued in a group semantics
|
||||
struct ncclP2Plist* p2pSends;
|
||||
@@ -169,6 +172,11 @@ struct ncclComm {
|
||||
int rootPid; // Process ID of root
|
||||
// [/RCCL]
|
||||
|
||||
// Store info for cudaGraph
|
||||
int usingCudaGraph; // Only use it during capture time, not launch time
|
||||
struct ncclQueueInfo* enqueueInfo;
|
||||
cudaGraphNode_t lastSetupNode;
|
||||
unsigned long long lastCudaGraphId;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -125,8 +125,9 @@ struct ncclConnInfo {
|
||||
struct ncclConnector {
|
||||
int connected;
|
||||
struct ncclProxyArgs *proxyAppend;
|
||||
struct ncclProxyArgs **proxyAppendPtr;
|
||||
struct ncclTransportComm* transportComm;
|
||||
void* transportResources; // Host-side resources
|
||||
void* transportResources;
|
||||
struct ncclConnInfo conn;
|
||||
struct ncclComm *comm;
|
||||
};
|
||||
@@ -151,11 +152,23 @@ struct ncclTree {
|
||||
int down[NCCL_MAX_TREE_ARITY];
|
||||
};
|
||||
|
||||
#define NCCL_MAX_DIRECT_ARITY 7
|
||||
struct ncclDirect {
|
||||
int depth;
|
||||
int out;
|
||||
int nHeads;
|
||||
int headRank;
|
||||
int shift;
|
||||
int up[NCCL_MAX_DIRECT_ARITY];
|
||||
int down[NCCL_MAX_DIRECT_ARITY];
|
||||
};
|
||||
|
||||
#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2)
|
||||
#define NCCL_CONN_IDX_P2P_NET 2
|
||||
#define NCCL_MAX_CONNS 3
|
||||
struct ncclPeer {
|
||||
struct ncclConnector send;
|
||||
struct ncclConnector recv;
|
||||
struct ncclConnector p2pSend;
|
||||
struct ncclConnector p2pRecv;
|
||||
struct ncclConnector send[NCCL_MAX_CONNS];
|
||||
struct ncclConnector recv[NCCL_MAX_CONNS];
|
||||
};
|
||||
|
||||
struct ncclDevComm;
|
||||
@@ -179,7 +192,6 @@ struct ncclWorkElem {
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
|
||||
uint64_t opCount;
|
||||
// Op-specific fields.
|
||||
union {
|
||||
struct {
|
||||
@@ -192,9 +204,15 @@ struct ncclWorkElem {
|
||||
struct {
|
||||
size_t sendCount;
|
||||
size_t recvCount;
|
||||
int sendChunkSize;
|
||||
int recvChunkSize;
|
||||
int32_t delta;
|
||||
uint16_t nThreads;
|
||||
} p2p;
|
||||
struct {
|
||||
uint16_t padding[15];
|
||||
uint16_t opCount;
|
||||
} op;
|
||||
// [RCCL] Clique-based arguments
|
||||
// NOTE: Follows same field structure as coll
|
||||
// because nChannels is accessed from "coll" struct.
|
||||
@@ -206,7 +224,7 @@ struct ncclWorkElem {
|
||||
uint8_t nChannels;
|
||||
} clique;
|
||||
// [/RCCL]
|
||||
uint64_t align[3];
|
||||
uint64_t align[4];
|
||||
};
|
||||
};
|
||||
struct ncclWork {
|
||||
@@ -219,7 +237,7 @@ struct ncclChannel {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree collTree;
|
||||
struct ncclDirect collTree;
|
||||
|
||||
int id;
|
||||
|
||||
@@ -241,6 +259,12 @@ struct ncclChannel {
|
||||
float bw_cumulative;
|
||||
int bw_count;
|
||||
#endif
|
||||
uint16_t index; // Only used by GPU
|
||||
|
||||
// GDRCOPY support
|
||||
struct ncclWork* workFifoGdr;
|
||||
struct ncclWork* workFifoDev;
|
||||
void* gdrMemDesc;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,15 +11,82 @@
|
||||
#include "group.h"
|
||||
#include "collectives.h"
|
||||
|
||||
size_t ncclKernMaxLocalSize();
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
|
||||
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
|
||||
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
|
||||
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
|
||||
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchReset(ncclComm_t comm);
|
||||
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
|
||||
template<int USING_CUDA_GRAPH>
|
||||
void HIPRT_CB ncclEnqueueHostSetup(void* arg);
|
||||
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
|
||||
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
|
||||
|
||||
// Enqueue information (for kernel and proxy) for each operation
|
||||
struct ncclQueueElem {
|
||||
struct ncclWorkElem work;
|
||||
struct ncclProxyArgs proxyArgs;
|
||||
struct ncclQueueElem* next;
|
||||
};
|
||||
|
||||
// Store enqueue elements in a list
|
||||
struct ncclQueueElemList {
|
||||
struct ncclQueueElem* head;
|
||||
struct ncclQueueElem* tail;
|
||||
};
|
||||
|
||||
// Structure passed to CUDA graph
|
||||
struct ncclQueueInfo {
|
||||
ncclComm_t comm;
|
||||
int maxChannels; // Dynamic version of gridDim
|
||||
ncclResult_t ret; // Return value of host setup call
|
||||
struct ncclQueueElemList elemList;
|
||||
};
|
||||
|
||||
// Get next element from enqueue list
|
||||
static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) {
|
||||
if (eqInfo == NULL) return ncclInternalError;
|
||||
struct ncclQueueElemList* list = &eqInfo->elemList;
|
||||
if (list->tail != NULL) {
|
||||
*elemOut = list->tail;
|
||||
memset(*elemOut, 0, sizeof(struct ncclWorkElem) + sizeof(struct ncclProxyArgs));
|
||||
} else {
|
||||
NCCLCHECK(ncclCalloc(&list->tail, 1));
|
||||
*elemOut = list->tail;
|
||||
list->head = list->tail;
|
||||
}
|
||||
if (list->tail->next == NULL) {
|
||||
NCCLCHECK(ncclCalloc(&list->tail->next, 1));
|
||||
}
|
||||
list->tail = list->tail->next;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Reset element queue
|
||||
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
|
||||
if (eqInfo == NULL) return ncclInternalError;
|
||||
eqInfo->maxChannels = 0;
|
||||
eqInfo->ret = ncclSuccess;
|
||||
eqInfo->elemList.tail = eqInfo->elemList.head;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Destroy enqueue info space
|
||||
// used by both CUDA graph and non CUDA graph
|
||||
static void ncclDestroyQueueInfo(void* ptr) {
|
||||
if (ptr == NULL) return;
|
||||
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
|
||||
struct ncclQueueElem* head = eqInfo->elemList.head;
|
||||
while (head != NULL) {
|
||||
struct ncclQueueElem* temp = head;
|
||||
head = head->next;
|
||||
free(temp);
|
||||
}
|
||||
free(eqInfo);
|
||||
}
|
||||
#endif // End include guard
|
||||
|
||||
@@ -0,0 +1,272 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_GDRWRAP_H_
|
||||
#define NCCL_GDRWRAP_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include <stdint.h> // for standard [u]intX_t types
|
||||
#include <stdio.h>
|
||||
|
||||
// These can be used if the GDR library isn't thread safe
|
||||
#include <pthread.h>
|
||||
extern pthread_mutex_t gdrLock;
|
||||
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
|
||||
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
|
||||
#define GDRLOCKCALL(cmd, ret) do { \
|
||||
GDRLOCK(); \
|
||||
ret = cmd; \
|
||||
GDRUNLOCK(); \
|
||||
} while(false)
|
||||
|
||||
#define GDRCHECK(cmd) do { \
|
||||
int e; \
|
||||
/* GDRLOCKCALL(cmd, e); */ \
|
||||
e = cmd; \
|
||||
if( e != 0 ) { \
|
||||
WARN("GDRCOPY failure %d", e); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// This is required as the GDR memory is mapped WC
|
||||
#if !defined(__NVCC__)
|
||||
#if defined(__PPC__)
|
||||
static inline void wc_store_fence(void) { asm volatile("sync") ; }
|
||||
#elif defined(__x86_64__)
|
||||
#include <immintrin.h>
|
||||
static inline void wc_store_fence(void) { _mm_sfence(); }
|
||||
#elif defined(__aarch64__)
|
||||
#ifdef __cplusplus
|
||||
#include <atomic>
|
||||
static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
|
||||
#else
|
||||
#include <stdatomic.h>
|
||||
static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//#define GDR_DIRECT 1
|
||||
#ifdef GDR_DIRECT
|
||||
// Call the GDR API library code directly rather than via
|
||||
// dlopen() wrappers
|
||||
#include <gdrapi.h>
|
||||
|
||||
static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
|
||||
static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
|
||||
static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
|
||||
static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
|
||||
GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
|
||||
GDRCHECK(gdr_unpin_buffer(g, handle));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
|
||||
GDRCHECK(gdr_get_info(g, handle, info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
|
||||
GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
|
||||
GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static void wrap_gdr_runtime_get_version(int *major, int *minor) {
|
||||
gdr_runtime_get_version(major, minor);
|
||||
return ncclSuccess;
|
||||
}
|
||||
static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
|
||||
gdr_driver_get_version(g, major, minor);
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
|
||||
GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
|
||||
GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#else
|
||||
// Dynamically handle dependency the GDR API library
|
||||
|
||||
/* Extracted from gdrapi.h (v2.1 Nov 2020) */
|
||||
|
||||
#define GPU_PAGE_SHIFT 16
|
||||
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
|
||||
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
|
||||
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
|
||||
|
||||
struct gdr;
|
||||
typedef struct gdr *gdr_t;
|
||||
|
||||
typedef struct gdr_mh_s {
|
||||
unsigned long h;
|
||||
} gdr_mh_t;
|
||||
|
||||
struct gdr_info {
|
||||
uint64_t va;
|
||||
uint64_t mapped_size;
|
||||
uint32_t page_size;
|
||||
uint64_t tm_cycles;
|
||||
uint32_t cycles_per_ms;
|
||||
unsigned mapped:1;
|
||||
unsigned wc_mapping:1;
|
||||
};
|
||||
typedef struct gdr_info gdr_info_t;
|
||||
|
||||
/* End of gdrapi.h */
|
||||
|
||||
ncclResult_t wrap_gdr_symbols(void);
|
||||
|
||||
gdr_t wrap_gdr_open(void);
|
||||
ncclResult_t wrap_gdr_close(gdr_t g);
|
||||
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
|
||||
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
|
||||
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
|
||||
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
|
||||
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
|
||||
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
|
||||
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
|
||||
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
|
||||
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
|
||||
|
||||
#endif // GDR_DIRECT
|
||||
|
||||
// Global GDR driver handle
|
||||
extern gdr_t ncclGdrCopy;
|
||||
|
||||
#include "alloc.h"
|
||||
|
||||
typedef struct gdr_mem_desc {
|
||||
void *gdrDevMem;
|
||||
void *gdrMap;
|
||||
size_t gdrOffset;
|
||||
size_t gdrMapSize;
|
||||
gdr_mh_t gdrMh;
|
||||
} gdr_mem_desc_t;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
static gdr_t ncclGdrInit() {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
#else
|
||||
static gdr_t ncclGdrInit() {
|
||||
int libMajor, libMinor, drvMajor, drvMinor;
|
||||
gdr_t handle = NULL;
|
||||
// Dynamically load the GDRAPI library symbols
|
||||
if (wrap_gdr_symbols() == ncclSuccess) {
|
||||
handle = wrap_gdr_open();
|
||||
|
||||
if (handle != NULL) {
|
||||
ncclResult_t res;
|
||||
|
||||
// Query the version of libgdrapi
|
||||
NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
|
||||
|
||||
// Query the version of gdrdrv driver
|
||||
NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
|
||||
|
||||
// Only support GDRAPI 2.1 and later
|
||||
if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
|
||||
goto error;
|
||||
}
|
||||
else
|
||||
INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
|
||||
}
|
||||
}
|
||||
return handle;
|
||||
error:
|
||||
if (handle != NULL) (void) wrap_gdr_close(handle);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
|
||||
gdr_info_t info;
|
||||
size_t mapSize;
|
||||
gdr_mh_t mh;
|
||||
char *devMem;
|
||||
void *gdrMap;
|
||||
|
||||
mapSize = sizeof(T)*nelem;
|
||||
|
||||
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
|
||||
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
|
||||
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
|
||||
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
|
||||
size_t align = alignedAddr - (uint64_t)devMem;
|
||||
|
||||
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
|
||||
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
|
||||
|
||||
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
|
||||
//TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);
|
||||
|
||||
NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info));
|
||||
|
||||
// Will offset ever be non zero ?
|
||||
ssize_t off = info.va - alignedAddr;
|
||||
|
||||
gdr_mem_desc_t* md;
|
||||
NCCLCHECK(ncclCalloc(&md, 1));
|
||||
md->gdrDevMem = devMem;
|
||||
md->gdrMap = gdrMap;
|
||||
md->gdrMapSize = mapSize;
|
||||
md->gdrOffset = off+align;
|
||||
md->gdrMh = mh;
|
||||
*gdrHandle = md;
|
||||
|
||||
*ptr = (T *)((char *)gdrMap+off);
|
||||
if (devPtr) *devPtr = (T *)(devMem+off+align);
|
||||
|
||||
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
|
||||
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
||||
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
||||
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
|
||||
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
|
||||
CUDACHECK(hipFree(md->gdrDevMem));
|
||||
free(md);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // End include guard
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -28,7 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
|
||||
@@ -93,13 +94,11 @@ struct ncclTopoRanks {
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, int nc);
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
|
||||
#include "info.h"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -1089,7 +1089,7 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
|
||||
static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
|
||||
int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
||||
if (ret != IBV_SUCCESS) {
|
||||
WARN("ibv_post_send() failed with error %s", strerror(ret));
|
||||
WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -19,8 +19,7 @@ typedef enum {
|
||||
ncclPatternTreeUp,
|
||||
ncclPatternTreeDown,
|
||||
ncclPatternTreeUpDown,
|
||||
ncclPatternCollTreeUp,
|
||||
ncclPatternCollTreeDown
|
||||
ncclPatternCollTreeUpDown
|
||||
} ncclPattern_t;
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
@@ -50,6 +49,8 @@ struct ncclInfo {
|
||||
int nchunksPerLoop;
|
||||
ssize_t sendbytes;
|
||||
ssize_t recvbytes;
|
||||
int recvChunkSize;
|
||||
int sendChunkSize;
|
||||
uint32_t delta;
|
||||
int channelId;
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -53,11 +53,12 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
ncclNetHandle_t handle;
|
||||
void* gpuPtr = NULL;
|
||||
void* mHandle = NULL;
|
||||
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
|
||||
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
|
||||
NCCLCHECK(ncclNetAccept(lComm, &rComm));
|
||||
CUDACHECK(hipExtMallocWithFlags(&gpuPtr, GPU_BUF_SIZE, hipDeviceMallocFinegrained));
|
||||
ncclResult_t ret;
|
||||
ncclDebugNoWarn = NCCL_NET;
|
||||
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
|
||||
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
|
||||
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
|
||||
CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
|
||||
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
|
||||
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
|
||||
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
|
||||
@@ -66,9 +67,13 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
}
|
||||
ncclDebugNoWarn = 0;
|
||||
CUDACHECK(hipFree(gpuPtr));
|
||||
cleanup4:
|
||||
NCCLCHECK(ncclNetCloseRecv(rComm));
|
||||
cleanup3:
|
||||
NCCLCHECK(ncclNetCloseSend(sComm));
|
||||
cleanup2:
|
||||
NCCLCHECK(ncclNetCloseListen(lComm));
|
||||
cleanup1:
|
||||
break;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
+56
-30
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -14,48 +15,67 @@ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }
|
||||
struct ncclProxyArgs;
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
struct ncclProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
|
||||
|
||||
struct ncclProxySubArgs {
|
||||
struct ncclChannel* channel;
|
||||
struct ncclConnector* connector;
|
||||
size_t sendbytes;
|
||||
size_t recvbytes;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int protocol;
|
||||
int segment; // Only for profiling
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
ssize_t sendbytes;
|
||||
ssize_t recvbytes;
|
||||
int sendChunkSize;
|
||||
int recvChunkSize;
|
||||
int delta;
|
||||
|
||||
// Internal state
|
||||
uint64_t base;
|
||||
uint64_t posted;
|
||||
uint64_t received; // Only used by recv proxy to wait for flush.
|
||||
uint64_t received;
|
||||
uint64_t flushed;
|
||||
uint64_t transmitted;
|
||||
uint64_t done;
|
||||
uint64_t end;
|
||||
uint64_t hdp_flushed;
|
||||
void* requests[NCCL_STEPS];
|
||||
};
|
||||
|
||||
struct ncclProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
|
||||
int nsubs;
|
||||
int done;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int chunkSize;
|
||||
uint64_t opCount;
|
||||
uint64_t commOpCount;
|
||||
int protocol;
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
ncclPattern_t pattern;
|
||||
int root;
|
||||
int state;
|
||||
char* sharedBuff[NCCL_STEPS];
|
||||
int sharedSize[NCCL_STEPS];
|
||||
|
||||
int idle;
|
||||
uint64_t hdp_flushed;
|
||||
|
||||
// Element linking
|
||||
pthread_mutex_t mutex;
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
struct ncclProxyArgs* nextGroup;
|
||||
struct ncclProxyArgs** proxyAppendPtr;
|
||||
};
|
||||
|
||||
struct ncclProxySharedBuffers {
|
||||
int nslots;
|
||||
int slotSize;
|
||||
char* cudaBuff[2*MAXCHANNELS];
|
||||
int* cudaUsed[2*MAXCHANNELS];
|
||||
char* hostBuff[2*MAXCHANNELS];
|
||||
int* hostUsed[2*MAXCHANNELS];
|
||||
int size;
|
||||
char* cudaBuff;
|
||||
char* hostBuff;
|
||||
struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
|
||||
// Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
|
||||
struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
|
||||
void* collNetResources;
|
||||
};
|
||||
|
||||
struct ncclProxyPool;
|
||||
@@ -64,11 +84,16 @@ struct ncclProxyState {
|
||||
pthread_mutex_t opsMutex;
|
||||
pthread_mutex_t poolMutex;
|
||||
bool stop;
|
||||
struct ncclProxySharedBuffers* sharedBuffs;
|
||||
struct ncclProxyArgs* ops;
|
||||
struct ncclProxyArgs* nextOps;
|
||||
struct ncclProxySharedBuffers sharedBuffs;
|
||||
struct ncclProxyArgs* ops; // Running operations, used by proxy thread
|
||||
struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex
|
||||
struct ncclProxyArgs* postedOpsEnd;
|
||||
struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled)
|
||||
struct ncclProxyArgs* nextOpsEnd;
|
||||
struct ncclProxyArgs* pool;
|
||||
struct ncclProxyArgs* pool; // Free operations for main thread
|
||||
struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread
|
||||
struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex
|
||||
|
||||
struct ncclProxyPool* pools;
|
||||
};
|
||||
|
||||
@@ -80,15 +105,16 @@ enum proxyMode {
|
||||
proxyTo = 2
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
|
||||
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
||||
|
||||
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
|
||||
ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
|
||||
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -41,7 +41,7 @@ struct ncclConnect {
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
|
||||
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
|
||||
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
|
||||
ncclResult_t (*free)(void*);
|
||||
ncclResult_t (*proxy)(struct ncclProxyArgs*);
|
||||
@@ -54,7 +54,10 @@ struct ncclTransport {
|
||||
struct ncclTransportComm recv;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex);
|
||||
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
||||
#endif
|
||||
|
||||
@@ -37,6 +37,4 @@ static long log2i(long n) {
|
||||
return l;
|
||||
}
|
||||
|
||||
int busIdToCudaDev(int64_t busId);
|
||||
|
||||
#endif
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle