Merge remote-tracking branch 'nccl/master' into develop
Этот коммит содержится в:
@@ -193,10 +193,9 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
|
||||
#define MAX_ALLOC_TRACK_NGPU 128
|
||||
extern struct allocationTracker allocTracker[];
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#if ROCM_VERSION >= 70000
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
#include "rocmwrap.h"
|
||||
|
||||
// ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer
|
||||
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
|
||||
@@ -262,7 +261,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
|
||||
prop.requestedHandleTypes = type;
|
||||
prop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
|
||||
// CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
|
||||
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||
ALIGN_SIZE(size, granularity);
|
||||
@@ -318,21 +317,21 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
extern int ncclCuMemEnable();
|
||||
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_ALLOCATOR_H_
|
||||
#define NCCL_ALLOCATOR_H_
|
||||
|
||||
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr);
|
||||
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr);
|
||||
|
||||
#endif
|
||||
@@ -122,6 +122,10 @@ typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm, int* excludeRanksList,
|
||||
int excludeRanksCount, ncclComm_t *newcomm,
|
||||
ncclConfig_t* config, int shrinkFlags);
|
||||
|
||||
typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key,
|
||||
ncclComm_t* newcomm, ncclConfig_t* config);
|
||||
|
||||
@@ -158,6 +162,10 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,
|
||||
|
||||
typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
|
||||
|
||||
typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
|
||||
typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);
|
||||
|
||||
typedef struct rcclApiFuncTable
|
||||
{
|
||||
uint64_t size;
|
||||
@@ -184,6 +192,7 @@ typedef struct rcclApiFuncTable
|
||||
ncclCommFinalize_fn_t ncclCommFinalize_fn;
|
||||
ncclCommDestroy_fn_t ncclCommDestroy_fn;
|
||||
ncclCommAbort_fn_t ncclCommAbort_fn;
|
||||
ncclCommShrink_fn_t ncclCommShrink_fn;
|
||||
ncclCommSplit_fn_t ncclCommSplit_fn;
|
||||
ncclGetErrorString_fn_t ncclGetErrorString_fn;
|
||||
ncclGetLastError_fn_t ncclGetLastError_fn;
|
||||
@@ -198,6 +207,8 @@ typedef struct rcclApiFuncTable
|
||||
mscclUnloadAlgo_fn_t mscclUnloadAlgo_fn;
|
||||
ncclCommRegister_fn_t ncclCommRegister_fn;
|
||||
ncclCommDeregister_fn_t ncclCommDeregister_fn;
|
||||
ncclCommWindowRegister_fn_t ncclCommWindowRegister_fn;
|
||||
ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
|
||||
ncclAllReduceWithBias_fn_t ncclAllReduceWithBias_fn;
|
||||
|
||||
} rcclApiFuncTable;
|
||||
|
||||
+163
-23
@@ -19,6 +19,28 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Base case of the variadic minimum: a single value is its own minimum.
template<typename Int>
constexpr static __host__ __device__ Int minval(Int a) { return a; }

// Returns the smallest of two or more values by folding the first two
// arguments into one and recursing on the rest. The first two arguments must
// share the type `Int`, which is also the return type.
template<typename Int, typename ...Rest>
constexpr static __host__ __device__ Int minval(Int a, Int b, Rest ...rest) {
#if !__CUDA_ARCH__
  // Host compilation pass: plain comparison.
  return minval(a < b ? a : b, rest...);
#else
  // Device compilation pass: defer to min().
  return minval(min(a, b), rest...);
#endif
}
|
||||
|
||||
// Base case of the variadic maximum: a single value is its own maximum.
template<typename Int>
constexpr static __host__ __device__ Int maxval(Int a) { return a; }

// Returns the largest of two or more values by folding the first two
// arguments into one and recursing on the rest. The first two arguments must
// share the type `Int`, which is also the return type.
template<typename Int, typename ...Rest>
constexpr static __host__ __device__ Int maxval(Int a, Int b, Rest ...rest) {
#if !__CUDA_ARCH__
  // Host compilation pass: plain comparison.
  return maxval(a > b ? a : b, rest...);
#else
  // Device compilation pass: defer to max().
  return maxval(max(a, b), rest...);
#endif
}
|
||||
|
||||
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
|
||||
@@ -32,32 +54,150 @@
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z divUp(X x, Y y) {
|
||||
static __host__ __device__ constexpr Z divUp(X x, Y y) {
|
||||
return (x+y-1)/y;
|
||||
}
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundUp(X x, Y y) {
|
||||
static __host__ __device__ constexpr Z roundUp(X x, Y y) {
|
||||
return (x+y-1) - (x+y-1)%y;
|
||||
}
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundDown(X x, Y y) {
|
||||
static __host__ __device__ constexpr Z roundDown(X x, Y y) {
|
||||
return x - x%y;
|
||||
}
|
||||
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignUp(X x, int a) {
|
||||
static __host__ __device__ constexpr Z alignUp(X x, int a) {
|
||||
return (x + a-1) & Z(-a);
|
||||
}
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignDown(X x, int a) {
|
||||
static __host__ __device__ constexpr Z alignDown(X x, int a) {
|
||||
return x & Z(-a);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int countOneBits(Int x) {
|
||||
constexpr __host__ __device__ bool isPow2(Int x) {
|
||||
return (x & (x-1)) == 0;
|
||||
}
|
||||
|
||||
// Adds `delta4G` to the second 32-bit word of `base`'s object representation
// and returns the modified value; the first 32-bit word is left untouched.
// NOTE(review): presumably T is an 8-byte value (pointer or 64-bit integer)
// on a little-endian target, so this moves the value by delta4G * 4GiB --
// confirm against callers.
template<typename T>
static __host__ __device__ T add4G(T base, int delta4G) {
  union { T whole; uint32_t word[2]; } view;
  view.whole = base;
  view.word[1] += delta4G;
  return view.whole;
}
|
||||
|
||||
// Advances the second 32-bit word of `ptr` by `delta4G`. If the result reaches
// or passes `hi4G`, the span (hi4G - lo4G) is subtracted so the word wraps
// back. The first 32-bit word is preserved.
// NOTE(review): presumably T is 8 bytes on a little-endian target and
// [lo4G, hi4G) is a window expressed in 4GiB units -- confirm against callers.
template<typename T>
static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union { T whole; uint32_t word[2]; } view;
  view.whole = ptr;
  view.word[1] += delta4G;
  if (view.word[1] >= hi4G) {
    view.word[1] -= hi4G-lo4G;
  }
  return view.whole;
}
|
||||
|
||||
// Moves the second 32-bit word of `ptr` back by `delta4G`. If the result drops
// below `lo4G`, the span (hi4G - lo4G) is added so the word wraps forward. The
// first 32-bit word is preserved.
// NOTE(review): presumably T is 8 bytes on a little-endian target and
// [lo4G, hi4G) is a window expressed in 4GiB units -- confirm against callers.
template<typename T>
static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union { T whole; uint32_t word[2]; } view;
  view.whole = ptr;
  view.word[1] -= delta4G;
  if (view.word[1] < lo4G) {
    view.word[1] += hi4G-lo4G;
  }
  return view.whole;
}
|
||||
|
||||
// Computes the 32-bit fixed-point reciprocal floor(2^32 / x), truncated to
// 32 bits, for use as the `yrcp` argument of the 32-bit fast-division helpers.
// For x == 1 the quotient 2^32 does not fit and truncates to 0.
// x must be non-zero (the expression divides by x).
constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x) {
  return static_cast<uint32_t>((uint64_t(1) << 32) / x);
}
|
||||
// 64-bit counterpart of idivRcp32: floor((2^64 - 1) / x), plus one when x is a
// power of two (as reported by isPow2). For x == 1 the sum wraps to 0.
// x must be non-zero (the expression divides by x).
constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x) {
  return (~uint64_t(0)) / x + (isPow2(x) ? uint64_t(1) : uint64_t(0));
}
|
||||
|
||||
// Returns the upper 32 bits of the full 64-bit product a * b.
static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) {
#if __CUDA_ARCH__
  // Device pass: use the hardware high-multiply intrinsic.
  return __umulhi(a, b);
#else
  // Host pass: widen to 64 bits, multiply, keep the top half.
  const uint64_t product = static_cast<uint64_t>(a) * b;
  return static_cast<uint32_t>(product >> 32);
#endif
}
|
||||
// Returns the upper 64 bits of the full 128-bit product a * b.
static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) {
#if __CUDA_ARCH__
  // Device pass: use the hardware high-multiply intrinsic.
  return __umul64hi(a, b);
#else
  // Host pass: widen to 128 bits, multiply, keep the top half.
  const unsigned __int128 product = static_cast<unsigned __int128>(a) * b;
  return static_cast<uint64_t>(product >> 64);
#endif
}
|
||||
|
||||
// Builds the reciprocal of the product x*y from the reciprocals of x and y
// without performing an integer division.
// A zero reciprocal on one side short-circuits to the other side's reciprocal
// (idivRcp32(1) evaluates to 0, so this presumably encodes "divisor is 1").
static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
  if (xrcp == 0) return yrcp;
  if (yrcp == 0) return xrcp;
  const uint32_t xy = x*y;
  const uint32_t estimate = mul32hi(xrcp, yrcp);
  // Two's-complement negation of xy*estimate, i.e. what is left of 2^32 after
  // removing xy*estimate (all arithmetic is modulo 2^32).
  const uint32_t rem = 0u - xy*estimate;
  // If another whole xy fits in the remainder, the estimate was one too small.
  return (xy <= rem) ? estimate + 1 : estimate;
}
|
||||
// 64-bit counterpart of imulRcp32: builds the reciprocal of x*y from the
// reciprocals of x and y without performing an integer division.
// A zero reciprocal on one side short-circuits to the other side's reciprocal.
static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
  if (xrcp == 0) return yrcp;
  if (yrcp == 0) return xrcp;
  const uint64_t xy = x*y;
  const uint64_t estimate = mul64hi(xrcp, yrcp);
  // Two's-complement negation of xy*estimate (all arithmetic is modulo 2^64).
  const uint64_t rem = uint64_t(0) - xy*estimate;
  // If another whole xy fits in the remainder, the estimate was one too small.
  return (xy <= rem) ? estimate + 1 : estimate;
}
|
||||
|
||||
// Fast 32-bit division by a divisor `y` whose reciprocal `yrcp` was computed
// ahead of time: on return *quo == x / y and *rem == x % y.
// A reciprocal of 0 is handled specially: the quotient is x and the remainder
// is 0 (idivRcp32(1) evaluates to 0).
static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
  if (yrcp == 0) {
    *quo = x;
    *rem = 0;
    return;
  }
  // Estimate the quotient with a high multiply, then correct it by at most one
  // step if the resulting remainder is still >= y.
  uint32_t quotient = mul32hi(x, yrcp);
  uint32_t remainder = x - y*quotient;
  if (remainder >= y) {
    quotient += 1;
    remainder -= y;
  }
  *quo = quotient;
  *rem = remainder;
}
|
||||
// Fast 64-bit division by a divisor `y` whose reciprocal `yrcp` was computed
// ahead of time: on return *quo == x / y and *rem == x % y.
// A reciprocal of 0 is handled specially: the quotient is x and the remainder
// is 0.
static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
  if (yrcp == 0) {
    *quo = x;
    *rem = 0;
    return;
  }
  // Estimate the quotient with a high multiply, then correct it by at most one
  // step if the resulting remainder is still >= y.
  uint64_t quotient = mul64hi(x, yrcp);
  uint64_t remainder = x - y*quotient;
  if (remainder >= y) {
    quotient += 1;
    remainder -= y;
  }
  *quo = quotient;
  *rem = remainder;
}
|
||||
|
||||
// Returns only the quotient computed by idivmodFast32 for x divided by y,
// where `yrcp` is y's precomputed reciprocal.
static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient = 0;
  uint32_t remainder = 0;  // computed by the helper but not needed here
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  return quotient;
}
|
||||
// Returns only the quotient computed by idivmodFast64 for x divided by y,
// where `yrcp` is y's precomputed reciprocal.
// The return type is uint64_t: the quotient of a 64-bit division can exceed
// 32 bits (e.g. any x >= 2^32 with y == 1), and the previous uint32_t return
// type silently dropped the upper half. Callers that store the result in a
// 32-bit variable still get the same narrowed value as before.
static __host__ __device__ uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t quotient = 0;
  uint64_t remainder = 0;  // computed by the helper but not needed here
  idivmodFast64(&quotient, &remainder, x, y, yrcp);
  return quotient;
}
|
||||
|
||||
// Returns only the remainder computed by idivmodFast32 for x divided by y,
// where `yrcp` is y's precomputed reciprocal.
static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient = 0;   // computed by the helper but not needed here
  uint32_t remainder = 0;
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  return remainder;
}
|
||||
// Returns only the remainder computed by idivmodFast64 for x divided by y,
// where `yrcp` is y's precomputed reciprocal.
// The return type is uint64_t: the remainder can be as large as y - 1, which
// does not fit in 32 bits once y exceeds 2^32, and the previous uint32_t
// return type silently dropped the upper half. Callers that store the result
// in a 32-bit variable still get the same narrowed value as before.
static __host__ __device__ uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t quotient = 0;   // computed by the helper but not needed here
  uint64_t remainder = 0;
  idivmodFast64(&quotient, &remainder, x, y, yrcp);
  return remainder;
}
|
||||
|
||||
template<typename Int>
|
||||
static __host__ __device__ int countOneBits(Int x) {
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(unsigned int)) {
|
||||
return __popc((unsigned int)x);
|
||||
@@ -83,7 +223,7 @@ inline __host__ __device__ int countOneBits(Int x) {
|
||||
|
||||
// Returns index of first one bit or returns -1 if mask is zero.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int firstOneBit(Int mask) {
|
||||
static __host__ __device__ int firstOneBit(Int mask) {
|
||||
int i;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
@@ -108,14 +248,14 @@ inline __host__ __device__ int firstOneBit(Int mask) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int popFirstOneBit(Int* mask) {
|
||||
static __host__ __device__ int popFirstOneBit(Int* mask) {
|
||||
Int tmp = *mask;
|
||||
*mask &= *mask-1;
|
||||
return firstOneBit(tmp);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int log2Down(Int x) {
|
||||
static __host__ __device__ int log2Down(Int x) {
|
||||
int w, n;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
@@ -147,7 +287,7 @@ inline __host__ __device__ int log2Down(Int x) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int log2Up(Int x) {
|
||||
static __host__ __device__ int log2Up(Int x) {
|
||||
int w, n;
|
||||
if (x != 0) x -= 1;
|
||||
#if __CUDA_ARCH__
|
||||
@@ -180,19 +320,19 @@ inline __host__ __device__ int log2Up(Int x) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int pow2Up(Int x) {
|
||||
static __host__ __device__ Int pow2Up(Int x) {
|
||||
return Int(1)<<log2Up(x);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int pow2Down(Int x) {
|
||||
static __host__ __device__ Int pow2Down(Int x) {
|
||||
// True, log2Down can return -1, but we don't normally pass 0 as an argument...
|
||||
// coverity[negative_shift]
|
||||
return Int(1)<<log2Down(x);
|
||||
}
|
||||
|
||||
template<typename UInt, int nSubBits>
|
||||
inline __host__ __device__ UInt reverseSubBits(UInt x) {
|
||||
static __host__ __device__ UInt reverseSubBits(UInt x) {
|
||||
if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) {
|
||||
switch (8*sizeof(UInt)) {
|
||||
case 16: x = __builtin_bswap16(x); break;
|
||||
@@ -225,7 +365,7 @@ template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned lon
|
||||
|
||||
// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
static __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
using UInt = typename ncclToUnsigned<Int>::type;
|
||||
union { UInt ux; Int sx; };
|
||||
sx = x;
|
||||
@@ -249,7 +389,7 @@ inline __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
// has nearly the full range of uint32_t except it only keeps the top 3 bits
|
||||
// beneath the leading 1 bit and thus has a max value of 0xf0000000.
|
||||
|
||||
inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
int log2x;
|
||||
#if __CUDA_ARCH__
|
||||
log2x = 31-__clz(x|1);
|
||||
@@ -261,7 +401,7 @@ inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
return exponent<<bitsPerPow2 | mantissa;
|
||||
}
|
||||
|
||||
inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
static __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
uint32_t exponent = x>>bitsPerPow2;
|
||||
uint32_t mantissa = (x & ((1u<<bitsPerPow2)-1)) | (exponent!=0 ? 0x8 : 0);
|
||||
if (exponent != 0) exponent -= 1;
|
||||
@@ -270,16 +410,16 @@ inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
|
||||
constexpr uint32_t u32fp8MaxValue() { return 0xf0000000; }
|
||||
|
||||
inline __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
|
||||
static __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
|
||||
return u32fpEncode(x, 3);
|
||||
}
|
||||
inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
|
||||
static __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
|
||||
return u32fpDecode(x, 3);
|
||||
}
|
||||
|
||||
// The hash isn't just a function of the bytes but also where the bytes are split
|
||||
// into different calls to eatHash().
|
||||
inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
|
||||
static __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
|
||||
char const* ptr = (char const*)bytes;
|
||||
acc[0] ^= size;
|
||||
while (size != 0) {
|
||||
@@ -302,11 +442,11 @@ inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
|
||||
static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
|
||||
eatHash(acc, (const void*)bytes, sizeof(T));
|
||||
}
|
||||
|
||||
inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
uint64_t h = acc[0];
|
||||
h ^= h >> 31;
|
||||
h *= 0xbac3bd562846de6b;
|
||||
@@ -316,13 +456,13 @@ inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
return h;
|
||||
}
|
||||
|
||||
inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
|
||||
static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
|
||||
uint64_t acc[2] = {1, 1};
|
||||
eatHash(acc, bytes, size);
|
||||
return digestHash(acc);
|
||||
}
|
||||
template<typename T>
|
||||
inline __host__ __device__ uint64_t getHash(const T* bytes) {
|
||||
static __host__ __device__ uint64_t getHash(const T* bytes) {
|
||||
return getHash((const void*)bytes, sizeof(T));
|
||||
}
|
||||
|
||||
|
||||
+48
-18
@@ -19,6 +19,7 @@
|
||||
#include "graph.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "profiler.h"
|
||||
#include "allocator.h"
|
||||
#include "latency_profiler/CollTrace.h"
|
||||
#include "rccl_common.h"
|
||||
#include "recorder.h"
|
||||
@@ -140,7 +141,6 @@ struct ncclSharedResources {
|
||||
int* tpRankToLocalRank;
|
||||
// Internal streams
|
||||
struct ncclStrongStream deviceStream, hostStream;
|
||||
int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
|
||||
int persistentRefs;
|
||||
cudaEvent_t launchEvent, scratchEvent;
|
||||
|
||||
@@ -229,6 +229,7 @@ struct ncclTaskColl {
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
void* eventHandle;
|
||||
uint8_t nChannels;
|
||||
};
|
||||
struct ncclTaskP2p {
|
||||
struct ncclTaskP2p* next;
|
||||
@@ -243,6 +244,7 @@ struct ncclTaskP2p {
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
void* eventHandle;
|
||||
uint8_t nChannels;
|
||||
};
|
||||
|
||||
struct ncclKernelPlan {
|
||||
@@ -255,10 +257,14 @@ struct ncclKernelPlan {
|
||||
|
||||
bool persistent; // aka captured in a graph
|
||||
bool isHostCbEnq;
|
||||
bool isSymColl;
|
||||
enum ncclDevWorkStorageType workStorageType;
|
||||
bool kernelSpecialized;
|
||||
void *kernelFn;
|
||||
struct ncclDevKernelArgs* kernelArgs;
|
||||
void* kernelFn;
|
||||
union {
|
||||
struct ncclDevKernelArgs* kernelArgs;
|
||||
struct ncclSymDevArgs* kernelSymArgs;
|
||||
};
|
||||
size_t kernelArgsSize;
|
||||
struct channelMasks channelMask;
|
||||
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
|
||||
@@ -367,6 +373,7 @@ struct ncclKernelPlanner {
|
||||
struct Peer* peers/*[nRanks]*/;
|
||||
int nTasksColl, nTasksP2p;
|
||||
bool persistent;
|
||||
bool isSymColl;
|
||||
|
||||
// The list of user streams aggregated over all tasks present.
|
||||
struct ncclCudaStreamList* streams;
|
||||
@@ -430,12 +437,19 @@ struct ncclPeerInfo {
|
||||
int64_t busId;
|
||||
struct ncclComm* comm;
|
||||
int cudaCompCap;
|
||||
size_t totalGlobalMem;
|
||||
// MNNVL support
|
||||
nvmlGpuFabricInfoV_t fabricInfo;
|
||||
int cuMemSupport;
|
||||
int version;
|
||||
};
|
||||
|
||||
typedef enum ncclGroupTaskType {
|
||||
ncclGroupTaskTypeCollective = 0,
|
||||
ncclGroupTaskTypeSymRegister = 1,
|
||||
ncclGroupTaskTypeNum = 2,
|
||||
} ncclGroupTaskType_t;
|
||||
|
||||
struct ncclComm {
|
||||
uint64_t startMagic;
|
||||
struct ncclMemoryStack memPermanent, memScoped;
|
||||
@@ -452,9 +466,10 @@ struct ncclComm {
|
||||
struct ncclTopoSystem* topo;
|
||||
struct ncclProxyConnector* gproxyConn;
|
||||
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
|
||||
bool peerInfoValid;
|
||||
|
||||
int netPluginLoaded;
|
||||
ncclNet_t* ncclNet;
|
||||
int netPluginIndex;
|
||||
int ncclNetVer;
|
||||
ncclNetDeviceType netDeviceType;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
@@ -471,7 +486,6 @@ struct ncclComm {
|
||||
|
||||
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
|
||||
const char* commName;
|
||||
uint64_t commHash;
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
@@ -556,6 +570,7 @@ struct ncclComm {
|
||||
|
||||
// Device side of the communicator (for cudaFree's)
|
||||
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
|
||||
struct ncclSymDevComm symDevComm;
|
||||
|
||||
uint32_t workArgsBytes; // max size of kernel args
|
||||
uint32_t workFifoBytes; // size of workFifoBuf, power of 2
|
||||
@@ -563,12 +578,10 @@ struct ncclComm {
|
||||
void* workFifoBufDev;
|
||||
void* workFifoBufGdrHandle;
|
||||
|
||||
// Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory.
|
||||
uint32_t* workFifoConsumed/*[MAXCHANNELS]*/;
|
||||
// Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS)
|
||||
uint32_t workFifoConsumedLeast;
|
||||
// Monotonic number of bytes (mod 1<<32) sent to fifo.
|
||||
uint32_t workFifoProduced;
|
||||
uint32_t workFifoProducedLastRecorded;
|
||||
uint32_t workFifoConsumed;
|
||||
|
||||
// Intra-process sync
|
||||
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
|
||||
@@ -584,10 +597,8 @@ struct ncclComm {
|
||||
struct ncclProxyState* proxyState;
|
||||
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
bool isOneRPN;
|
||||
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
|
||||
bool intraNodeP2pSupport;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
int* collNetDenseToUserRank;
|
||||
@@ -609,7 +620,7 @@ struct ncclComm {
|
||||
|
||||
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
|
||||
// this comm is not yet in a group.
|
||||
struct ncclComm* groupNext;
|
||||
struct ncclComm* groupNext[ncclGroupTaskTypeNum];
|
||||
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
|
||||
struct ncclComm* preconnectNext;
|
||||
int localPersistentRefs; // number of persistent plan-lists capturing this comm
|
||||
@@ -631,6 +642,7 @@ struct ncclComm {
|
||||
ncclUserRedOp *userRedOps;
|
||||
|
||||
// Queue of things for the main thread to do
|
||||
int reclaimSteps;
|
||||
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
|
||||
|
||||
hipEvent_t doneEvent;
|
||||
@@ -670,6 +682,9 @@ struct ncclComm {
|
||||
// group job to support multi-thread FT
|
||||
struct ncclGroupJob *groupJob;
|
||||
|
||||
// Flag indicating if this communicator shares resources with parent or children
|
||||
bool shareResources;
|
||||
|
||||
// Tuning plugin
|
||||
int tunerPluginLoaded;
|
||||
ncclTuner_t* tuner;
|
||||
@@ -683,16 +698,25 @@ struct ncclComm {
|
||||
// buffer registration cache
|
||||
struct ncclRegCache regCache;
|
||||
int isAllNvlink;
|
||||
bool isAllDirectP2p;
|
||||
int symmetricSupport;
|
||||
bool useNetPXN;
|
||||
bool useGdr;
|
||||
int splitCount;
|
||||
|
||||
// symmetric buffer
|
||||
uint8_t* baseUCSymPtr;
|
||||
uint8_t* baseMCSymPtr;
|
||||
size_t baseStride;
|
||||
size_t symAllocHead;
|
||||
CUmemGenericAllocationHandle symMCHandle;
|
||||
struct ncclIntruQueue<struct ncclSymRegTask, &ncclSymRegTask::next> symRegTaskQueue;
|
||||
|
||||
// Unroll factor for comm [RCCL]
|
||||
int unroll;
|
||||
|
||||
// custom collective
|
||||
// custom collective [RCCL]
|
||||
bool enableCustColl;
|
||||
|
||||
|
||||
uint64_t endMagic;
|
||||
};
|
||||
|
||||
@@ -724,15 +748,21 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
|
||||
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bool waitSome) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
while (true) {
|
||||
struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
|
||||
if (cb == nullptr) break;
|
||||
cudaError_t ok = cudaEventSynchronize(cb->event);
|
||||
if (ok == cudaErrorNotReady) break;
|
||||
cudaError_t ok;
|
||||
if (waitSome) {
|
||||
ok = cudaEventSynchronize(cb->event);
|
||||
waitSome = false;
|
||||
} else {
|
||||
ok = cudaEventQuery(cb->event);
|
||||
if (ok == cudaErrorNotReady) break;
|
||||
}
|
||||
ncclIntruQueueDequeue(&comm->eventCallbackQueue);
|
||||
if (ok == cudaSuccess) {
|
||||
NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
|
||||
|
||||
@@ -58,4 +58,29 @@ static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Formats the CPUs set in `mask` as a comma-separated list of single CPUs and
// inclusive ranges (for example "0-2,5,7-8") into `str`.
//
//   mask  CPU set to describe.
//   str   Destination buffer of at least `len` bytes.
//   len   Capacity of `str` in bytes, including the terminating NUL.
//
// Returns `str`. An empty mask yields an empty string. If the buffer is too
// small the text is cut off where the space ran out (possibly in the middle of
// an entry) but is always NUL-terminated. With a null `str` or `len == 0`
// nothing is written at all.
static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
  // Without room for even the terminator there is nothing safe to write.
  if (str == nullptr || len == 0) return str;
  str[0] = '\0';

  size_t used = 0;   // characters written so far, excluding the terminator
  int start = -1;    // first CPU of the range currently open, or -1 if none
  // Visit one position past the last CPU so a range that extends to the end
  // of the set is still closed and emitted.
  for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
    int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask);
    if (isSet) {
      // Start of a new range.
      if (start == -1) start = cpu;
      continue;
    }
    if (start == -1) continue;

    // End of a range: emit it, preceded by a comma unless it is the first one.
    const char* sep = used ? "," : "";
    const size_t room = len - used;
    int written;
    if (cpu-1 == start) {
      written = snprintf(str+used, room, "%s%d", sep, start);
    } else {
      written = snprintf(str+used, room, "%s%d-%d", sep, start, cpu-1);
    }
    if (written < 0) {
      // Formatting failed: keep what was produced before this entry.
      str[used] = '\0';
      break;
    }
    // snprintf reports the length it wanted; >= room means it was truncated.
    if ((size_t)written >= room) break;
    used += (size_t)written;
    // Only the terminator fits from here on, so no further entry can be added.
    if (used + 1 >= len) break;
    start = -1;
  }
  return str;
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -36,6 +36,10 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUCALL(cmd) do { \
|
||||
pfn_##cmd; \
|
||||
} while(false)
|
||||
|
||||
#define CUCHECKGOTO(cmd, res, label) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
@@ -66,49 +70,49 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 11040);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel, 4000);
|
||||
#if CUDART_VERSION >= 11080
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx, 11060);
|
||||
#endif
|
||||
// cuMem API support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemMap);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle, 10020);
|
||||
#if CUDA_VERSION >= 11070
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
||||
#endif
|
||||
#if CUDA_VERSION >= 12010
|
||||
/* NVSwitch Multicast support */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include "nccl_common.h"
|
||||
#include "bitops.h"
|
||||
#include "symmetric.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit_struct.h"
|
||||
#endif
|
||||
@@ -41,6 +42,30 @@ extern const char* funcNames[];
|
||||
#define NCCL_CUDA_ARCH 0
|
||||
#endif
|
||||
|
||||
#ifdef __CUDA_ARCH_SPECIFIC__
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC __CUDA_ARCH_SPECIFIC__
|
||||
#elif defined(__CUDA_ARCH_HAS_FEATURE__)
|
||||
#if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 900
|
||||
#elif __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 1000
|
||||
#elif __CUDA_ARCH_HAS_FEATURE__(SM101_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 1010
|
||||
#elif __CUDA_ARCH_HAS_FEATURE__(SM120_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 1200
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 0
|
||||
#endif
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 0
|
||||
#endif
|
||||
|
||||
#ifdef __CUDA_ARCH_FAMILY_SPECIFIC__
|
||||
#define NCCL_CUDA_ARCH_FAMILY_SPECIFIC __CUDA_ARCH_FAMILY_SPECIFIC__
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0
|
||||
#endif
|
||||
|
||||
#include "net_device.h"
|
||||
|
||||
enum ncclDevRedOp_t {
|
||||
@@ -516,6 +541,14 @@ struct alignas(16) ncclDevChannel {
|
||||
uint64_t workCounter;
|
||||
};
|
||||
|
||||
#define MAX_PROFILER_EVENTS_PER_CHANNEL 64
|
||||
// Fixed-size table of MAX_PROFILER_EVENTS_PER_CHANNEL profiler records, each
// pairing a counter with a timestamp. ncclDevComm::workStarted and
// ncclDevComm::workCompleted point at arrays of this struct (annotated
// [MAXCHANNELS] at their declarations).
// NOTE(review): presumably one instance per channel, with data[] used as a
// ring indexed by the work counter — confirm against the device/proxy code.
struct ncclDevProfiler {
|
||||
struct {
|
||||
uint64_t counter; // counter value stored with this record (presumably the channel work counter — TODO confirm)
|
||||
uint64_t timestamp; // timestamp stored with this record (clock source not visible here)
|
||||
} data[MAX_PROFILER_EVENTS_PER_CHANNEL];
|
||||
};
|
||||
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
@@ -526,9 +559,6 @@ struct ncclDevComm {
|
||||
int isAllNvlink;
|
||||
int p2pnChannelsPerPeer;
|
||||
|
||||
// Work fifo return credits
|
||||
uint32_t* workConsumed/*[MAXCHANNELS]*/;
|
||||
|
||||
int* collNetDenseToUserRank;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
@@ -540,8 +570,8 @@ struct ncclDevComm {
|
||||
int* rankToLocalRank;
|
||||
|
||||
// Profiler counters
|
||||
uint64_t* workStarted/*[MAXCHANNELS]*/;
|
||||
uint64_t* workCompleted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKitEventCollectContext* npKitEventCollectContexts;
|
||||
@@ -641,7 +671,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int
|
||||
|
||||
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
|
||||
// Our collective unroll should move to the same bytes&insns model as NVLS.
|
||||
return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4;
|
||||
return cudaArch >= 800 ? (cudaArch / 100 == 12 ? 6 : 8) : 4;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
|
||||
@@ -672,7 +702,6 @@ extern int const ncclDevKernelCount;
|
||||
extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
|
||||
|
||||
// Table of most specialized kernel function to run given func index.
|
||||
extern int const ncclDevFuncIdCount;
|
||||
extern int const ncclDevFuncRowToId[];
|
||||
extern void* const ncclDevKernelForFunc[/*funcIndex*/];
|
||||
extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
|
||||
|
||||
@@ -51,6 +51,8 @@ int ncclPxnDisable(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
||||
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
|
||||
|
||||
ncclResult_t ncclGetUserP2pLevel(int* level);
|
||||
|
||||
#define MAX_XGMI_INTER_GPUS 4
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
|
||||
@@ -81,7 +83,9 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
|
||||
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
|
||||
|
||||
// Allows for up to 32 NICs per node on GB200-NVL72
|
||||
#define NCCL_TOPO_MAX_NODES 64
|
||||
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType);
|
||||
|
||||
// Init search. Needs to be done before calling ncclTopoCompute
|
||||
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
||||
|
||||
+27
-41
@@ -10,9 +10,11 @@
|
||||
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
#include "allocator.h"
|
||||
#include "register.h"
|
||||
|
||||
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
|
||||
void ncclGroupCommJoin(struct ncclComm* comm);
|
||||
void ncclGroupCommJoin(struct ncclComm* comm, int type);
|
||||
void ncclGroupCommPreconnect(struct ncclComm* comm);
|
||||
ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
|
||||
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob);
|
||||
@@ -53,13 +55,14 @@ ncclResult_t ncclAsyncLaunch(
|
||||
|
||||
struct ncclGroupJob {
|
||||
struct ncclAsyncJob base;
|
||||
struct ncclComm **groupCommHeadPtr;
|
||||
struct ncclComm **groupCommPreconnectHeadPtr;
|
||||
ncclResult_t *groupErrorPtr;
|
||||
bool *abortFlagPtr;
|
||||
int *groupBlockingPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
|
||||
bool initialized;
|
||||
int groupRefCount;
|
||||
bool nonBlockingInit;
|
||||
bool joined;
|
||||
struct ncclComm *groupCommHead[ncclGroupTaskTypeNum];
|
||||
struct ncclComm *groupCommPreconnectHead;
|
||||
ncclResult_t groupError;
|
||||
bool abortFlag;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncJobs;
|
||||
};
|
||||
|
||||
ncclResult_t ncclGroupStartInternal();
|
||||
@@ -70,27 +73,9 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
|
||||
|
||||
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
|
||||
extern __thread ncclResult_t ncclGroupError;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum];
|
||||
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
|
||||
extern __thread int ncclGroupBlocking;
|
||||
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
|
||||
extern __thread struct ncclGroupJob ncclGroupJobMain;
|
||||
|
||||
static inline void groupResetJobState() {
|
||||
ncclGroupBlocking = -1;
|
||||
ncclGroupJobMainPtr = NULL;
|
||||
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
|
||||
return;
|
||||
}
|
||||
|
||||
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (job) {
|
||||
ret = ncclAsyncJobComplete(&job->base);
|
||||
groupResetJobState();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
|
||||
if (ncclGroupDepth > 0) {
|
||||
@@ -100,31 +85,32 @@ inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
|
||||
}
|
||||
|
||||
// Add comm to this thread's group
|
||||
inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
inline void ncclGroupCommJoin(struct ncclComm* comm, int type) {
|
||||
if (comm->groupNext[type] == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
|
||||
// the user's program order yet ensures siblings occur consecutively. This
|
||||
// is required by doLaunches() in "group.cc".
|
||||
struct ncclComm** pp = &ncclGroupCommHead;
|
||||
struct ncclComm** pp = &ncclGroupCommHead[type];
|
||||
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
|
||||
pp = &(*pp)->groupNext;
|
||||
pp = &(*pp)->groupNext[type];
|
||||
|
||||
// didn't find its clique, we need to insert it with ascending order based on commHash
|
||||
if (*pp == nullptr) {
|
||||
pp = &ncclGroupCommHead;
|
||||
while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext;
|
||||
pp = &ncclGroupCommHead[type];
|
||||
while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext[type];
|
||||
}
|
||||
comm->groupNext = *pp;
|
||||
comm->groupNext[type] = *pp;
|
||||
*pp = comm;
|
||||
// Comms gets a new memory stack scope upon joining. Each task batched for
|
||||
// this comm is allocated there.
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
// Initialize planner
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
if (type == ncclGroupTaskTypeCollective) {
|
||||
// Initialize planner
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
ncclGroupBlocking = comm->config.blocking;
|
||||
}
|
||||
|
||||
@@ -137,8 +123,8 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
// Comm has left group
|
||||
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm, int type) {
|
||||
comm->groupNext[type] = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
ncclMemoryStackPop(&comm->memScoped);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
#ifndef NCCL_MLX5DV_CORE_H_
|
||||
#define NCCL_MLX5DV_CORE_H_
|
||||
|
||||
/* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without
|
||||
* explicitly including the MLX5 direct verbs header.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include "ibvwrap.h"
|
||||
|
||||
enum mlx5dv_reg_dmabuf_access {
|
||||
MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT = (1<<0),
|
||||
};
|
||||
|
||||
#endif // NCCL_MLX5DV_CORE_H_
|
||||
@@ -0,0 +1,23 @@
|
||||
#ifndef NCCL_MLX5DV_SYMBOLS_H_
|
||||
#define NCCL_MLX5DV_SYMBOLS_H_
|
||||
|
||||
#ifdef NCCL_BUILD_MLX5DV
|
||||
#include <infiniband/mlx5dv.h>
|
||||
#else
|
||||
#include "mlx5/mlx5dvcore.h"
|
||||
#endif
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
/* MLX5 Direct Verbs Function Pointers*/
|
||||
struct ncclMlx5dvSymbols {
|
||||
bool (*mlx5dv_internal_is_supported)(struct ibv_device *device);
|
||||
int (*mlx5dv_internal_get_data_direct_sysfs_path)(struct ibv_context *context, char *buf, size_t buf_len);
|
||||
/* DMA-BUF support */
|
||||
struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
|
||||
};
|
||||
|
||||
/* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */
|
||||
ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols);
|
||||
|
||||
#endif // NCCL_MLX5DV_SYMBOLS_H_
|
||||
@@ -0,0 +1,41 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
|
||||
* Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_MLX5DVWRAP_H_
|
||||
#define NCCL_MLX5DVWRAP_H_
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <netinet/in.h>
|
||||
#ifdef NCCL_BUILD_MLX5DV
|
||||
#include <infiniband/mlx5dv.h>
|
||||
#else
|
||||
#include "mlx5/mlx5dvcore.h"
|
||||
#endif
|
||||
|
||||
#include "core.h"
|
||||
#include "ibvwrap.h"
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
typedef enum mlx5dv_return_enum
|
||||
{
|
||||
MLX5DV_SUCCESS = 0, //!< The operation was successful
|
||||
} mlx5dv_return_t;
|
||||
|
||||
ncclResult_t wrap_mlx5dv_symbols(void);
|
||||
/* NCCL wrappers of MLX5 direct verbs functions */
|
||||
bool wrap_mlx5dv_is_supported(struct ibv_device *device);
|
||||
ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
|
||||
struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
|
||||
|
||||
#endif // NCCL_MLX5DVWRAP_H_
|
||||
@@ -7,6 +7,9 @@
|
||||
#ifndef NCCL_DEBUG_H_
|
||||
#define NCCL_DEBUG_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include "nccl.h"
|
||||
|
||||
typedef enum {
|
||||
NCCL_LOG_NONE = 0,
|
||||
NCCL_LOG_VERSION = 1,
|
||||
@@ -39,6 +42,16 @@ typedef enum {
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
// NCCL core profiler callback for network defined events instrumentation
|
||||
enum {
|
||||
ncclProfilerNetEventStart = 0,
|
||||
ncclProfilerNetEventStop,
|
||||
ncclProfilerNetEventUpdate,
|
||||
ncclProfilerNetEventUpdateAndStop,
|
||||
};
|
||||
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
|
||||
typedef enum {
|
||||
ncclFuncBroadcast = 0,
|
||||
@@ -54,7 +67,7 @@ typedef enum {
|
||||
ncclNumFuncs = 10
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
|
||||
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
|
||||
@@ -14,8 +14,6 @@
|
||||
|
||||
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
|
||||
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetFinalize(struct ncclComm* comm);
|
||||
|
||||
|
||||
@@ -37,10 +37,11 @@
|
||||
#define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommSplit 18
|
||||
#define NVTX_SID_CommFinalize 19
|
||||
#define NVTX_SID_CommShrink 20
|
||||
// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below!
|
||||
|
||||
// Define static schema ID for the reduction operation.
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 20 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 21 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
|
||||
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
|
||||
|
||||
|
||||
@@ -70,6 +70,16 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static cons
|
||||
)
|
||||
)
|
||||
|
||||
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommShrink, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(int, nranks, TYPE_INT, nccl_nvtxNranksStr),
|
||||
(int, myrank, TYPE_INT, nccl_nvtxRankStr),
|
||||
(int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr),
|
||||
(int, num_exclude, TYPE_INT, "num_exclude")
|
||||
)
|
||||
)
|
||||
|
||||
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr)
|
||||
|
||||
@@ -29,10 +29,9 @@
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
// Max number of ncclNet objects which can live in the same process
|
||||
#define NCCL_NET_MAX_PLUGINS 3
|
||||
|
||||
// NCCL core profiler callback for network defined events instrumentation
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
|
||||
#ifndef NCCL_NET_MAX_PLUGINS
|
||||
#define NCCL_NET_MAX_PLUGINS 16
|
||||
#endif
|
||||
|
||||
#include "net/net_v10.h"
|
||||
#include "net/net_v9.h"
|
||||
|
||||
@@ -19,43 +19,53 @@ enum {
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
ncclProfilerProxyOpSendPosted = 0, // deprecated in v4
|
||||
ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4
|
||||
ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4
|
||||
ncclProfilerProxyOpSendDone = 3, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4
|
||||
ncclProfilerProxyOpInProgress_v4 = 19,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
ncclProfilerProxyStepSendGPUWait = 8,
|
||||
ncclProfilerProxyStepSendPeerWait_v4 = 20,
|
||||
ncclProfilerProxyStepSendWait = 9,
|
||||
ncclProfilerProxyStepRecvWait = 10,
|
||||
ncclProfilerProxyStepRecvFlushWait = 11,
|
||||
ncclProfilerProxyStepRecvGPUWait = 12,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup,
|
||||
ncclProfilerProxyCtrlAppend,
|
||||
ncclProfilerProxyCtrlAppendEnd,
|
||||
ncclProfilerProxyCtrlIdle = 13,
|
||||
ncclProfilerProxyCtrlActive = 14,
|
||||
ncclProfilerProxyCtrlSleep = 15,
|
||||
ncclProfilerProxyCtrlWakeup = 16,
|
||||
ncclProfilerProxyCtrlAppend = 17,
|
||||
ncclProfilerProxyCtrlAppendEnd = 18,
|
||||
|
||||
/* Network defined event states */
|
||||
ncclProfilerNetPluginUpdate = 21,
|
||||
|
||||
/* Kernel event states */
|
||||
ncclProfilerKernelChStop = 22,
|
||||
} ncclProfilerEventState_t;
|
||||
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
|
||||
|
||||
#include <cstdint>
|
||||
#include "profiler/profiler_v4.h"
|
||||
#include "profiler/profiler_v3.h"
|
||||
#include "profiler/profiler_v2.h"
|
||||
#include "profiler/profiler_v1.h"
|
||||
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v4_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
#define NCCL_PROFILER_NET_VER_BITS (16)
|
||||
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V4_H_
|
||||
#define PROFILER_V4_H_
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nChannels;
|
||||
uint8_t nWarps;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
uint8_t nChannels;
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
uint8_t channelId;
|
||||
uint64_t pTimer; // start timestamp from GPU globaltimer
|
||||
} kernelCh;
|
||||
|
||||
struct {
|
||||
int64_t id;
|
||||
void* data;
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v4_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
|
||||
struct {
|
||||
void* data;
|
||||
} netPlugin;
|
||||
|
||||
struct {
|
||||
uint64_t pTimer;
|
||||
} kernelCh;
|
||||
} ncclProfilerEventStateArgs_v4_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// - commName : user assigned communicator name
|
||||
// - commHash : communicator id
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v4_t;
|
||||
|
||||
#endif
|
||||
@@ -21,8 +21,8 @@ struct ncclProxyConnector;
|
||||
|
||||
struct ncclProfilerProxy {
|
||||
bool initialized;
|
||||
uint64_t* workStarted/*[MAXCHANNELS]*/;
|
||||
uint64_t* workCompleted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
|
||||
uint64_t workCounter[MAXCHANNELS]; // host work counter
|
||||
struct ncclProxyConnector sendProxyConn[MAXCHANNELS];
|
||||
struct ncclProxyConnector recvProxyConn[MAXCHANNELS];
|
||||
@@ -43,8 +43,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
|
||||
|
||||
// Proxy Op Start/Stop Event Wrappers
|
||||
ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
|
||||
// Proxy Step Start/Stop Event Wrappers
|
||||
@@ -57,11 +56,11 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand
|
||||
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
|
||||
|
||||
// Kernel Channel Start/Stop Event Wrappers
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s);
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s);
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start);
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop);
|
||||
|
||||
// Record Event Wrappers
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
|
||||
|
||||
|
||||
@@ -118,6 +118,13 @@ struct ncclProxyOp {
|
||||
facebook_rccl::ProxyTraceExtraInfo traceInfo;
|
||||
};
|
||||
|
||||
struct ncclProxySubArgs;
|
||||
|
||||
// Pairs an opaque event handle with a pointer to a ncclProxySubArgs.
// ncclProxySubArgs holds an array of these: pHandles[NCCL_STEPS].
// NOTE(review): presumably one entry per in-flight proxy step, with subArgPtr
// pointing back at the owning sub-args — confirm against the profiler code.
struct ncclProxyEventHandle {
|
||||
void* stepEventHandle; // opaque handle (presumably the profiler's proxy-step event — TODO confirm)
|
||||
struct ncclProxySubArgs* subArgPtr; // associated proxy sub-args (forward-declared above)
|
||||
};
|
||||
|
||||
struct ncclProxySubArgs {
|
||||
struct ncclProxyConnection* connection;
|
||||
int reg;
|
||||
@@ -150,13 +157,12 @@ struct ncclProxySubArgs {
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
int rank;
|
||||
uint64_t profilerSteps;
|
||||
pid_t pid;
|
||||
void* profilerContext;
|
||||
void* taskEventHandle;
|
||||
void* opEventHandle;
|
||||
void* kernelEventHandle;
|
||||
void* stepEventHandles[NCCL_STEPS];
|
||||
struct ncclProxyEventHandle pHandles[NCCL_STEPS];
|
||||
size_t transSize;
|
||||
uint64_t workCounter;
|
||||
|
||||
@@ -254,6 +260,8 @@ struct ncclProxyPeer {
|
||||
};
|
||||
|
||||
struct ncclSharedNetComms {
|
||||
int activeConnect[MAXCHANNELS];
|
||||
int activeAccept[MAXCHANNELS];
|
||||
void* sendComm[MAXCHANNELS];
|
||||
void* recvComm[MAXCHANNELS];
|
||||
int sendRefCount[MAXCHANNELS];
|
||||
|
||||
@@ -29,18 +29,24 @@ struct ncclRegNetHandles {
|
||||
struct ncclRegNetHandles* next;
|
||||
};
|
||||
|
||||
// Node of a singly linked list (via `next`) describing one symmetric
// registration request. The payload fields mirror the parameters of
// ncclCommSymmetricRegisterInternal(comm, buff, baseSize, alignment,
// memHandle, regHandle) declared later in this header.
// NOTE(review): presumably queued and later replayed through that function —
// confirm against the implementation.
struct ncclSymRegTask {
|
||||
struct ncclSymRegTask *next; // next task in the list
|
||||
void* buff; // buffer to register
|
||||
size_t baseSize; // size passed as `baseSize` on registration
|
||||
CUmemGenericAllocationHandle memHandle; // allocation handle for the buffer
|
||||
struct ncclReg* regHandle; // registration record associated with this task
|
||||
size_t alignment; // alignment passed on registration
|
||||
};
|
||||
|
||||
struct ncclReg {
|
||||
// common attributes
|
||||
size_t pages;
|
||||
uintptr_t begAddr, endAddr; // page aligned
|
||||
int localRefs;
|
||||
int graphRefs;
|
||||
uintptr_t addr;
|
||||
uint32_t state;
|
||||
// net reg
|
||||
struct ncclRegNetHandles* netHandleHead;
|
||||
// nvls reg
|
||||
uintptr_t baseAddr;
|
||||
size_t baseSize;
|
||||
CUdeviceptr regAddr;
|
||||
size_t regUCSize, regMCSize;
|
||||
int dev;
|
||||
@@ -52,6 +58,10 @@ struct ncclReg {
|
||||
// general ipc reg
|
||||
struct ncclPeerRegIpcAddr regIpcAddrs;
|
||||
struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
|
||||
// symmetric reg
|
||||
void* baseSymPtr;
|
||||
size_t symSize;
|
||||
int winFlags;
|
||||
};
|
||||
|
||||
struct ncclRegCache {
|
||||
@@ -60,10 +70,14 @@ struct ncclRegCache {
|
||||
uintptr_t pageSize;
|
||||
};
|
||||
|
||||
// Thin wrapper holding a pointer to the ncclReg registration record.
// NOTE(review): presumably the object behind the public window handle type —
// confirm against nccl.h.
struct ncclWindow {
|
||||
struct ncclReg* handle; // underlying registration record
|
||||
};
|
||||
|
||||
ncclResult_t ncclRegCleanup(struct ncclComm* comm);
|
||||
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
|
||||
ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
|
||||
ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);
|
||||
ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
#ifndef NCCL_REGISTER_INLINE_H_
|
||||
#define NCCL_REGISTER_INLINE_H_
|
||||
|
||||
#include "comm.h"
|
||||
#include "register.h"
|
||||
|
||||
// Search the communicator's registration cache for a record that fully
// contains the byte range [data, data + size).
//
//   comm   - communicator whose regCache is searched
//   data   - start address of the user buffer
//   size   - length of the user buffer in bytes
//   outReg - receives the matching record, or NULL when none is found
//
// Always returns ncclSuccess; a miss is reported only through *outReg == NULL.
// The scan stops at the first slot whose begAddr lies above `data`, so it
// relies on the slots being ordered by ascending begAddr
// (NOTE(review): ordering is maintained elsewhere — confirm).
static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) {
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t const addr = (uintptr_t)data;
  *outReg = NULL;
  for (int i = 0; i != cache->population; i++) {
    struct ncclReg* candidate = cache->slots[i];
    // Every remaining slot starts past `data`: no record can contain it.
    if (addr < candidate->begAddr) break;
    // First record whose end covers the whole range wins.
    if (addr + size <= candidate->endAddr) {
      *outReg = candidate;
      break;
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
// Find the registration record covering [data, data + size) and, when that
// record has a symmetric mapping, translate `data` into it.
//
//   comm   - communicator whose registration cache is searched
//   data   - start address of the user buffer
//   size   - length of the user buffer in bytes
//   symPtr - receives baseSymPtr + (data - begAddr), or NULL when there is no
//            matching record or the record has no baseSymPtr
//   outReg - receives the matching record under the same conditions, else NULL
//
// Returns ncclSuccess unless ncclRegFind fails (propagated by NCCLCHECK).
static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) {
  struct ncclReg* regRecord = NULL;
  *symPtr = NULL;
  *outReg = NULL;
  // Pass the address of the local so ncclRegFind can fill it in.
  NCCLCHECK(ncclRegFind(comm, data, size, &regRecord));
  if (regRecord && regRecord->baseSymPtr) {
    // Same byte offset from the record's page-aligned start, applied to the
    // symmetric base pointer.
    *symPtr = (void*)((uintptr_t)regRecord->baseSymPtr + (uintptr_t)data - (uintptr_t)regRecord->begAddr);
    *outReg = regRecord;
  }
  return ncclSuccess;
}
|
||||
|
||||
#endif
|
||||
@@ -15,25 +15,35 @@ typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, voi
|
||||
typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
|
||||
typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
|
||||
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
#define CUPFN(symbol) symbol
|
||||
#else
|
||||
#define CUPFN(symbol) pfn_##symbol
|
||||
#endif
|
||||
|
||||
// Check CUDA PFN driver calls
|
||||
#define CUCHECK(cmd) do { \
|
||||
#define HSACHECK(cmd) do { \
|
||||
hsa_status_t err = pfn_##cmd; \
|
||||
if( err != HSA_STATUS_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
pfn_hsa_status_string(err, &errStr); \
|
||||
WARN("ROCr failure '%s'", errStr); \
|
||||
WARN("HIP failure '%s'", errStr); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Check CUDA PFN driver calls
|
||||
#define CUCHECK(cmd) do { \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUCHECKGOTO(cmd, res, label) do { \
|
||||
hsa_status_t err = pfn_##cmd; \
|
||||
if( err != HSA_STATUS_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
pfn_hsa_status_string(err, &errStr); \
|
||||
WARN("ROCr failure '%s'", errStr); \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
@@ -45,7 +55,7 @@ typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size
|
||||
if( err != HSA_STATUS_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
pfn_hsa_status_string(err, &errStr); \
|
||||
INFO(NCCL_ALL,"%s:%d ROCr failure '%s'", __FILE__, __LINE__, errStr); \
|
||||
INFO(NCCL_ALL,"%s:%d HIP failure '%s'", __FILE__, __LINE__, errStr); \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
|
||||
@@ -69,8 +69,10 @@ struct ncclSocket {
|
||||
|
||||
const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
||||
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
||||
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
||||
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
||||
ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
|
||||
union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found);
|
||||
ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
|
||||
int* nIfs);
|
||||
|
||||
// Initialize a socket
|
||||
ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0);
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
#ifndef NCCL_DEVICE_SYMMETRIC_H_
|
||||
#define NCCL_DEVICE_SYMMETRIC_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include "bitops.h"
|
||||
|
||||
// Dimension of the per-block arrays in ncclSymDevBase and the number of
// per-block LL buffer pairs accounted for by ncclSymDevBase::size().
constexpr int ncclSymMaxBlocks = 64;
|
||||
// Thread count used when sizing LL slots and LL epochs
// (see ncclSymLLMaxSlots and ncclSymLLEpochSize).
constexpr int ncclSymMaxThreads = 512;
|
||||
// Largest LL element size; also the default eltSize of ncclSymLLMaxSlots.
// NOTE(review): presumably expressed in bytes -- confirm.
constexpr int ncclSymLLMaxEltSize = 64;
|
||||
|
||||
// Number of LL slots for elements of size `eltSize`.
// Computed as (ncclSymMaxThreads * ncclSymLLMaxEltSize) / eltSize using integer
// division, so the default eltSize (ncclSymLLMaxEltSize) yields ncclSymMaxThreads.
// `eltSize` must be non-zero.
constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) {
  return (ncclSymLLMaxEltSize * ncclSymMaxThreads) / eltSize;
}
|
||||
|
||||
// Size of one LL epoch buffer for `nRanks` ranks.
// It is twice (the "LL overhead" factor) the larger of:
//   - ncclSymMaxThreads * nRanks * 8, and
//   - the maximum slot count at the maximum element size times that element size.
constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) {
  return maxval(8 * nRanks * ncclSymMaxThreads,
                ncclSymLLMaxEltSize * ncclSymLLMaxSlots(ncclSymLLMaxEltSize)) * /*LL Overhead*/2;
}
|
||||
|
||||
struct alignas(16) ncclSymDevBase {
|
||||
uint32_t llEpoch[ncclSymMaxBlocks];
|
||||
uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks];
|
||||
uint32_t barInboxMc[ncclSymMaxBlocks];
|
||||
uint32_t barInboxPerPeer[];
|
||||
|
||||
static constexpr size_t size(int nRanks) {
|
||||
return sizeof(ncclSymDevBase) +
|
||||
alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) +
|
||||
ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks);
|
||||
}
|
||||
};
|
||||
|
||||
// Returns the LL buffer belonging to `block` for the given `epoch`.
// The buffers live after the header struct and the 16-byte-padded
// barInboxPerPeer[] payload; each block owns two consecutive epoch buffers and
// the low bit of `epoch` selects which of the two is returned.
static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) {
  // All offset math is done in size_t so that block * 2 * epochSize cannot
  // overflow int for large nRanks.
  size_t epochSize = (size_t)ncclSymLLEpochSize(nRanks);
  // Skip over barInboxPerPeer[]
  size_t offset = alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16);
  // Skip to our block
  offset += (size_t)block * /*epochs=*/2 * epochSize;
  // Select the buffer for this epoch's parity
  offset += (size_t)(epoch & 1) * epochSize;
  // Offsets are relative to the first byte trailing the header struct.
  return (uint4*)((char*)(base + 1) + offset);
}
|
||||
|
||||
// Communicator description embedded in ncclSymDevArgs.
// NOTE(review): the per-field notes below are inferred from names only -- confirm
// against the code that fills in and reads this struct.
struct ncclSymDevComm {
|
||||
// Pointer to the ncclSymDevBase header.
ncclSymDevBase* base;
|
||||
// Second pointer to a ncclSymDevBase; presumably a multicast ("Mc") mapping -- confirm.
ncclSymDevBase* baseMc;
|
||||
// NOTE(review): presumably the per-rank address stride in 4 GiB units -- confirm.
uint32_t stride4G;
|
||||
// Rank count and this rank's index.
int nRanks, rank;
|
||||
uint32_t nRanks_rcp32; // idivRcp32(nRanks)
|
||||
};
|
||||
|
||||
// 16-byte-aligned argument bundle for the symmetric kernels.
// NOTE(review): presumably passed by value at kernel launch -- confirm.
struct alignas(16) ncclSymDevArgs {
|
||||
// Communicator description (see ncclSymDevComm).
struct ncclSymDevComm comm;
|
||||
// NOTE(review): presumably only meaningful for rooted collectives -- confirm.
int rootRank;
|
||||
uint64_t redOpArg; // must be collectively uniform
|
||||
// Element count of the operation.
size_t nElts;
|
||||
// Input and output buffers, kept as byte pointers.
char* input;
|
||||
char* output;
|
||||
};
|
||||
|
||||
// Identifiers of the symmetric kernels, grouped by collective. Values are
// implicit and consecutive from 0, so the enumerator order must not change
// without updating whatever indexes by these ids.
// NOTE(review): suffixes look like algorithm variants (LL, LD/ST, and an "MC"
// flavour of each) -- confirm the exact meaning with the kernel generator.
enum ncclSymKernelId {
|
||||
// AllReduce variants
ncclSymKernelId_AllReduce_AGxLL_R,
|
||||
ncclSymKernelId_AllReduce_AGxLLMC_R,
|
||||
ncclSymKernelId_AllReduce_RSxLD_AGxST,
|
||||
ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC,
|
||||
|
||||
// AllGather variants
ncclSymKernelId_AllGather_LL,
|
||||
ncclSymKernelId_AllGather_LLMC,
|
||||
ncclSymKernelId_AllGather_ST,
|
||||
ncclSymKernelId_AllGather_STMC,
|
||||
|
||||
// ReduceScatter variants
ncclSymKernelId_ReduceScatter_LL,
|
||||
ncclSymKernelId_ReduceScatter_LD,
|
||||
ncclSymKernelId_ReduceScatter_LDMC,
|
||||
|
||||
// Number of kernel ids above; must remain the last enumerator.
ncclSymKernelId_Count
|
||||
};
|
||||
|
||||
bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
|
||||
|
||||
ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps);
|
||||
|
||||
// Generated by src/device/symmetric/generate.py
|
||||
extern int const ncclSymKernelCount;
|
||||
extern void* const ncclSymKernelList[];
|
||||
void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
|
||||
const char* ncclSymKernelIdToString(int kernelId);
|
||||
|
||||
#endif
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
#include "proxy.h"
|
||||
#include "comm.h"
|
||||
#include "bootstrap.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
@@ -37,7 +38,15 @@ struct ncclConnector;
|
||||
struct ncclComm;
|
||||
|
||||
#define CHANNEL_MASK_OFFSET(nranks, connIndex) (nranks * (connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0))
|
||||
|
||||
#define CONNECT_SIZE 256
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
#define NCCL_MAX_PAGE_SIZE (512L * 1024L)
|
||||
#define NCCL_REC_PAGE_SIZE (4L * 1024L)
|
||||
#else
|
||||
#define NCCL_MAX_PAGE_SIZE (512L * 1024L * 1024L)
|
||||
#define NCCL_REC_PAGE_SIZE (2L * 1024L * 1024L)
|
||||
#endif
|
||||
// Fixed-size, untyped blob of CONNECT_SIZE bytes.
// NOTE(review): presumably carries transport-specific connection info that each
// transport reinterprets as its own struct -- confirm against the transports.
struct ncclConnect {
|
||||
char data[CONNECT_SIZE];
|
||||
};
|
||||
@@ -65,6 +74,7 @@ struct ncclNvlsSharedRes {
|
||||
char* ucBuff; // Unicast NVLS buffer address
|
||||
char* ucCredit; // Unicast NVLS credit address
|
||||
int nChannels;
|
||||
int nHeads;
|
||||
struct ncclShmemCollBuff nvlsShmem;
|
||||
void *nvlsShmemHandle;
|
||||
};
|
||||
@@ -104,7 +114,8 @@ struct ncclTransport {
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, bool* needsProxy=NULL);
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode);
|
||||
ncclResult_t ncclTransportIsAllDirectP2p(struct ncclComm* comm, int* isAllDirectP2p);
|
||||
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
|
||||
@@ -139,5 +150,15 @@ ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, siz
|
||||
ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
|
||||
ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
|
||||
ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
|
||||
ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels);
|
||||
|
||||
ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr);
|
||||
ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr);
|
||||
ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr);
|
||||
ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr);
|
||||
ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -44,6 +44,12 @@ static long log2i(long n) {
|
||||
return log2Down(n);
|
||||
}
|
||||
|
||||
// Three-way comparison of two ints, usable as a qsort/bsearch callback.
// `a` and `b` must each point to an int. Returns -1 when *a < *b, 1 when
// *a > *b and 0 when they are equal; no subtraction is involved, so the
// result is correct across the full int range.
static int compareInts(const void *a, const void *b) {
  const int lhs = *static_cast<const int*>(a);
  const int rhs = *static_cast<const int*>(b);
  if (lhs < rhs) return -1;
  if (lhs > rhs) return 1;
  return 0;
}
|
||||
|
||||
inline uint64_t clockNano() {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
|
||||
Ссылка в новой задаче
Block a user