Merge remote-tracking branch 'nccl/master' into develop

Этот коммит содержится в:
BertanDogancay
2025-08-28 15:45:42 -05:00
родитель a0ec15bafe 72d2432094
Коммит 08a7be231b
108 изменённых файлов: 7754 добавлений и 2129 удалений
+7 -8
Просмотреть файл
@@ -193,10 +193,9 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
#define MAX_ALLOC_TRACK_NGPU 128
extern struct allocationTracker allocTracker[];
#if CUDART_VERSION >= 11030
#if ROCM_VERSION >= 70000
#include <cuda.h>
#include "cudawrap.h"
#include "rocmwrap.h"
// ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
@@ -262,7 +261,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
prop.requestedHandleTypes = type;
prop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
// CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
@@ -318,21 +317,21 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
extern int ncclCuMemEnable();
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
WARN("CUMEM not supported prior to ROCm 7.0");
return ncclInternalError;
}
static inline ncclResult_t ncclCuMemFree(void *ptr) {
WARN("CUMEM not supported prior to CUDA 11.3");
WARN("CUMEM not supported prior to ROCm 7.0");
return ncclInternalError;
}
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
WARN("CUMEM not supported prior to ROCm 7.0");
return ncclInternalError;
}
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
WARN("CUMEM not supported prior to CUDA 11.3");
WARN("CUMEM not supported prior to ROCm 7.0");
return ncclInternalError;
}
#endif
+13
Просмотреть файл
@@ -0,0 +1,13 @@
/*************************************************************************
* Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ALLOCATOR_H_
#define NCCL_ALLOCATOR_H_
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr);
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr);
#endif
+11
Просмотреть файл
@@ -122,6 +122,10 @@ typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);
typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);
typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm, int* excludeRanksList,
int excludeRanksCount, ncclComm_t *newcomm,
ncclConfig_t* config, int shrinkFlags);
typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key,
ncclComm_t* newcomm, ncclConfig_t* config);
@@ -158,6 +162,10 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,
typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);
typedef struct rcclApiFuncTable
{
uint64_t size;
@@ -184,6 +192,7 @@ typedef struct rcclApiFuncTable
ncclCommFinalize_fn_t ncclCommFinalize_fn;
ncclCommDestroy_fn_t ncclCommDestroy_fn;
ncclCommAbort_fn_t ncclCommAbort_fn;
ncclCommShrink_fn_t ncclCommShrink_fn;
ncclCommSplit_fn_t ncclCommSplit_fn;
ncclGetErrorString_fn_t ncclGetErrorString_fn;
ncclGetLastError_fn_t ncclGetLastError_fn;
@@ -198,6 +207,8 @@ typedef struct rcclApiFuncTable
mscclUnloadAlgo_fn_t mscclUnloadAlgo_fn;
ncclCommRegister_fn_t ncclCommRegister_fn;
ncclCommDeregister_fn_t ncclCommDeregister_fn;
ncclCommWindowRegister_fn_t ncclCommWindowRegister_fn;
ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
ncclAllReduceWithBias_fn_t ncclAllReduceWithBias_fn;
} rcclApiFuncTable;
+163 -23
Просмотреть файл
@@ -19,6 +19,28 @@
#endif
#endif
// minval(a, b, ...): smallest of one or more values that share the type Int.
// Base case: a single value is its own minimum.
template<typename Int>
constexpr static __host__ __device__ Int minval(Int a) { return a; }

// Recursive case: reduce the first two arguments to their minimum, then fold
// the remaining arguments in. Device compilation goes through min(); host
// compilation uses a plain comparison.
template<typename Int, typename ...More>
constexpr static __host__ __device__ Int minval(Int a, Int b, More ...more) {
  return minval(
#if __CUDA_ARCH__
    min(a, b),
#else
    (a < b ? a : b),
#endif
    more...);
}
// maxval(a, b, ...): largest of one or more values that share the type Int.
// Base case: a single value is its own maximum.
template<typename Int>
constexpr static __host__ __device__ Int maxval(Int a) { return a; }

// Recursive case: reduce the first two arguments to their maximum, then fold
// the remaining arguments in. Device compilation goes through max(); host
// compilation uses a plain comparison.
template<typename Int, typename ...More>
constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) {
  return maxval(
#if __CUDA_ARCH__
    max(a, b),
#else
    (a > b ? a : b),
#endif
    more...);
}
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
@@ -32,32 +54,150 @@
size = ((size + (align) - 1) / (align)) * (align);
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z divUp(X x, Y y) {
static __host__ __device__ constexpr Z divUp(X x, Y y) {
return (x+y-1)/y;
}
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundUp(X x, Y y) {
static __host__ __device__ constexpr Z roundUp(X x, Y y) {
return (x+y-1) - (x+y-1)%y;
}
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundDown(X x, Y y) {
static __host__ __device__ constexpr Z roundDown(X x, Y y) {
return x - x%y;
}
// assumes second argument is a power of 2
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignUp(X x, int a) {
static __host__ __device__ constexpr Z alignUp(X x, int a) {
return (x + a-1) & Z(-a);
}
// assumes second argument is a power of 2
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignDown(X x, int a) {
static __host__ __device__ constexpr Z alignDown(X x, int a) {
return x & Z(-a);
}
template<typename Int>
inline __host__ __device__ int countOneBits(Int x) {
constexpr __host__ __device__ bool isPow2(Int x) {
return (x & (x-1)) == 0;
}
// Adds delta4G to the second 32-bit word of `base` and returns the result.
// The value is reinterpreted through a union as two uint32_t words and only
// word [1] is modified.
// NOTE(review): this reads as "move a 64-bit value by delta4G * 4GiB", which
// assumes sizeof(T) == 8 and a little-endian layout -- confirm.
template<typename T>
static __host__ __device__ T add4G(T base, int delta4G) {
  union { T whole; uint32_t word[2]; };
  whole = base;
  word[1] = word[1] + delta4G;
  return whole;
}
// Adds delta4G to the second 32-bit word of `ptr`; if the result reaches or
// passes hi4G it is pulled back by (hi4G - lo4G). The first word is untouched.
// NOTE(review): presumably keeps the upper word inside [lo4G, hi4G) for a
// 64-bit little-endian T -- confirm against callers.
template<typename T>
static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union { T whole; uint32_t word[2]; };
  whole = ptr;
  uint32_t upper = word[1] + delta4G;
  if (upper >= hi4G) {
    upper -= hi4G-lo4G;
  }
  word[1] = upper;
  return whole;
}
// Subtracts delta4G from the second 32-bit word of `ptr`; if the result falls
// below lo4G it is pushed forward by (hi4G - lo4G). The first word is untouched.
// NOTE(review): the subtraction is unsigned, so a result that would go below
// zero wraps to a large value and is not corrected -- presumably callers keep
// the upper word >= delta4G; confirm.
template<typename T>
static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union { T whole; uint32_t word[2]; };
  whole = ptr;
  uint32_t upper = word[1] - delta4G;
  if (upper < lo4G) {
    upper += hi4G-lo4G;
  }
  word[1] = upper;
  return whole;
}
// Produce the reciprocal of x for use by the fast-division helpers
// (idivmodFast32 / idivFast32 / imodFast32): 2^32 / x, truncated to 32 bits.
// For x == 1 the quotient 2^32 does not fit and the result is 0.
constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x) {
  return static_cast<uint32_t>((uint64_t(1) << 32) / x);
}
// 64-bit reciprocal of x: (2^64 - 1) / x, plus one when isPow2(x) holds.
// For x == 1 the sum wraps around to 0.
constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x) {
  return ~uint64_t(0)/x + uint64_t(isPow2(x));
}
// Upper 32 bits of the 64-bit product a*b.
// Device builds use the __umulhi intrinsic; host builds widen to 64 bits.
static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) {
#if __CUDA_ARCH__
  return __umulhi(a, b);
#else
  const uint64_t product = uint64_t(a) * uint64_t(b);
  return uint32_t(product >> 32);
#endif
}
// Upper 64 bits of the 128-bit product a*b.
// Device builds use the __umul64hi intrinsic; host builds widen to
// unsigned __int128.
static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) {
#if __CUDA_ARCH__
  return __umul64hi(a, b);
#else
  const unsigned __int128 product = (unsigned __int128)a * b;
  return (uint64_t)(product >> 64);
#endif
}
// Produce the reciprocal of x*y given the reciprocals of x and y. Only
// multiplications are used here, so no integer division is incurred on device.
// A zero reciprocal on either side short-circuits to the other reciprocal.
static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
  if (xrcp == 0) return yrcp;
  if (yrcp == 0) return xrcp;
  const uint32_t xy = x*y;
  const uint32_t estimate = mul32hi(xrcp, yrcp);
  // Negated (mod 2^32) product of the divisor and the estimate; when the
  // divisor still fits in it, the estimate is one too small.
  const uint32_t rem = 0u - xy*estimate;
  return (xy <= rem) ? estimate + 1 : estimate;
}
// 64-bit variant: reciprocal of x*y derived from the reciprocals of x and y
// using multiplications only. A zero reciprocal on either side short-circuits
// to the other reciprocal.
static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
  if (xrcp == 0) return yrcp;
  if (yrcp == 0) return xrcp;
  const uint64_t xy = x*y;
  const uint64_t estimate = mul64hi(xrcp, yrcp);
  // Negated (mod 2^64) product of the divisor and the estimate; when the
  // divisor still fits in it, the estimate is one too small.
  const uint64_t rem = uint64_t(0) - xy*estimate;
  return (xy <= rem) ? estimate + 1 : estimate;
}
// Fast integer division where the divisor has a precomputed reciprocal:
//   idivFast32(x, y, idivRcp32(y)) == x/y
// Stores the quotient in *quo and the remainder in *rem.
// A reciprocal of 0 yields quotient x and remainder 0 without multiplying.
static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
  if (yrcp == 0) {
    *quo = x;
    *rem = 0;
    return;
  }
  uint32_t quotient = mul32hi(x, yrcp);
  uint32_t remainder = x - y*quotient;
  // Single correction step when the estimated quotient left a remainder >= y.
  if (remainder >= y) {
    quotient += 1;
    remainder -= y;
  }
  *quo = quotient;
  *rem = remainder;
}
// 64-bit fast division with a precomputed reciprocal of the divisor.
// Stores the quotient in *quo and the remainder in *rem.
// A reciprocal of 0 yields quotient x and remainder 0 without multiplying.
static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
  if (yrcp == 0) {
    *quo = x;
    *rem = 0;
    return;
  }
  uint64_t quotient = mul64hi(x, yrcp);
  uint64_t remainder = x - y*quotient;
  // Single correction step when the estimated quotient left a remainder >= y.
  if (remainder >= y) {
    quotient += 1;
    remainder -= y;
  }
  *quo = quotient;
  *rem = remainder;
}
// Quotient-only wrapper around idivmodFast32; the remainder is discarded.
static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient, remainder;
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  (void)remainder;
  return quotient;
}
// Quotient-only wrapper around idivmodFast64; the remainder is discarded.
// Returns the full 64-bit quotient: a uint32_t return type would silently drop
// the upper half whenever x/y does not fit in 32 bits. Callers that store the
// result in a 32-bit variable still compile and see the same (narrowed) value.
static __host__ __device__ uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  (void)r;
  return q;
}
// Remainder-only wrapper around idivmodFast32; the quotient is discarded.
static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient, remainder;
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  (void)quotient;
  return remainder;
}
// Remainder-only wrapper around idivmodFast64; the quotient is discarded.
// Returns the full 64-bit remainder: with a divisor above 2^32 the remainder
// can exceed 32 bits, which a uint32_t return type would silently truncate.
// Callers that store the result in a 32-bit variable still compile and see the
// same (narrowed) value.
static __host__ __device__ uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  (void)q;
  return r;
}
template<typename Int>
static __host__ __device__ int countOneBits(Int x) {
#if __CUDA_ARCH__
if (sizeof(Int) <= sizeof(unsigned int)) {
return __popc((unsigned int)x);
@@ -83,7 +223,7 @@ inline __host__ __device__ int countOneBits(Int x) {
// Returns index of first one bit or returns -1 if mask is zero.
template<typename Int>
inline __host__ __device__ int firstOneBit(Int mask) {
static __host__ __device__ int firstOneBit(Int mask) {
int i;
#if __CUDA_ARCH__
if (sizeof(Int) <= sizeof(int)) {
@@ -108,14 +248,14 @@ inline __host__ __device__ int firstOneBit(Int mask) {
}
template<typename Int>
inline __host__ __device__ int popFirstOneBit(Int* mask) {
static __host__ __device__ int popFirstOneBit(Int* mask) {
Int tmp = *mask;
*mask &= *mask-1;
return firstOneBit(tmp);
}
template<typename Int>
inline __host__ __device__ int log2Down(Int x) {
static __host__ __device__ int log2Down(Int x) {
int w, n;
#if __CUDA_ARCH__
if (sizeof(Int) <= sizeof(int)) {
@@ -147,7 +287,7 @@ inline __host__ __device__ int log2Down(Int x) {
}
template<typename Int>
inline __host__ __device__ int log2Up(Int x) {
static __host__ __device__ int log2Up(Int x) {
int w, n;
if (x != 0) x -= 1;
#if __CUDA_ARCH__
@@ -180,19 +320,19 @@ inline __host__ __device__ int log2Up(Int x) {
}
template<typename Int>
inline __host__ __device__ Int pow2Up(Int x) {
static __host__ __device__ Int pow2Up(Int x) {
return Int(1)<<log2Up(x);
}
template<typename Int>
inline __host__ __device__ Int pow2Down(Int x) {
static __host__ __device__ Int pow2Down(Int x) {
// True, log2Down can return -1, but we don't normally pass 0 as an argument...
// coverity[negative_shift]
return Int(1)<<log2Down(x);
}
template<typename UInt, int nSubBits>
inline __host__ __device__ UInt reverseSubBits(UInt x) {
static __host__ __device__ UInt reverseSubBits(UInt x) {
if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) {
switch (8*sizeof(UInt)) {
case 16: x = __builtin_bswap16(x); break;
@@ -225,7 +365,7 @@ template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned lon
// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
template<typename Int>
inline __host__ __device__ Int reverseBits(Int x, int nBits) {
static __host__ __device__ Int reverseBits(Int x, int nBits) {
using UInt = typename ncclToUnsigned<Int>::type;
union { UInt ux; Int sx; };
sx = x;
@@ -249,7 +389,7 @@ inline __host__ __device__ Int reverseBits(Int x, int nBits) {
// has nearly the full range of uint32_t except it only keeps the top 3 bits
// beneath the leading 1 bit and thus has a max value of 0xf0000000.
inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
int log2x;
#if __CUDA_ARCH__
log2x = 31-__clz(x|1);
@@ -261,7 +401,7 @@ inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
return exponent<<bitsPerPow2 | mantissa;
}
inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
static __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
uint32_t exponent = x>>bitsPerPow2;
uint32_t mantissa = (x & ((1u<<bitsPerPow2)-1)) | (exponent!=0 ? 0x8 : 0);
if (exponent != 0) exponent -= 1;
@@ -270,16 +410,16 @@ inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
constexpr uint32_t u32fp8MaxValue() { return 0xf0000000; }
inline __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
static __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
return u32fpEncode(x, 3);
}
inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
static __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
return u32fpDecode(x, 3);
}
// The hash isn't just a function of the bytes but also where the bytes are split
// into different calls to eatHash().
inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
static __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
char const* ptr = (char const*)bytes;
acc[0] ^= size;
while (size != 0) {
@@ -302,11 +442,11 @@ inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size
}
template<typename T>
inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
eatHash(acc, (const void*)bytes, sizeof(T));
}
inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
uint64_t h = acc[0];
h ^= h >> 31;
h *= 0xbac3bd562846de6b;
@@ -316,13 +456,13 @@ inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
return h;
}
inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
uint64_t acc[2] = {1, 1};
eatHash(acc, bytes, size);
return digestHash(acc);
}
template<typename T>
inline __host__ __device__ uint64_t getHash(const T* bytes) {
static __host__ __device__ uint64_t getHash(const T* bytes) {
return getHash((const void*)bytes, sizeof(T));
}
+48 -18
Просмотреть файл
@@ -19,6 +19,7 @@
#include "graph.h"
#include "nvmlwrap.h"
#include "profiler.h"
#include "allocator.h"
#include "latency_profiler/CollTrace.h"
#include "rccl_common.h"
#include "recorder.h"
@@ -140,7 +141,6 @@ struct ncclSharedResources {
int* tpRankToLocalRank;
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
int persistentRefs;
cudaEvent_t launchEvent, scratchEvent;
@@ -229,6 +229,7 @@ struct ncclTaskColl {
// Profiler plugin
int eActivationMask;
void* eventHandle;
uint8_t nChannels;
};
struct ncclTaskP2p {
struct ncclTaskP2p* next;
@@ -243,6 +244,7 @@ struct ncclTaskP2p {
// Profiler plugin
int eActivationMask;
void* eventHandle;
uint8_t nChannels;
};
struct ncclKernelPlan {
@@ -255,10 +257,14 @@ struct ncclKernelPlan {
bool persistent; // aka captured in a graph
bool isHostCbEnq;
bool isSymColl;
enum ncclDevWorkStorageType workStorageType;
bool kernelSpecialized;
void *kernelFn;
struct ncclDevKernelArgs* kernelArgs;
void* kernelFn;
union {
struct ncclDevKernelArgs* kernelArgs;
struct ncclSymDevArgs* kernelSymArgs;
};
size_t kernelArgsSize;
struct channelMasks channelMask;
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
@@ -367,6 +373,7 @@ struct ncclKernelPlanner {
struct Peer* peers/*[nRanks]*/;
int nTasksColl, nTasksP2p;
bool persistent;
bool isSymColl;
// The list of user streams aggregated over all tasks present.
struct ncclCudaStreamList* streams;
@@ -430,12 +437,19 @@ struct ncclPeerInfo {
int64_t busId;
struct ncclComm* comm;
int cudaCompCap;
size_t totalGlobalMem;
// MNNVL support
nvmlGpuFabricInfoV_t fabricInfo;
int cuMemSupport;
int version;
};
// Task categories a communicator can be queued under within a group. The
// values are used as indices into per-type arrays (e.g. groupNext[] and
// ncclGroupCommHead[]), so they must stay dense and start at 0.
typedef enum ncclGroupTaskType {
ncclGroupTaskTypeCollective = 0,
ncclGroupTaskTypeSymRegister = 1,
ncclGroupTaskTypeNum = 2, // count of the task types above; used as an array length
} ncclGroupTaskType_t;
struct ncclComm {
uint64_t startMagic;
struct ncclMemoryStack memPermanent, memScoped;
@@ -452,9 +466,10 @@ struct ncclComm {
struct ncclTopoSystem* topo;
struct ncclProxyConnector* gproxyConn;
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
bool peerInfoValid;
int netPluginLoaded;
ncclNet_t* ncclNet;
int netPluginIndex;
int ncclNetVer;
ncclNetDeviceType netDeviceType;
ncclCollNet_t* ncclCollNet;
@@ -471,7 +486,6 @@ struct ncclComm {
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
const char* commName;
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
@@ -556,6 +570,7 @@ struct ncclComm {
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
struct ncclSymDevComm symDevComm;
uint32_t workArgsBytes; // max size of kernel args
uint32_t workFifoBytes; // size of workFifoBuf, power of 2
@@ -563,12 +578,10 @@ struct ncclComm {
void* workFifoBufDev;
void* workFifoBufGdrHandle;
// Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory.
uint32_t* workFifoConsumed/*[MAXCHANNELS]*/;
// Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS)
uint32_t workFifoConsumedLeast;
// Monotonic number of bytes (mod 1<<32) sent to fifo.
uint32_t workFifoProduced;
uint32_t workFifoProducedLastRecorded;
uint32_t workFifoConsumed;
// Intra-process sync
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
@@ -584,10 +597,8 @@ struct ncclComm {
struct ncclProxyState* proxyState;
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
// Whether this communicator uses collNet
int collNetSupport;
bool isOneRPN;
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
bool intraNodeP2pSupport;
int* collNetHeads;
int collNetHeadsNum;
int* collNetDenseToUserRank;
@@ -609,7 +620,7 @@ struct ncclComm {
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
// this comm is not yet in a group.
struct ncclComm* groupNext;
struct ncclComm* groupNext[ncclGroupTaskTypeNum];
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
struct ncclComm* preconnectNext;
int localPersistentRefs; // number of persistent plan-lists capturing this comm
@@ -631,6 +642,7 @@ struct ncclComm {
ncclUserRedOp *userRedOps;
// Queue of things for the main thread to do
int reclaimSteps;
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
hipEvent_t doneEvent;
@@ -670,6 +682,9 @@ struct ncclComm {
// group job to support multi-thread FT
struct ncclGroupJob *groupJob;
// Flag indicating if this communicator shares resources with parent or children
bool shareResources;
// Tuning plugin
int tunerPluginLoaded;
ncclTuner_t* tuner;
@@ -683,16 +698,25 @@ struct ncclComm {
// buffer registration cache
struct ncclRegCache regCache;
int isAllNvlink;
bool isAllDirectP2p;
int symmetricSupport;
bool useNetPXN;
bool useGdr;
int splitCount;
// symmetric buffer
uint8_t* baseUCSymPtr;
uint8_t* baseMCSymPtr;
size_t baseStride;
size_t symAllocHead;
CUmemGenericAllocationHandle symMCHandle;
struct ncclIntruQueue<struct ncclSymRegTask, &ncclSymRegTask::next> symRegTaskQueue;
// Unroll factor for comm [RCCL]
int unroll;
// custom collective
// custom collective [RCCL]
bool enableCustColl;
uint64_t endMagic;
};
@@ -724,15 +748,21 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
return ncclSuccess;
}
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bool waitSome) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
while (true) {
struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
if (cb == nullptr) break;
cudaError_t ok = cudaEventSynchronize(cb->event);
if (ok == cudaErrorNotReady) break;
cudaError_t ok;
if (waitSome) {
ok = cudaEventSynchronize(cb->event);
waitSome = false;
} else {
ok = cudaEventQuery(cb->event);
if (ok == cudaErrorNotReady) break;
}
ncclIntruQueueDequeue(&comm->eventCallbackQueue);
if (ok == cudaSuccess) {
NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
+25
Просмотреть файл
@@ -58,4 +58,29 @@ static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
return ncclSuccess;
}
// Format the CPUs set in `mask` as a comma-separated list of single CPUs and
// inclusive ranges (e.g. "0-2,5") into `str`, which holds `len` bytes.
// Returns `str`. The output is NUL-terminated whenever len > 0; if the buffer
// is too small the list is truncated. With len == 0 nothing is written, since
// there is no room even for the terminator.
static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
  if (len == 0) return str;
  str[0] = '\0';
  size_t used = 0;   // bytes written so far, excluding the terminator
  int start = -1;    // first CPU of the range being scanned, -1 if none
  // Iterate through all possible CPU bits plus one extra position so that a
  // range reaching the last CPU is still flushed.
  for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
    int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask);
    // Start of a new range
    if (isSet && start == -1) {
      start = cpu;
    }
    // End of a range, add comma between ranges
    if (!isSet && start != -1) {
      const char* sep = used ? "," : "";
      int n;
      if (cpu-1 == start) {
        n = snprintf(str+used, len-used, "%s%d", sep, start);
      } else {
        n = snprintf(str+used, len-used, "%s%d-%d", sep, start, cpu-1);
      }
      // Output error: keep what was written so far, properly terminated.
      if (n < 0) { str[used] = '\0'; break; }
      used += (size_t)n;
      // snprintf reports the untruncated length; stop once the buffer is full
      // so that len-used can never underflow on the next iteration.
      if (used >= len-1) break;
      start = -1;
    }
  }
  return str;
}
#endif
+37 -33
Просмотреть файл
@@ -36,6 +36,10 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
} \
} while(false)
// Evaluate pfn_<cmd> and discard its result: unlike CUCHECKGOTO, the returned
// value is neither inspected nor reported.
#define CUCALL(cmd) do { \
pfn_##cmd; \
} while(false)
#define CUCHECKGOTO(cmd, res, label) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
@@ -66,49 +70,49 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
} \
} while(0)
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 11040);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel, 4000);
#if CUDART_VERSION >= 11080
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx, 11060);
#endif
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemMap);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle, 10020);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind);
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
#endif
#endif
+36 -7
Просмотреть файл
@@ -14,6 +14,7 @@
#include <hip/hip_bfloat16.h>
#include "nccl_common.h"
#include "bitops.h"
#include "symmetric.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
@@ -41,6 +42,30 @@ extern const char* funcNames[];
#define NCCL_CUDA_ARCH 0
#endif
#ifdef __CUDA_ARCH_SPECIFIC__
#define NCCL_CUDA_ARCH_SPECIFIC __CUDA_ARCH_SPECIFIC__
#elif defined(__CUDA_ARCH_HAS_FEATURE__)
#if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
#define NCCL_CUDA_ARCH_SPECIFIC 900
#elif __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
#define NCCL_CUDA_ARCH_SPECIFIC 1000
#elif __CUDA_ARCH_HAS_FEATURE__(SM101_ALL)
#define NCCL_CUDA_ARCH_SPECIFIC 1010
#elif __CUDA_ARCH_HAS_FEATURE__(SM120_ALL)
#define NCCL_CUDA_ARCH_SPECIFIC 1200
#else
#define NCCL_CUDA_ARCH_SPECIFIC 0
#endif
#else
#define NCCL_CUDA_ARCH_SPECIFIC 0
#endif
#ifdef __CUDA_ARCH_FAMILY_SPECIFIC__
#define NCCL_CUDA_ARCH_FAMILY_SPECIFIC __CUDA_ARCH_FAMILY_SPECIFIC__
#else
#define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0
#endif
#include "net_device.h"
enum ncclDevRedOp_t {
@@ -516,6 +541,14 @@ struct alignas(16) ncclDevChannel {
uint64_t workCounter;
};
// Number of entries in ncclDevProfiler::data.
#define MAX_PROFILER_EVENTS_PER_CHANNEL 64
// Fixed-size table of (counter, timestamp) pairs.
// NOTE(review): presumably one table per channel, filled by the device-side
// profiler hooks via ncclDevComm::workStarted / workCompleted -- confirm.
struct ncclDevProfiler {
struct {
uint64_t counter;
uint64_t timestamp;
} data[MAX_PROFILER_EVENTS_PER_CHANNEL];
};
struct ncclDevComm {
int rank;
int nRanks;
@@ -526,9 +559,6 @@ struct ncclDevComm {
int isAllNvlink;
int p2pnChannelsPerPeer;
// Work fifo return credits
uint32_t* workConsumed/*[MAXCHANNELS]*/;
int* collNetDenseToUserRank;
// Flag to ask NCCL kernels to abort
@@ -540,8 +570,8 @@ struct ncclDevComm {
int* rankToLocalRank;
// Profiler counters
uint64_t* workStarted/*[MAXCHANNELS]*/;
uint64_t* workCompleted/*[MAXCHANNELS]*/;
struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
@@ -641,7 +671,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4;
return cudaArch >= 800 ? (cudaArch / 100 == 12 ? 6 : 8) : 4;
}
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
@@ -672,7 +702,6 @@ extern int const ncclDevKernelCount;
extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
// Table of most specialized kernel function to run given func index.
extern int const ncclDevFuncIdCount;
extern int const ncclDevFuncRowToId[];
extern void* const ncclDevKernelForFunc[/*funcIndex*/];
extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
+4
Просмотреть файл
@@ -51,6 +51,8 @@ int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
ncclResult_t ncclGetUserP2pLevel(int* level);
#define MAX_XGMI_INTER_GPUS 4
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
@@ -81,7 +83,9 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
// Allows for up to 32 NICs per node on GB200-NVL72
#define NCCL_TOPO_MAX_NODES 64
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType);
// Init search. Needs to be done before calling ncclTopoCompute
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
+27 -41
Просмотреть файл
@@ -10,9 +10,11 @@
#include "nccl.h"
#include "comm.h"
#include "allocator.h"
#include "register.h"
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
void ncclGroupCommJoin(struct ncclComm* comm);
void ncclGroupCommJoin(struct ncclComm* comm, int type);
void ncclGroupCommPreconnect(struct ncclComm* comm);
ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob);
@@ -53,13 +55,14 @@ ncclResult_t ncclAsyncLaunch(
struct ncclGroupJob {
struct ncclAsyncJob base;
struct ncclComm **groupCommHeadPtr;
struct ncclComm **groupCommPreconnectHeadPtr;
ncclResult_t *groupErrorPtr;
bool *abortFlagPtr;
int *groupBlockingPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
bool initialized;
int groupRefCount;
bool nonBlockingInit;
bool joined;
struct ncclComm *groupCommHead[ncclGroupTaskTypeNum];
struct ncclComm *groupCommPreconnectHead;
ncclResult_t groupError;
bool abortFlag;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncJobs;
};
ncclResult_t ncclGroupStartInternal();
@@ -70,27 +73,9 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
extern __thread ncclResult_t ncclGroupError;
extern __thread struct ncclComm* ncclGroupCommHead;
extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum];
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
extern __thread int ncclGroupBlocking;
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
extern __thread struct ncclGroupJob ncclGroupJobMain;
static inline void groupResetJobState() {
ncclGroupBlocking = -1;
ncclGroupJobMainPtr = NULL;
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
return;
}
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
ncclResult_t ret = ncclSuccess;
if (job) {
ret = ncclAsyncJobComplete(&job->base);
groupResetJobState();
}
return ret;
}
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
if (ncclGroupDepth > 0) {
@@ -100,31 +85,32 @@ inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
}
// Add comm to this thread's group
inline void ncclGroupCommJoin(struct ncclComm* comm) {
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
inline void ncclGroupCommJoin(struct ncclComm* comm, int type) {
if (comm->groupNext[type] == reinterpret_cast<struct ncclComm*>(0x1)) {
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
// the user's program order yet ensures siblings occur consecutively. This
// is required by doLaunches() in "group.cc".
struct ncclComm** pp = &ncclGroupCommHead;
struct ncclComm** pp = &ncclGroupCommHead[type];
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
pp = &(*pp)->groupNext;
pp = &(*pp)->groupNext[type];
// Didn't find its clique: insert it in ascending order of commHash.
if (*pp == nullptr) {
pp = &ncclGroupCommHead;
while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext;
pp = &ncclGroupCommHead[type];
while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext[type];
}
comm->groupNext = *pp;
comm->groupNext[type] = *pp;
*pp = comm;
// Each comm gets a new memory stack scope upon joining. Every task batched for
// this comm is allocated there.
ncclMemoryStackPush(&comm->memScoped);
// Initialize planner
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
memset(&comm->planner, 0, sizeof(comm->planner));
comm->planner.peers = tmp;
if (type == ncclGroupTaskTypeCollective) {
// Initialize planner
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
memset(&comm->planner, 0, sizeof(comm->planner));
comm->planner.peers = tmp;
}
}
ncclGroupBlocking = comm->config.blocking;
}
@@ -137,8 +123,8 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
}
// Comm has left group
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm, int type) {
comm->groupNext[type] = reinterpret_cast<struct ncclComm*>(0x1);
ncclMemoryStackPop(&comm->memScoped);
return ncclSuccess;
}
+18
Просмотреть файл
@@ -0,0 +1,18 @@
#ifndef NCCL_MLX5DV_CORE_H_
#define NCCL_MLX5DV_CORE_H_
/* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without
* explicit including of MLX5 direct verbs header.
*/
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include "ibvwrap.h"
/* Local declaration of the mlx5dv DMA-BUF registration access flags, so this
 * header can be used without including the MLX5 direct verbs header.
 * Each enumerator is a single bit.
 * NOTE(review): presumably these are the values passed as the `mlx5_access`
 * argument of the mlx5dv DMA-BUF registration call - confirm against rdma-core.
 */
enum mlx5dv_reg_dmabuf_access {
MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT = (1<<0),
};
#endif // NCCL_MLX5DV_CORE_H_
+23
Просмотреть файл
@@ -0,0 +1,23 @@
#ifndef NCCL_MLX5DV_SYMBOLS_H_
#define NCCL_MLX5DV_SYMBOLS_H_
#ifdef NCCL_BUILD_MLX5DV
#include <infiniband/mlx5dv.h>
#else
#include "mlx5/mlx5dvcore.h"
#endif
#include "nccl.h"
/* MLX5 Direct Verbs function pointers.
 * One pointer per mlx5dv entry point; the signatures take verbs objects
 * (ibv_device, ibv_context, ibv_pd) and plain C types.
 */
struct ncclMlx5dvSymbols {
/* Takes a verbs device and returns a bool. */
bool (*mlx5dv_internal_is_supported)(struct ibv_device *device);
/* Takes a verbs context plus a caller-provided buffer `buf` of `buf_len` bytes; returns an int. */
int (*mlx5dv_internal_get_data_direct_sysfs_path)(struct ibv_context *context, char *buf, size_t buf_len);
/* DMA-BUF support */
/* Returns an ibv_mr pointer for the given protection domain, file descriptor `fd`,
 * offset/length/iova and the two access flag words (`access`, `mlx5_access`). */
struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
};
/* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */
ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols);
#endif // NCCL_MLX5DV_SYMBOLS_H_
+41
Просмотреть файл
@@ -0,0 +1,41 @@
/*************************************************************************
* Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
* Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved.
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_MLX5DVWRAP_H_
#define NCCL_MLX5DVWRAP_H_
#include <arpa/inet.h>
#include <netinet/in.h>
#ifdef NCCL_BUILD_MLX5DV
#include <infiniband/mlx5dv.h>
#else
#include "mlx5/mlx5dvcore.h"
#endif
#include "core.h"
#include "ibvwrap.h"
#include <sys/types.h>
#include <unistd.h>
/* Return codes for mlx5dv calls. Only the success value (0) is defined here.
 * NOTE(review): how non-zero results are interpreted is not visible in this
 * header - check the wrapper implementation.
 */
typedef enum mlx5dv_return_enum
{
MLX5DV_SUCCESS = 0, //!< The operation was successful
} mlx5dv_return_t;
ncclResult_t wrap_mlx5dv_symbols(void);
/* NCCL wrappers of MLX5 direct verbs functions */
bool wrap_mlx5dv_is_supported(struct ibv_device *device);
ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len);
/* DMA-BUF support */
ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
#endif // NCCL_MLX5DVWRAP_H_
+14 -1
Просмотреть файл
@@ -7,6 +7,9 @@
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
#include <cstdint>
#include "nccl.h"
typedef enum {
NCCL_LOG_NONE = 0,
NCCL_LOG_VERSION = 1,
@@ -39,6 +42,16 @@ typedef enum {
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
// NCCL core profiler callback for network defined events instrumentation
// Event kinds for network defined profiler events. Values are consecutive,
// starting at 0.
// NOTE(review): presumably passed as the `type` argument of
// ncclProfilerCallback_t - confirm against the callers.
enum {
ncclProfilerNetEventStart = 0,
ncclProfilerNetEventStop,
ncclProfilerNetEventUpdate,
ncclProfilerNetEventUpdateAndStop,
};
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum {
ncclFuncBroadcast = 0,
@@ -54,7 +67,7 @@ typedef enum {
ncclNumFuncs = 10
} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT
#define NCCL_ALGO_UNDEF -1
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
-2
Просмотреть файл
@@ -14,8 +14,6 @@
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
ncclResult_t ncclNetInit(struct ncclComm* comm);
ncclResult_t ncclNetFinalize(struct ncclComm* comm);
+2 -1
Просмотреть файл
@@ -37,10 +37,11 @@
#define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommSplit 18
#define NVTX_SID_CommFinalize 19
#define NVTX_SID_CommShrink 20
// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below!
// Define static schema ID for the reduction operation.
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 20 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 21 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
+10
Просмотреть файл
@@ -70,6 +70,16 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static cons
)
)
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommShrink, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
(int, nranks, TYPE_INT, nccl_nvtxNranksStr),
(int, myrank, TYPE_INT, nccl_nvtxRankStr),
(int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr),
(int, num_exclude, TYPE_INT, "num_exclude")
)
)
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr)
+3 -4
Просмотреть файл
@@ -29,10 +29,9 @@
#define NCCL_NET_MAX_REQUESTS 32
// Max number of ncclNet objects which can live in the same process
#define NCCL_NET_MAX_PLUGINS 3
// NCCL core profiler callback for network defined events instrumentation
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
#ifndef NCCL_NET_MAX_PLUGINS
#define NCCL_NET_MAX_PLUGINS 16
#endif
#include "net/net_v10.h"
#include "net/net_v9.h"
+32 -22
Просмотреть файл
@@ -19,43 +19,53 @@ enum {
};
typedef enum {
ncclProfilerProxyOpSendPosted,
ncclProfilerProxyOpSendRemFifoWait,
ncclProfilerProxyOpSendTransmitted,
ncclProfilerProxyOpSendDone,
ncclProfilerProxyOpRecvPosted,
ncclProfilerProxyOpRecvReceived,
ncclProfilerProxyOpRecvTransmitted,
ncclProfilerProxyOpRecvDone,
ncclProfilerProxyOpSendPosted = 0, // deprecated in v4
ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4
ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4
ncclProfilerProxyOpSendDone = 3, // deprecated in v4
ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4
ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4
ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4
ncclProfilerProxyOpInProgress_v4 = 19,
/* Legacy proxy profiler states */
ncclProfilerProxyStepSendGPUWait,
ncclProfilerProxyStepSendWait,
ncclProfilerProxyStepRecvWait,
ncclProfilerProxyStepRecvFlushWait,
ncclProfilerProxyStepRecvGPUWait,
ncclProfilerProxyStepSendGPUWait = 8,
ncclProfilerProxyStepSendPeerWait_v4 = 20,
ncclProfilerProxyStepSendWait = 9,
ncclProfilerProxyStepRecvWait = 10,
ncclProfilerProxyStepRecvFlushWait = 11,
ncclProfilerProxyStepRecvGPUWait = 12,
/* Legacy proxy control states */
ncclProfilerProxyCtrlIdle,
ncclProfilerProxyCtrlActive,
ncclProfilerProxyCtrlSleep,
ncclProfilerProxyCtrlWakeup,
ncclProfilerProxyCtrlAppend,
ncclProfilerProxyCtrlAppendEnd,
ncclProfilerProxyCtrlIdle = 13,
ncclProfilerProxyCtrlActive = 14,
ncclProfilerProxyCtrlSleep = 15,
ncclProfilerProxyCtrlWakeup = 16,
ncclProfilerProxyCtrlAppend = 17,
ncclProfilerProxyCtrlAppendEnd = 18,
/* Network defined event states */
ncclProfilerNetPluginUpdate = 21,
/* Kernel event states */
ncclProfilerKernelChStop = 22,
} ncclProfilerEventState_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
#include <cstdint>
#include "profiler/profiler_v4.h"
#include "profiler/profiler_v3.h"
#include "profiler/profiler_v2.h"
#include "profiler/profiler_v1.h"
typedef ncclProfiler_v3_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v4_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
#define NCCL_PROFILER_NET_VER_BITS (16)
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
+123
Просмотреть файл
@@ -0,0 +1,123 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V4_H_
#define PROFILER_V4_H_
// Version 4 event descriptor handed to the profiler plugin's startEvent().
// Three common fields are followed by an anonymous union holding one payload
// per event kind.
// NOTE(review): presumably `type` selects which union member is valid - confirm.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// Payload for collective events (see the parentObj note above).
struct {
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
} coll;
// Payload for point-to-point events.
struct {
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
uint8_t nChannels;
} p2p;
// Payload for proxy operation events.
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend; // NOTE(review): presumably non-zero for send, zero for recv - confirm
} proxyOp;
// Payload for proxy step events.
struct {
int step;
} proxyStep;
// Payload for kernel channel events.
struct {
uint8_t channelId;
uint64_t pTimer; // start timestamp from GPU globaltimer
} kernelCh;
// Payload for network plugin defined events.
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v4_t;
// Version 4 optional arguments passed to recordEventState(). This is a union:
// only one member is meaningful for a given call.
// NOTE(review): presumably the member is chosen by the kind of event being
// updated - confirm against the callers.
typedef union {
// Proxy step update.
struct {
size_t transSize;
} proxyStep;
// Proxy control update.
struct {
int appendedProxyOps;
} proxyCtrl;
// Network plugin defined update.
struct {
void* data;
} netPlugin;
// Kernel channel update.
struct {
uint64_t pTimer;
} kernelCh;
} ncclProfilerEventStateArgs_v4_t;
// Version 4 profiler plugin interface: a name plus a table of callbacks.
// Every callback returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
//             (passed as void**; NOTE(review): presumably filled in by the plugin - confirm)
// - commName : user assigned communicator name
// - commHash : communicator id
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_v4_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v4_t;
#endif
+6 -7
Просмотреть файл
@@ -21,8 +21,8 @@ struct ncclProxyConnector;
struct ncclProfilerProxy {
bool initialized;
uint64_t* workStarted/*[MAXCHANNELS]*/;
uint64_t* workCompleted/*[MAXCHANNELS]*/;
struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
uint64_t workCounter[MAXCHANNELS]; // host work counter
struct ncclProxyConnector sendProxyConn[MAXCHANNELS];
struct ncclProxyConnector recvProxyConn[MAXCHANNELS];
@@ -43,8 +43,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
// Proxy Op Start/Stop Event Wrappers
ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
// Proxy Step Start/Stop Event Wrappers
@@ -57,11 +56,11 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
// Kernel Channel Start/Stop Event Wrappers
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s);
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s);
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start);
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop);
// Record Event Wrappers
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState);
ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
+10 -2
Просмотреть файл
@@ -118,6 +118,13 @@ struct ncclProxyOp {
facebook_rccl::ProxyTraceExtraInfo traceInfo;
};
struct ncclProxySubArgs;
// Pairs an opaque step event handle with a pointer to a ncclProxySubArgs.
// NOTE(review): presumably subArgPtr points back at the sub-args that own this
// entry (ncclProxySubArgs keeps NCCL_STEPS of these) - confirm.
struct ncclProxyEventHandle {
void* stepEventHandle;
struct ncclProxySubArgs* subArgPtr;
};
struct ncclProxySubArgs {
struct ncclProxyConnection* connection;
int reg;
@@ -150,13 +157,12 @@ struct ncclProxySubArgs {
// Profiler plugin
int eActivationMask;
int rank;
uint64_t profilerSteps;
pid_t pid;
void* profilerContext;
void* taskEventHandle;
void* opEventHandle;
void* kernelEventHandle;
void* stepEventHandles[NCCL_STEPS];
struct ncclProxyEventHandle pHandles[NCCL_STEPS];
size_t transSize;
uint64_t workCounter;
@@ -254,6 +260,8 @@ struct ncclProxyPeer {
};
struct ncclSharedNetComms {
int activeConnect[MAXCHANNELS];
int activeAccept[MAXCHANNELS];
void* sendComm[MAXCHANNELS];
void* recvComm[MAXCHANNELS];
int sendRefCount[MAXCHANNELS];
+19 -5
Просмотреть файл
@@ -29,18 +29,24 @@ struct ncclRegNetHandles {
struct ncclRegNetHandles* next;
};
// Node of a singly linked list (via `next`) describing one symmetric
// registration request. Apart from `next`, the fields carry the same names and
// types as the arguments of ncclCommSymmetricRegisterInternal().
// NOTE(review): presumably these tasks are queued and replayed through that
// function later - confirm.
struct ncclSymRegTask {
struct ncclSymRegTask *next;
void* buff;
size_t baseSize;
CUmemGenericAllocationHandle memHandle;
struct ncclReg* regHandle;
size_t alignment;
};
struct ncclReg {
// common attributes
size_t pages;
uintptr_t begAddr, endAddr; // page aligned
int localRefs;
int graphRefs;
uintptr_t addr;
uint32_t state;
// net reg
struct ncclRegNetHandles* netHandleHead;
// nvls reg
uintptr_t baseAddr;
size_t baseSize;
CUdeviceptr regAddr;
size_t regUCSize, regMCSize;
int dev;
@@ -52,6 +58,10 @@ struct ncclReg {
// general ipc reg
struct ncclPeerRegIpcAddr regIpcAddrs;
struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
// symmetric reg
void* baseSymPtr;
size_t symSize;
int winFlags;
};
struct ncclRegCache {
@@ -60,10 +70,14 @@ struct ncclRegCache {
uintptr_t pageSize;
};
// A window is a thin wrapper around a pointer to a registration record.
struct ncclWindow {
struct ncclReg* handle;
};
ncclResult_t ncclRegCleanup(struct ncclComm* comm);
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);
ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle);
#endif
+33
Просмотреть файл
@@ -0,0 +1,33 @@
#ifndef NCCL_REGISTER_INLINE_H_
#define NCCL_REGISTER_INLINE_H_
#include "comm.h"
#include "register.h"
// Look up the registration record that covers [data, data+size) in the
// communicator's registration cache.
//   comm   : communicator whose regCache is searched
//   data   : start address of the user buffer
//   size   : length of the user buffer in bytes
//   outReg : set to the covering record, or NULL when none is found
// Always returns ncclSuccess; a miss is reported through *outReg == NULL.
// The scan walks the slots in order and stops at the first record whose
// begAddr lies above `data`.
// NOTE(review): that early exit assumes slots are sorted by begAddr - confirm.
static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) {
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t const addr = (uintptr_t)data;
  *outReg = NULL;
  for (int i = 0; i != cache->population; i++) {
    struct ncclReg* candidate = cache->slots[i];
    // Every remaining record starts past the buffer: nothing can match.
    if (addr < candidate->begAddr) break;
    // The buffer starts inside this record; accept it if it also ends inside.
    if (addr + size <= candidate->endAddr) {
      *outReg = candidate;
      break;
    }
  }
  return ncclSuccess;
}
// Translate a user buffer into its address inside the symmetric mapping.
//   comm   : communicator whose registration cache is searched
//   data   : start address of the user buffer
//   size   : length of the user buffer in bytes
//   symPtr : set to baseSymPtr plus the offset of `data` from the record's
//            begAddr, or NULL when there is no symmetric mapping
//   outReg : set to the matching record, or NULL
// Both outputs stay NULL when no record covers the buffer or when the record
// has no baseSymPtr. Errors from ncclRegFind() are propagated by NCCLCHECK.
static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) {
  *symPtr = NULL;
  *outReg = NULL;
  struct ncclReg* found = NULL;
  NCCLCHECK(ncclRegFind(comm, data, size, &found));
  if (found == NULL || !found->baseSymPtr) return ncclSuccess;
  uintptr_t const offset = (uintptr_t)data - (uintptr_t)found->begAddr;
  *symPtr = (void*)((uintptr_t)found->baseSymPtr + offset);
  *outReg = found;
  return ncclSuccess;
}
#endif
+19 -9
Просмотреть файл
@@ -15,25 +15,35 @@ typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, voi
typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
#ifdef __HIP_PLATFORM_AMD__
#define CUPFN(symbol) symbol
#else
#define CUPFN(symbol) pfn_##symbol
#endif
// Check CUDA PFN driver calls
#define CUCHECK(cmd) do { \
#define HSACHECK(cmd) do { \
hsa_status_t err = pfn_##cmd; \
if( err != HSA_STATUS_SUCCESS ) { \
const char *errStr; \
pfn_hsa_status_string(err, &errStr); \
WARN("ROCr failure '%s'", errStr); \
WARN("HIP failure '%s'", errStr); \
return ncclUnhandledCudaError; \
} \
} while(false)
// Check a call that yields a hipError_t. `cmd` is evaluated once, as written.
// On any result other than hipSuccess, log the HIP error string together with
// the file and line, and return ncclUnhandledCudaError from the enclosing
// function, which must therefore return ncclResult_t.
// The macro declares a local named `err`, so `cmd` should not refer to an
// outer variable of that name.
#define CUCHECK(cmd) do { \
hipError_t err = cmd; \
if( err != hipSuccess ) { \
WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUCHECKGOTO(cmd, res, label) do { \
hsa_status_t err = pfn_##cmd; \
if( err != HSA_STATUS_SUCCESS ) { \
const char *errStr; \
pfn_hsa_status_string(err, &errStr); \
WARN("ROCr failure '%s'", errStr); \
hipError_t err = cmd; \
if( err != hipSuccess ) { \
WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__); \
res = ncclUnhandledCudaError; \
goto label; \
} \
@@ -45,7 +55,7 @@ typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size
if( err != HSA_STATUS_SUCCESS ) { \
const char *errStr; \
pfn_hsa_status_string(err, &errStr); \
INFO(NCCL_ALL,"%s:%d ROCr failure '%s'", __FILE__, __LINE__, errStr); \
INFO(NCCL_ALL,"%s:%d HIP failure '%s'", __FILE__, __LINE__, errStr); \
} \
} while(false)
+4 -2
Просмотреть файл
@@ -69,8 +69,10 @@ struct ncclSocket {
const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found);
ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
int* nIfs);
// Initialize a socket
ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0);
+90
Просмотреть файл
@@ -0,0 +1,90 @@
#ifndef NCCL_DEVICE_SYMMETRIC_H_
#define NCCL_DEVICE_SYMMETRIC_H_
#include "nccl.h"
#include "nccl_common.h"
#include "bitops.h"
constexpr int ncclSymMaxBlocks = 64;
constexpr int ncclSymMaxThreads = 512;
constexpr int ncclSymLLMaxEltSize = 64;
// Number of LL slots available for elements of `eltSize` bytes: the byte
// budget ncclSymMaxThreads*ncclSymLLMaxEltSize divided (integer division) by
// the element size. With the default element size this equals
// ncclSymMaxThreads.
constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) {
  return (ncclSymMaxThreads * ncclSymLLMaxEltSize) / eltSize;
}
// Size in bytes of one LL epoch buffer for `nRanks` ranks: twice (the LL
// overhead factor) the larger of
//   - ncclSymMaxThreads * nRanks * 8, and
//   - the slot count at the maximum element size times that element size.
// NOTE(review): the factor 8 is presumably a per-thread, per-rank byte count - confirm.
constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) {
  return /*LL Overhead*/2 * maxval(ncclSymMaxThreads * nRanks * 8,
                                   ncclSymLLMaxSlots(ncclSymLLMaxEltSize) * ncclSymLLMaxEltSize);
}
// Header of the symmetric device region, 16-byte aligned. The fixed arrays
// hold one uint32_t per block (ncclSymMaxBlocks entries each) and are followed
// by the flexible array barInboxPerPeer[].
// size(nRanks) gives the total byte size of the region as the sum of:
//   - this header,
//   - ncclSymMaxBlocks*nRanks uint32_t values, rounded up to 16 bytes
//     (the barInboxPerPeer[] storage, as skipped by ncclSymDevBase_getLLBuf),
//   - for each block, two epochs of ncclSymLLEpochSize(nRanks) bytes.
struct alignas(16) ncclSymDevBase {
uint32_t llEpoch[ncclSymMaxBlocks];
uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks];
uint32_t barInboxMc[ncclSymMaxBlocks];
uint32_t barInboxPerPeer[];
static constexpr size_t size(int nRanks) {
return sizeof(ncclSymDevBase) +
alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) +
ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks);
}
};
// Locate the LL buffer of a given block and epoch inside the region that
// trails the ncclSymDevBase header.
//   base   : start of the header
//   nRanks : number of ranks; determines the inbox and epoch sizes
//   block  : block index; each block owns two consecutive epoch buffers
//   epoch  : only the lowest bit is used, selecting one of the two buffers
// Returns the buffer address as a uint4 pointer.
static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) {
  int const epochBytes = ncclSymLLEpochSize(nRanks);
  // Start right after the header struct.
  char* cursor = (char*)(base + 1);
  // Step over the barInboxPerPeer[] storage.
  cursor += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16);
  // Step to this block's pair of epoch buffers.
  cursor += block * /*epochs=*/2 * epochBytes;
  // Pick the even or the odd epoch buffer.
  cursor += (epoch & 1)*epochBytes;
  return (uint4*)cursor;
}
// Communicator state embedded in ncclSymDevArgs for the symmetric kernels.
// NOTE(review): `baseMc` is presumably the multicast view of the same region
// as `base`, and `stride4G` presumably a per-rank stride in 4 GiB units -
// neither is established in this header; confirm.
struct ncclSymDevComm {
ncclSymDevBase* base;
ncclSymDevBase* baseMc;
uint32_t stride4G;
int nRanks, rank;
uint32_t nRanks_rcp32; // idivRcp32(nRanks)
};
// Argument block for a symmetric kernel, 16-byte aligned: the device
// communicator state followed by the operation parameters.
// NOTE(review): `input`/`output` are presumably the send and receive buffers
// and `nElts` the element count - confirm.
struct alignas(16) ncclSymDevArgs {
struct ncclSymDevComm comm;
int rootRank;
uint64_t redOpArg; // must be collectively uniform
size_t nElts;
char* input;
char* output;
};
// Identifiers of the symmetric kernels. Enumerator names start with the
// collective (AllReduce, AllGather, ReduceScatter) followed by a variant tag.
// Values are consecutive from 0, so ncclSymKernelId_Count, being last, equals
// the number of kernel ids.
// NOTE(review): the variant tags (LL, LD, ST, MC, AG, RS) are not explained in
// this header - see the kernel implementations for their meaning.
enum ncclSymKernelId {
ncclSymKernelId_AllReduce_AGxLL_R,
ncclSymKernelId_AllReduce_AGxLLMC_R,
ncclSymKernelId_AllReduce_RSxLD_AGxST,
ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC,
ncclSymKernelId_AllGather_LL,
ncclSymKernelId_AllGather_LLMC,
ncclSymKernelId_AllGather_ST,
ncclSymKernelId_AllGather_STMC,
ncclSymKernelId_ReduceScatter_LL,
ncclSymKernelId_ReduceScatter_LD,
ncclSymKernelId_ReduceScatter_LDMC,
ncclSymKernelId_Count
};
bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps);
// Generated by src/device/symmetric/generate.py
extern int const ncclSymKernelCount;
extern void* const ncclSymKernelList[];
void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
const char* ncclSymKernelIdToString(int kernelId);
#endif
+22 -1
Просмотреть файл
@@ -23,6 +23,7 @@
#include "proxy.h"
#include "comm.h"
#include "bootstrap.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -37,7 +38,15 @@ struct ncclConnector;
struct ncclComm;
#define CHANNEL_MASK_OFFSET(nranks, connIndex) (nranks * (connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0))
#define CONNECT_SIZE 256
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#define NCCL_MAX_PAGE_SIZE (512L * 1024L)
#define NCCL_REC_PAGE_SIZE (4L * 1024L)
#else
#define NCCL_MAX_PAGE_SIZE (512L * 1024L * 1024L)
#define NCCL_REC_PAGE_SIZE (2L * 1024L * 1024L)
#endif
struct ncclConnect {
char data[CONNECT_SIZE];
};
@@ -65,6 +74,7 @@ struct ncclNvlsSharedRes {
char* ucBuff; // Unicast NVLS buffer address
char* ucCredit; // Unicast NVLS credit address
int nChannels;
int nHeads;
struct ncclShmemCollBuff nvlsShmem;
void *nvlsShmemHandle;
};
@@ -104,7 +114,8 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, bool* needsProxy=NULL);
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode);
ncclResult_t ncclTransportIsAllDirectP2p(struct ncclComm* comm, int* isAllDirectP2p);
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
@@ -139,5 +150,15 @@ ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, siz
ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels);
ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm);
ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr);
ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr);
ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm);
ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr);
ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr);
ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm);
#endif
+6
Просмотреть файл
@@ -44,6 +44,12 @@ static long log2i(long n) {
return log2Down(n);
}
// Comparator function for qsort/bsearch to compare integers
// Three-way comparison of two ints through untyped pointers, with the
// signature expected by qsort/bsearch.
// Returns -1 when *a < *b, 1 when *a > *b, and 0 when they are equal.
// Comparisons are used instead of subtraction, so extreme values cannot overflow.
static int compareInts(const void *a, const void *b) {
  const int lhs = *static_cast<const int*>(a);
  const int rhs = *static_cast<const int*>(b);
  if (lhs < rhs) return -1;
  if (lhs > rhs) return 1;
  return 0;
}
inline uint64_t clockNano() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);