917 行
29 KiB
C++
917 行
29 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
|
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
|
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_COLLECTIVES_H_
|
|
#define NCCL_COLLECTIVES_H_
|
|
|
|
#include "nccl.h"
|
|
#include "nccl_tuner.h"
|
|
#include "device.h"
|
|
|
|
#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
|
|
|
|
// CHUNKSIZE must be a multiple of SLICESIZE
|
|
// RCCL: Benchmarking on single node for MI300X showed improved throughput for single node always using
|
|
// a single slice, so we have separate configurations for single node and multi-node. Single node configs
|
|
// are suffixed with _SINGLE_NODE.
|
|
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
|
|
#define ALLREDUCE_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
|
|
#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
|
|
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
|
|
#define ALLGATHER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
|
|
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
|
|
#define ALLTOALL_SLICESTEPS 1
|
|
#define ALLTOALL_CHUNKSTEPS 1
|
|
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
|
|
#define REDUCESCATTER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
|
|
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
|
|
#define BROADCAST_SLICESTEPS 1
|
|
#define BROADCAST_CHUNKSTEPS 1
|
|
#define GATHER_SLICESTEPS 1
|
|
#define GATHER_CHUNKSTEPS 1
|
|
#define SCATTER_SLICESTEPS 1
|
|
#define SCATTER_CHUNKSTEPS 1
|
|
#define REDUCE_SLICESTEPS 1
|
|
#define REDUCE_CHUNKSTEPS 1
|
|
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
|
|
#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
|
|
#define ALLTOALL_PIVOT_SLICESTEPS 2
|
|
#define ALLTOALL_PIVOT_CHUNKSTEPS 4
|
|
|
|
static_assert(ALLREDUCE_CHUNKSTEPS == ALLREDUCE_SLICESTEPS_SINGLE_NODE, "ALLREDUCE_CHUNKSTEPS must be equal to ALLREDUCE_SLICESTEPS_SINGLE_NODE");
|
|
static_assert(ALLGATHER_CHUNKSTEPS == ALLGATHER_SLICESTEPS_SINGLE_NODE, "ALLGATHER_CHUNKSTEPS must be equal to ALLGATHER_SLICESTEPS_SINGLE_NODE");
|
|
static_assert(REDUCESCATTER_CHUNKSTEPS == REDUCESCATTER_SLICESTEPS_SINGLE_NODE, "REDUCESCATTER_CHUNKSTEPS must be equal to REDUCESCATTER_SLICESTEPS_SINGLE_NODE");
|
|
|
|
const char* ncclFuncToString(ncclFunc_t op);
|
|
const char* ncclDevRedOpToString(ncclDevRedOp_t op);
|
|
const char* ncclDatatypeToString(ncclDataType_t type);
|
|
const char* ncclAlgoToString(int algo);
|
|
const char* ncclProtoToString(int proto);
|
|
|
|
inline int ncclTypeSize(ncclDataType_t type) {
|
|
switch (type) {
|
|
case ncclInt8:
|
|
case ncclUint8:
|
|
case ncclFloat8e4m3:
|
|
case ncclFloat8e5m2:
|
|
return 1;
|
|
case ncclFloat16:
|
|
case ncclBfloat16:
|
|
return 2;
|
|
case ncclInt32:
|
|
case ncclUint32:
|
|
case ncclFloat32:
|
|
return 4;
|
|
case ncclInt64:
|
|
case ncclUint64:
|
|
case ncclFloat64:
|
|
return 8;
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
#include <sys/types.h>
|
|
|
|
#define NCCL_MODE_NORMAL 0
|
|
#define NCCL_MODE_OFFSET 1
|
|
#define NCCL_MODE_PTR 2
|
|
struct ncclConnFifo {
|
|
int mode;
|
|
int offset;
|
|
ssize_t size;
|
|
void* ptr;
|
|
};
|
|
|
|
#include <stdio.h>
|
|
|
|
class RingAlgorithm {
|
|
protected:
|
|
int refCount;
|
|
int nRanks;
|
|
int nStepsPerLoop;
|
|
int chunkSteps;
|
|
int sliceSteps;
|
|
ssize_t sliceSize;
|
|
ssize_t loopSize;
|
|
ssize_t channelSize;
|
|
uint8_t *sendbuff;
|
|
uint8_t *recvbuff;
|
|
void *sendMhandle;
|
|
void *recvMhandle;
|
|
void *srecvMhandle;
|
|
public:
|
|
// this ring class is used by proxy thread to retrieve the send and recv buffer, size as well as corresponding
|
|
// mem handle based on the current step of the proxy args. The derived ring algo class is AR, AG, and BC which
|
|
// would be allocated during enqueue stage and copied to proxy side through shared memory. For each copy, we will
|
|
// increase the refCount by incRefCount() since the same ring algo object can be referenced multiple times for send
|
|
// and recv progress. After all steps are done, we decrease the refCount and only delete the ring object when
|
|
// refCount == 0.
|
|
virtual void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) = 0;
|
|
virtual void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) = 0;
|
|
int incRefCount() {
|
|
return __atomic_add_fetch(&refCount, 1, __ATOMIC_RELAXED);
|
|
}
|
|
int decRefCount() {
|
|
return __atomic_sub_fetch(&refCount, 1, __ATOMIC_RELEASE);
|
|
}
|
|
RingAlgorithm() { refCount = 0; }
|
|
virtual ~RingAlgorithm() {};
|
|
};
|
|
|
|
class RingARAlgorithm : public RingAlgorithm {
|
|
private:
|
|
int ringIndex;
|
|
int elemSize;
|
|
ssize_t chunkSize;
|
|
int slicePerChunk;
|
|
public:
|
|
void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) {
|
|
int curLoop = curStep / nStepsPerLoop;
|
|
int curLoopStage = (curStep % nStepsPerLoop) / chunkSteps;
|
|
int chunkStage = curLoopStage % nRanks;
|
|
int sliceStage = (curStep % chunkSteps) / sliceSteps;
|
|
ssize_t elemOffset = curLoop * loopSize;
|
|
ssize_t remSize = channelSize - elemOffset;
|
|
ssize_t chunkOffset;
|
|
ssize_t sliceOffset;
|
|
ssize_t curSliceSize;
|
|
ssize_t curChunkSize;
|
|
ssize_t size;
|
|
ssize_t nelem;
|
|
int chunkId;
|
|
|
|
if (remSize < loopSize) {
|
|
curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize;
|
|
} else {
|
|
curChunkSize = chunkSize;
|
|
}
|
|
chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks;
|
|
chunkOffset = chunkId * curChunkSize;
|
|
nelem = std::min(remSize - chunkOffset, curChunkSize);
|
|
curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
|
|
sliceOffset = sliceStage * curSliceSize;
|
|
|
|
if (nelem <= sliceOffset) {
|
|
*sendbuffOut = sendbuff;
|
|
*mhandleOut = sendMhandle;
|
|
} else {
|
|
if (curLoopStage == 0) {
|
|
*sendbuffOut = sendbuff + elemOffset + chunkOffset + sliceOffset;
|
|
*mhandleOut = sendMhandle;
|
|
} else {
|
|
*sendbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset;
|
|
*mhandleOut = srecvMhandle;
|
|
}
|
|
}
|
|
size = std::min(curSliceSize, nelem - sliceOffset);
|
|
*sizeOut = size < 0 ? 0 : size;
|
|
return;
|
|
}
|
|
|
|
void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) {
|
|
int curLoop = curStep / nStepsPerLoop;
|
|
int curLoopStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps;
|
|
int chunkStage = curLoopStage % nRanks;
|
|
int sliceStage = (curStep % chunkSteps) / sliceSteps;
|
|
ssize_t elemOffset = curLoop * loopSize;
|
|
ssize_t remSize = channelSize - elemOffset;
|
|
ssize_t chunkOffset;
|
|
ssize_t sliceOffset;
|
|
ssize_t curSliceSize;
|
|
ssize_t curChunkSize;
|
|
ssize_t size;
|
|
ssize_t nelem;
|
|
int chunkId;
|
|
|
|
if (remSize < loopSize) {
|
|
curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize;
|
|
} else {
|
|
curChunkSize = chunkSize;
|
|
}
|
|
|
|
if (curLoopStage == 0) {
|
|
chunkId = (ringIndex + 1) % nRanks;
|
|
} else {
|
|
chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks;
|
|
}
|
|
|
|
chunkOffset = chunkId * curChunkSize;
|
|
nelem = std::min(remSize - chunkOffset, curChunkSize);
|
|
curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
|
|
sliceOffset = sliceStage * curSliceSize;
|
|
if (nelem <= sliceOffset) {
|
|
*recvbuffOut = recvbuff;
|
|
} else {
|
|
*recvbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset;
|
|
}
|
|
if (sizeOut) {
|
|
size = std::min(curSliceSize, nelem - sliceOffset);
|
|
*sizeOut = size < 0 ? 0 : size;
|
|
}
|
|
*mhandleOut = recvMhandle;
|
|
return;
|
|
}
|
|
|
|
RingARAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int ringIndex, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) {
|
|
this->ringIndex = ringIndex;
|
|
this->nRanks = nRanks;
|
|
this->nStepsPerLoop = 2 * (nRanks - 1) * chunkSteps;
|
|
this->chunkSteps = chunkSteps;
|
|
this->sliceSteps = sliceSteps;
|
|
this->chunkSize = chunkSize;
|
|
this->sliceSize = sliceSize;
|
|
this->loopSize = nRanks * chunkSize;
|
|
this->sendbuff = (uint8_t*)sendbuff + gridOffset;
|
|
this->recvbuff = (uint8_t*)recvbuff + gridOffset;
|
|
this->channelSize = channelSize;
|
|
this->elemSize = elemSize;
|
|
this->sendMhandle = sendMhandle;
|
|
this->recvMhandle = recvMhandle;
|
|
this->srecvMhandle = srecvMhandle;
|
|
this->slicePerChunk = chunkSteps / sliceSteps;
|
|
}
|
|
~RingARAlgorithm() {}
|
|
};
|
|
|
|
class RingAGAlgorithm : public RingAlgorithm {
|
|
private:
|
|
int *ringRanks;
|
|
int elemSize;
|
|
ssize_t sendSize;
|
|
int slicePerChunk;
|
|
public:
|
|
void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) {
|
|
int curLoop = curStep / nStepsPerLoop;
|
|
int chunkStage = (curStep % nStepsPerLoop) / chunkSteps;
|
|
int sliceStage = (curStep % chunkSteps) / sliceSteps;
|
|
ssize_t sliceOffset;
|
|
ssize_t curSliceSize;
|
|
ssize_t offset;
|
|
ssize_t elemOffset = curLoop * loopSize;
|
|
ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset);
|
|
ssize_t size;
|
|
int rankDest;
|
|
uint8_t *buff;
|
|
void *mhandle;
|
|
|
|
curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
|
|
sliceOffset = sliceStage * curSliceSize;
|
|
if (chunkStage == 0) {
|
|
rankDest = ringRanks[0];
|
|
offset = elemOffset + sliceOffset;
|
|
buff = sendbuff + offset;
|
|
mhandle = sendMhandle;
|
|
} else {
|
|
rankDest = ringRanks[nRanks - chunkStage];
|
|
offset = elemOffset + rankDest * sendSize + sliceOffset;
|
|
buff = recvbuff + offset;
|
|
mhandle = srecvMhandle;
|
|
}
|
|
*sendbuffOut = buff;
|
|
size = std::min(curSliceSize, channelSize - elemOffset - sliceOffset);
|
|
*sizeOut = size < 0 ? 0 : size;
|
|
*mhandleOut = mhandle;
|
|
return;
|
|
}
|
|
|
|
void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) {
|
|
int curLoop = curStep / nStepsPerLoop;
|
|
int chunkStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps;
|
|
int sliceStage = (curStep % chunkSteps) / sliceSteps;
|
|
ssize_t sliceOffset;
|
|
ssize_t curSliceSize;
|
|
ssize_t offset;
|
|
ssize_t elemOffset = curLoop * loopSize;
|
|
ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset);
|
|
ssize_t size;
|
|
int rankDest;
|
|
|
|
curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
|
|
sliceOffset = sliceStage * curSliceSize;
|
|
if (chunkStage == 0) {
|
|
rankDest = ringRanks[1];
|
|
} else {
|
|
rankDest = ringRanks[nRanks - chunkStage];
|
|
}
|
|
offset = elemOffset + rankDest * sendSize + sliceOffset;
|
|
*recvbuffOut = recvbuff + offset;
|
|
if (sizeOut) {
|
|
size = std::min(sliceSize, channelSize - elemOffset - sliceOffset);
|
|
*sizeOut = size < 0 ? 0 : size;
|
|
}
|
|
*mhandleOut = recvMhandle;
|
|
}
|
|
|
|
RingAGAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, size_t sendSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) {
|
|
this->ringRanks = ringRanks;
|
|
this->nRanks = nRanks;
|
|
this->nStepsPerLoop = (nRanks - 1) * chunkSteps;
|
|
this->chunkSteps = chunkSteps;
|
|
this->sliceSteps = sliceSteps;
|
|
this->elemSize = elemSize;
|
|
this->sliceSize = sliceSize;
|
|
this->loopSize = chunkSize;
|
|
this->sendSize = sendSize;
|
|
this->channelSize = channelSize;
|
|
this->sendbuff = (uint8_t*)sendbuff + gridOffset;
|
|
this->recvbuff = (uint8_t*)recvbuff + gridOffset;
|
|
this->sendMhandle = sendMhandle;
|
|
this->recvMhandle = recvMhandle;
|
|
this->srecvMhandle = srecvMhandle;
|
|
this->slicePerChunk = chunkSteps / sliceSteps;
|
|
}
|
|
~RingAGAlgorithm() {}
|
|
};
|
|
|
|
class RingBCAlgorithm : public RingAlgorithm {
|
|
private:
|
|
int root;
|
|
int rank;
|
|
int nextRank;
|
|
public:
|
|
void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) {
|
|
int curLoop = curStep / nStepsPerLoop;
|
|
int sliceStage = (curStep % chunkSteps) / sliceSteps;
|
|
ssize_t sliceOffset = sliceStage * sliceSize;
|
|
ssize_t offset;
|
|
ssize_t elemOffset = curLoop * loopSize;
|
|
ssize_t size;
|
|
uint8_t *buff;
|
|
void *mhandle;
|
|
|
|
offset = elemOffset + sliceOffset;
|
|
if (offset >= channelSize) {
|
|
buff = sendbuff;
|
|
mhandle = sendMhandle;
|
|
} else if (rank == root) {
|
|
buff = sendbuff + offset;
|
|
mhandle = sendMhandle;
|
|
} else {
|
|
buff = recvbuff + offset;
|
|
mhandle = srecvMhandle;
|
|
}
|
|
*sendbuffOut = buff;
|
|
size = std::min(sliceSize, channelSize - offset);
|
|
*sizeOut = size < 0 ? 0 : size;
|
|
*mhandleOut = mhandle;
|
|
return;
|
|
}
|
|
|
|
void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) {
|
|
int curLoop = curStep / nStepsPerLoop;
|
|
int sliceStage = (curStep % chunkSteps) / sliceSteps;
|
|
ssize_t sliceOffset = sliceStage * sliceSize;
|
|
ssize_t offset;
|
|
ssize_t elemOffset = curLoop * loopSize;
|
|
ssize_t size;
|
|
offset = elemOffset + sliceOffset;
|
|
if (offset >= channelSize) {
|
|
*recvbuffOut = recvbuff;
|
|
} else {
|
|
*recvbuffOut = recvbuff + offset;
|
|
}
|
|
if (sizeOut) {
|
|
size = std::min(sliceSize, channelSize - offset);
|
|
*sizeOut = size < 0 ? 0 : size;
|
|
}
|
|
*mhandleOut = recvMhandle;
|
|
return;
|
|
}
|
|
|
|
RingBCAlgorithm(const void* sendbuff, void* recvbuff, int rank, int root, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) {
|
|
this->root = root;
|
|
this->rank = rank;
|
|
this->nextRank = ringRanks[1];
|
|
this->nStepsPerLoop = chunkSteps;
|
|
this->chunkSteps = chunkSteps;
|
|
this->sliceSteps = sliceSteps;
|
|
this->sliceSize = sliceSize;
|
|
this->loopSize = chunkSize;
|
|
this->channelSize = channelSize;
|
|
this->sendbuff = (uint8_t*)sendbuff + gridOffset;
|
|
this->recvbuff = (uint8_t*)recvbuff + gridOffset;
|
|
this->sendMhandle = sendMhandle;
|
|
this->recvMhandle = recvMhandle;
|
|
this->srecvMhandle = srecvMhandle;
|
|
}
|
|
~RingBCAlgorithm() {}
|
|
};
|
|
|
|
#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
|
// #include <cuda/atomic>
|
|
#endif
|
|
|
|
// Need a power of two to ensure it divides by parallelFactor (which is also a power of two)
|
|
#define NCCL_PAT_NWORKERS 128
|
|
|
|
static constexpr int PatUsed = 0x1,
|
|
PatSkipped = 0x2;
|
|
|
|
struct ncclPatStep {
|
|
int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags;
|
|
size_t inpIx, outIx;
|
|
};
|
|
|
|
struct ncclPatPeer {
|
|
uint64_t step;
|
|
struct ncclConnInfo* conn;
|
|
struct ncclConnFifo* connFifo;
|
|
void* buff;
|
|
uint64_t *headPtr;
|
|
uint64_t *tailPtr;
|
|
uint64_t stepCache;
|
|
long long int accSize;
|
|
int connStepSize;
|
|
};
|
|
|
|
#define NCCL_SHMEM_PAT_STEPS 32
|
|
struct ncclPatShmem {
|
|
struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS];
|
|
int parallelFactor;
|
|
long long int localAccSize;
|
|
struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks
|
|
struct ncclPatPeer recvDims[32];
|
|
};
|
|
|
|
template<typename T>
|
|
class PatRSAlgorithm{
|
|
size_t offset;
|
|
size_t end;
|
|
size_t count;
|
|
int chunkCount;
|
|
int nelem;
|
|
int rank;
|
|
int nranks;
|
|
int nrPow2;
|
|
int postFreq;
|
|
int lastA;
|
|
int parallelFactor;
|
|
int aggFactor;
|
|
int as; // aggregated steps
|
|
int a; // step inside aggregated step
|
|
int sendSkipped; // number of skipped steps during aggregation
|
|
int stepOffset;
|
|
int aggDelta;
|
|
int scale;
|
|
int phase;
|
|
|
|
__device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
|
|
return (a<b)?a:b;
|
|
}
|
|
|
|
__device__ __host__ int getNelem() {
|
|
return min(chunkCount, end-offset);
|
|
}
|
|
|
|
__device__ __host__ int mirrorInvert(int i, int max) {
|
|
int ret = 0;
|
|
for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
|
|
if ((i&mask) == 0) ret += imask;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
__device__ __host__ int firstBitSet(int i, int max) {
|
|
int ffs =
|
|
#ifdef __CUDA_ARCH__
|
|
__ffs(i);
|
|
#else
|
|
__builtin_ffs(i);
|
|
#endif
|
|
return ffs ? ffs-1 : max;
|
|
}
|
|
|
|
__device__ __host__ void resetA() {
|
|
a = 0;
|
|
sendSkipped = stepOffset = 0;
|
|
lastA = aggFactor;
|
|
if (phase >= 2) lastA /= 2*scale;
|
|
if (phase == 4) lastA = 1;
|
|
}
|
|
|
|
__device__ __host__ void reset() {
|
|
nelem = getNelem();
|
|
phase = 0;
|
|
scale = 1;
|
|
as = aggDelta - 1;
|
|
resetA();
|
|
}
|
|
|
|
__device__ __host__ int nBitsSet(int i) {
|
|
int nbits =
|
|
#ifdef __CUDA_ARCH__
|
|
__popc(i);
|
|
#else
|
|
__builtin_popcount(i);
|
|
#endif
|
|
return nbits;
|
|
}
|
|
|
|
// Return 1 when only upper bits are set. For example, if nrpow2==16 we'll return 1 for 8, 12, 14, 15.
|
|
// A number being in the form of 1111000 implies that the complementary is 0000111 meaning it's a power of 2 minus 1.
|
|
__device__ __host__ int newPeer(int i, int pow2) {
|
|
//printf("New peer %d/%d -> %d\n", i, pow2, nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0);
|
|
return nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0;
|
|
}
|
|
|
|
public:
|
|
__device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
|
offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
|
|
parallelFactor = maxParallelFactor;
|
|
aggDelta = nrPow2 = (1<<log2Up(nranks));
|
|
|
|
aggFactor = 1;
|
|
size_t channelSize = end-offset;
|
|
while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
|
|
aggFactor *= 2;
|
|
aggDelta /= 2;
|
|
}
|
|
postFreq = aggFactor;
|
|
if (postFreq < parallelFactor) parallelFactor = postFreq;
|
|
int d = stepDepth;
|
|
while (d > 1 && aggFactor < nranks/2) {
|
|
d /= 2;
|
|
aggFactor *= 2;
|
|
aggDelta /= 2;
|
|
}
|
|
|
|
reset();
|
|
}
|
|
|
|
__device__ __host__ int getParallelFactor() {
|
|
return parallelFactor;
|
|
}
|
|
|
|
__device__ __host__ void getNextOp(struct ncclPatStep* ps) {
|
|
ps->last = 0;
|
|
ps->nelem = nelem;
|
|
ps->outIx = offset;
|
|
ps->stepOffset = stepOffset;
|
|
int skip = 0;
|
|
if (a >= lastA) {
|
|
skip = 1;
|
|
} else if (phase == 0) {
|
|
int s = mirrorInvert(a, lastA)*aggDelta + as;
|
|
if (s >= nranks) skip = 1;
|
|
int sendDataRank = (rank + s) % nranks;
|
|
ps->inpIx = sendDataRank * count + offset;
|
|
ps->recvDim = -1;
|
|
ps->sendDim = 0;
|
|
ps->outIx = 0;
|
|
ps->recvOffset = -1;
|
|
ps->sendOffset = (a%postFreq) * nelem;
|
|
if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
|
ps->postSend = 1;
|
|
} else {
|
|
ps->postSend = 0;
|
|
}
|
|
ps->postRecv = 0;
|
|
} else if (phase == 1) {
|
|
int s = mirrorInvert(a, lastA)*aggDelta + as;
|
|
if (s >= nranks) skip = 1;
|
|
ps->recvDim = firstBitSet(s, nrPow2);
|
|
ps->sendOffset = (a%postFreq)*nelem;
|
|
ps->recvOffset = (a%postFreq)*nelem;
|
|
ps->postSend = 0;
|
|
if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1;
|
|
if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
|
ps->postRecv = 1;
|
|
} else {
|
|
ps->postRecv = 0;
|
|
}
|
|
s -= (1<<ps->recvDim);
|
|
int recvDataRank = (rank + nranks + s) % nranks;
|
|
ps->inpIx = recvDataRank * count + offset;
|
|
ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
|
if (ps->sendDim == -1) {
|
|
ps->sendOffset = -1;
|
|
} else if (as - (1<<ps->recvDim) == 0) {
|
|
if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; }
|
|
int foffset = a - sendSkipped;
|
|
ps->sendOffset = (foffset%postFreq)*nelem;
|
|
}
|
|
int recvDim = ps->recvDim;
|
|
if (s < nranks && skip) {
|
|
ps->recvDim = -1;
|
|
ps->recvOffset = -1;
|
|
ps->postRecv = 0;
|
|
skip = 0;
|
|
}
|
|
if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++;
|
|
} else if (phase == 2) {
|
|
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1;
|
|
ps->postRecv = 0;
|
|
if (s >= nranks) skip = 1;
|
|
ps->recvDim = 0;
|
|
ps->postSend = a == lastA-1 ? 1 : 0;
|
|
s -= 1;
|
|
if (s < nranks && skip) {
|
|
ps->recvDim = -1;
|
|
ps->recvOffset = -1;
|
|
skip = 0;
|
|
} else if (!skip) {
|
|
int foffset = a + aggFactor - aggFactor/scale;
|
|
ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
|
ps->recvOffset = (foffset%postFreq) * nelem;
|
|
}
|
|
int recvDataRank = (rank + nranks + s) % nranks;
|
|
ps->inpIx = recvDataRank * count + offset;
|
|
ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
|
int foffset = a;
|
|
ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
|
ps->sendOffset = (foffset%postFreq) * nelem;
|
|
} else if (phase == 3) {
|
|
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta;
|
|
ps->postRecv = a == lastA-1 ? 1 : 0;
|
|
if (s >= nranks) skip = 1;
|
|
ps->recvDim = firstBitSet(s, nrPow2);
|
|
ps->postSend = 0;
|
|
s -= (1<<ps->recvDim);
|
|
int foffset = a;
|
|
ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
|
|
ps->recvOffset = (foffset%postFreq) * nelem;
|
|
int recvDataRank = (rank + nranks + s) % nranks;
|
|
ps->inpIx = recvDataRank * count + offset;
|
|
ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
|
if (s < nranks && skip) {
|
|
ps->recvDim = -1;
|
|
ps->recvOffset = -1;
|
|
ps->postRecv = 0;
|
|
skip = 0;
|
|
}
|
|
if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; }
|
|
foffset = a - sendSkipped;
|
|
if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++;
|
|
ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
|
|
} else if (phase == 4) {
|
|
ps->recvDim = 0;
|
|
ps->sendDim = -1;
|
|
ps->inpIx = rank * count + offset;
|
|
ps->recvOffset = ((aggFactor-1)%postFreq) * nelem;
|
|
ps->sendOffset = -1;
|
|
ps->postRecv = 1;
|
|
ps->postSend = 0;
|
|
offset += chunkCount;
|
|
}
|
|
a++;
|
|
if (a >= lastA && a >= parallelFactor) {
|
|
int p = phase;
|
|
if (p == 1) as--;
|
|
if (p == 3) scale *= 2;
|
|
phase =
|
|
p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 :
|
|
p == 1 ? as % 2 == 1 ? 0 : 1 :
|
|
p == 2 ? 3 :
|
|
p == 3 ? scale < aggFactor ? 2 : 4 :
|
|
5;
|
|
if (p == 4) {
|
|
if (offset >= end) {
|
|
ps->last = 2;
|
|
} else {
|
|
reset();
|
|
}
|
|
} else {
|
|
resetA();
|
|
}
|
|
} else if (phase == 4 && offset >= end) {
|
|
ps->last = 1;
|
|
}
|
|
int flags = PatUsed | (skip ? PatSkipped : 0);
|
|
#if __CUDA_ARCH__ >= 600
|
|
cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags);
|
|
a.store(flags, cuda::memory_order_release);
|
|
#else
|
|
ps->flags = flags;
|
|
#endif
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
class PatAGAlgorithm{
|
|
size_t offset;
|
|
size_t end;
|
|
size_t count;
|
|
int chunkCount;
|
|
int nelem;
|
|
int rank;
|
|
int nranks;
|
|
int nrPow2;
|
|
int postFreq;
|
|
int lastA;
|
|
int parallelFactor;
|
|
int aggFactor;
|
|
int as; // aggregated steps
|
|
int a; // step inside aggregated step
|
|
int aggDelta;
|
|
int scale;
|
|
int phase;
|
|
|
|
// AS computation
|
|
int asDim;
|
|
int v;
|
|
int bitCount[32];
|
|
int bitZeroStep[32];
|
|
|
|
__device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
|
|
return (a<b)?a:b;
|
|
}
|
|
|
|
__device__ __host__ int getNelem() {
|
|
return min(chunkCount, end-offset);
|
|
}
|
|
|
|
__device__ __host__ int mirror(int i, int max) {
|
|
int ret = 0;
|
|
for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
|
|
if ((i&mask)) ret += imask;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
__device__ __host__ int firstBitSet(int i, int max) {
|
|
int ffs =
|
|
#ifdef __CUDA_ARCH__
|
|
__ffs(i);
|
|
#else
|
|
__builtin_ffs(i);
|
|
#endif
|
|
return ffs ? ffs-1 : max;
|
|
}
|
|
|
|
__device__ __host__ void resetA() {
|
|
a = 0;
|
|
lastA = aggFactor;
|
|
if (phase >= 2) lastA /= 2*scale;
|
|
}
|
|
|
|
__device__ __host__ void reset() {
|
|
nelem = getNelem();
|
|
scale = aggFactor/2;
|
|
phase = scale ? 2 : 1;
|
|
v = 0;
|
|
for (int i = 0; i<asDim; i++) {
|
|
bitCount[i] = asDim-i;
|
|
bitZeroStep[i] = 1;
|
|
}
|
|
as = nextAs();
|
|
resetA();
|
|
}
|
|
|
|
__device__ __host__ int nextAs() {
|
|
for (int d=0; d<asDim; d++) {
|
|
int p = 1<<d;
|
|
bitCount[d]--;
|
|
if (bitCount[d] == 0) {
|
|
v ^= p;
|
|
bitCount[d] = p;
|
|
if ((v&p) == 0) {
|
|
bitCount[d] += firstBitSet(bitZeroStep[d], asDim) - 1;
|
|
if (bitCount[d] == 0) {
|
|
v ^= p;
|
|
bitCount[d] = p;
|
|
}
|
|
bitZeroStep[d]++;
|
|
}
|
|
}
|
|
}
|
|
return v;
|
|
}
|
|
|
|
|
|
public:
|
|
__device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
|
offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
|
|
parallelFactor = maxParallelFactor;
|
|
aggDelta = nrPow2 = (1<<log2Up(nranks));
|
|
|
|
aggFactor = 1;
|
|
size_t channelSize = end-offset;
|
|
while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
|
|
aggFactor *= 2;
|
|
aggDelta /= 2;
|
|
}
|
|
postFreq = aggFactor;
|
|
if (postFreq < parallelFactor) parallelFactor = postFreq;
|
|
int d = stepDepth;
|
|
while (d > 1 && aggFactor < nranks/2) {
|
|
d /= 2;
|
|
aggFactor *= 2;
|
|
aggDelta /= 2;
|
|
}
|
|
|
|
asDim = log2Up(aggDelta);
|
|
reset();
|
|
}
|
|
|
|
__device__ __host__ int getParallelFactor() {
|
|
return parallelFactor;
|
|
}
|
|
|
|
__device__ __host__ void getNextOp(struct ncclPatStep* ps) {
|
|
ps->last = 0;
|
|
ps->nelem = nelem;
|
|
ps->inpIx = offset;
|
|
int skip = 0;
|
|
if (a >= lastA) {
|
|
skip = 1;
|
|
} else if (phase == 0) {
|
|
int s = a*aggDelta + as;
|
|
if (s >= nranks) skip = 1;
|
|
int recvDataRank = (rank + s) % nranks;
|
|
ps->outIx = recvDataRank * count + offset;
|
|
ps->sendDim = -1;
|
|
ps->recvDim = 0;
|
|
ps->inpIx = 0;
|
|
ps->sendOffset = -1;
|
|
ps->recvOffset = (a % postFreq) * nelem;
|
|
ps->stepOffset = 0;
|
|
ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
|
ps->postSend = 0;
|
|
} else if (phase == 1) {
|
|
int s = a*aggDelta + as;
|
|
if (s >= nranks) skip = 1;
|
|
ps->sendDim = firstBitSet(s, nrPow2);
|
|
s -= (1<<ps->sendDim);
|
|
int sendDataRank = (rank + nranks + s) % nranks;
|
|
ps->outIx = sendDataRank * count + offset;
|
|
ps->recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
|
ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem;
|
|
ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
|
ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
|
|
ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq;
|
|
if (ps->recvDim == -1) {
|
|
ps->recvOffset = -1;
|
|
ps->postRecv = 0;
|
|
} else if (as - (1<<ps->sendDim) == 0) {
|
|
int foffset = (a*aggDelta) >> (ps->recvDim+1);
|
|
ps->recvOffset = (foffset%postFreq)*nelem;
|
|
ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<ps->recvDim) >= nranks) ? 1 : 0;
|
|
ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq;
|
|
}
|
|
if (s < nranks && ps->sendDim == 0 && skip) {
|
|
// Don't forget to receive at least once even if we don't send afterwards
|
|
ps->sendDim = -1;
|
|
ps->sendOffset = -1;
|
|
ps->postSend = 0;
|
|
skip = 0;
|
|
}
|
|
} else if (phase == 2) {
|
|
int s = (2*a+1)*scale*aggDelta;
|
|
ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
|
|
ps->postRecv = 0;
|
|
if (s >= nranks) skip = 1;
|
|
ps->sendDim = firstBitSet(s, nrPow2);
|
|
s -= (1<<ps->sendDim);
|
|
ps->sendOffset = (a%postFreq) * nelem;
|
|
ps->stepOffset = a / postFreq;
|
|
int sendDataRank = (rank + nranks + s) % nranks;
|
|
ps->outIx = sendDataRank * count + offset;
|
|
ps->recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
|
if (ps->recvDim == -1) {
|
|
ps->recvOffset = -1;
|
|
} else {
|
|
s -= (1<<ps->recvDim);
|
|
int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1);
|
|
ps->recvOffset = (foffset%postFreq)*nelem;
|
|
ps->stepOffset = foffset / postFreq;
|
|
}
|
|
}
|
|
a++;
|
|
if (a >= lastA && a >= parallelFactor) {
|
|
int p = phase;
|
|
if (p == 2) scale /= 2;
|
|
phase =
|
|
p == 2 ? scale ? 2 : 1 :
|
|
p == 1 ? as % 2 == 1 ? 0 : 1 :
|
|
1;
|
|
if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs();
|
|
if (p == 0 && as == aggDelta/2) {
|
|
offset += chunkCount;
|
|
if (offset >= end) {
|
|
ps->last = 2;
|
|
} else {
|
|
reset();
|
|
}
|
|
} else {
|
|
resetA();
|
|
}
|
|
} else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) {
|
|
ps->last = 1;
|
|
}
|
|
int flags = PatUsed | (skip ? PatSkipped : 0);
|
|
#if __CUDA_ARCH__ >= 600
|
|
cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags);
|
|
a.store(flags, cuda::memory_order_release);
|
|
#else
|
|
ps->flags = flags;
|
|
#endif
|
|
}
|
|
};
|
|
#endif
|