Use One Slice per Basic Primitive for AllReduce, ReduceScatter, AllGather (#1681) for Single Node on Some GFX9 Systems
Using a single slice instead of the usual two gives roughly a 5% speedup (the exact gain varies) on some GFX9 systems in single-node runs.
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
12517a957e
Коммит
2f6b20c00a
@@ -15,11 +15,17 @@
|
||||
#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
|
||||
|
||||
// CHUNKSIZE must be a multiple of SLICESIZE
|
||||
// RCCL: Single-node benchmarking on MI300X showed improved throughput when always using
|
||||
// a single slice, so we have separate configurations for single node and multi-node. Single node configs
|
||||
// are suffixed with _SINGLE_NODE.
|
||||
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
|
||||
#define ALLREDUCE_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
|
||||
#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
|
||||
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
|
||||
#define ALLGATHER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
|
||||
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
|
||||
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
|
||||
#define REDUCESCATTER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
|
||||
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
|
||||
#define BROADCAST_SLICESTEPS 1
|
||||
#define BROADCAST_CHUNKSTEPS 1
|
||||
@@ -30,6 +36,10 @@
|
||||
#define ALLTOALL_PIVOT_SLICESTEPS 2
|
||||
#define ALLTOALL_PIVOT_CHUNKSTEPS 4
|
||||
|
||||
static_assert(ALLREDUCE_CHUNKSTEPS == ALLREDUCE_SLICESTEPS_SINGLE_NODE, "ALLREDUCE_CHUNKSTEPS must be equal to ALLREDUCE_SLICESTEPS_SINGLE_NODE");
|
||||
static_assert(ALLGATHER_CHUNKSTEPS == ALLGATHER_SLICESTEPS_SINGLE_NODE, "ALLGATHER_CHUNKSTEPS must be equal to ALLGATHER_SLICESTEPS_SINGLE_NODE");
|
||||
static_assert(REDUCESCATTER_CHUNKSTEPS == REDUCESCATTER_SLICESTEPS_SINGLE_NODE, "REDUCESCATTER_CHUNKSTEPS must be equal to REDUCESCATTER_SLICESTEPS_SINGLE_NODE");
|
||||
|
||||
const char* ncclFuncToString(ncclFunc_t op);
|
||||
const char* ncclDevRedOpToString(ncclDevRedOp_t op);
|
||||
const char* ncclDatatypeToString(ncclDataType_t type);
|
||||
|
||||
@@ -481,6 +481,7 @@ struct ncclComm {
|
||||
|
||||
int node;
|
||||
int nNodes;
|
||||
int rcclUseOneSlice; // RCCL: true if this comm is using one slice per primitive
|
||||
int localRank;
|
||||
int localRanks;
|
||||
int maxLocalRanks;
|
||||
|
||||
@@ -288,7 +288,7 @@ struct alignas(16) ncclDevWorkColl {
|
||||
// nChannels == (channelHi - channelLo) + 1
|
||||
uint32_t channelLo:8, channelHi:8;
|
||||
uint32_t nWarps:8;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1;
|
||||
uint32_t root:30, connIndex:2;
|
||||
uint16_t pivotA2ANumBiRings;
|
||||
void* recvbuff;
|
||||
|
||||
Ссылка в новой задаче
Block a user