Use One Slice per Basic Primitive for AllReduce, ReduceScatter, AllGather (#1681) for Single Node on Some GFX9 Systems

Using a single slice per primitive, rather than the typical two, provides roughly a 5% speedup (the exact gain varies) on some GFX9 systems in single-node runs.
Этот коммит содержится в:
alex-breslow-amd
2025-05-29 16:17:35 -07:00
коммит произвёл GitHub
родитель 12517a957e
Коммит 2f6b20c00a
9 изменённых файлов: 66 добавлений и 10 удалений
+10
Просмотреть файл
@@ -15,11 +15,17 @@
// 1024*1024*1024 = 2^30, written as a long constant.
#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
// Per-collective slice/chunk step counts, expressed as fractions of NCCL_STEPS.
// CHUNKSIZE must be a multiple of SLICESIZE.
// RCCL: single-node benchmarks on MI300X showed higher throughput when a single slice is always used,
// so single-node and multi-node runs get separate configurations. Single-node settings carry the
// _SINGLE_NODE suffix. Each one has the same value as the matching *_CHUNKSTEPS macro (NCCL_STEPS/2),
// while the default *_SLICESTEPS value is NCCL_STEPS/4.
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
#define ALLREDUCE_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
#define ALLGATHER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
#define REDUCESCATTER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
// Broadcast uses fixed step counts and has no _SINGLE_NODE variant in this section.
#define BROADCAST_SLICESTEPS 1
#define BROADCAST_CHUNKSTEPS 1
@@ -30,6 +36,10 @@
// Fixed slice/chunk step counts for the ALLTOALL_PIVOT configuration.
#define ALLTOALL_PIVOT_SLICESTEPS 2
#define ALLTOALL_PIVOT_CHUNKSTEPS 4
// Compile-time checks: for AllReduce, AllGather and ReduceScatter, the single-node slice step count
// must equal the chunk step count. Changing one of the two macros without the other fails the build.
static_assert(ALLREDUCE_CHUNKSTEPS == ALLREDUCE_SLICESTEPS_SINGLE_NODE, "ALLREDUCE_CHUNKSTEPS must be equal to ALLREDUCE_SLICESTEPS_SINGLE_NODE");
static_assert(ALLGATHER_CHUNKSTEPS == ALLGATHER_SLICESTEPS_SINGLE_NODE, "ALLGATHER_CHUNKSTEPS must be equal to ALLGATHER_SLICESTEPS_SINGLE_NODE");
static_assert(REDUCESCATTER_CHUNKSTEPS == REDUCESCATTER_SLICESTEPS_SINGLE_NODE, "REDUCESCATTER_CHUNKSTEPS must be equal to REDUCESCATTER_SLICESTEPS_SINGLE_NODE");
// Declarations only: each takes one enum-like value and returns a C string.
// NOTE(review): presumably the human-readable name of the value; confirm against the definitions.
const char* ncclFuncToString(ncclFunc_t op);
const char* ncclDevRedOpToString(ncclDevRedOp_t op);
const char* ncclDatatypeToString(ncclDataType_t type);
+1
Просмотреть файл
@@ -481,6 +481,7 @@ struct ncclComm {
int node;
int nNodes;
int rcclUseOneSlice; // RCCL: true if this comm is using one slice per primitive
int localRank;
int localRanks;
int maxLocalRanks;
+1 -1
Просмотреть файл
@@ -288,7 +288,7 @@ struct alignas(16) ncclDevWorkColl {
// nChannels == (channelHi - channelLo) + 1
uint32_t channelLo:8, channelHi:8;
uint32_t nWarps:8;
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1;
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1;
uint32_t root:30, connIndex:2;
uint16_t pivotA2ANumBiRings;
void* recvbuff;