Adding tuning conf file for CU reduction for AR, AG, and RS with under-subscribed number of GPUs per node (#2102)
[ROCm/rccl commit: f0e7e8745f]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
c64c23fbee
Коммит
bed6070e12
@@ -0,0 +1,53 @@
|
||||
# NCCL Tuner Configuration File (CSV Format)
|
||||
# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||
#
|
||||
# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
|
||||
# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
|
||||
# Protocols: ll, ll128, simple
|
||||
# Channels: number of channels to use, or -1 to keep default
|
||||
# nNodes: number of nodes to match, or -1 for any number of nodes
|
||||
# nRanks: number of ranks to match, or -1 for any number of ranks
|
||||
# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
|
||||
# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
|
||||
#
|
||||
# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
|
||||
#
|
||||
# AR (allreduce), 4 PPN: 4 ranks per node (nRanks = 4 * nNodes)
|
||||
allreduce,33554432,4294967296,ring,simple,16,2,8,-1,-1
|
||||
allreduce,33554432,4294967296,ring,simple,16,4,16,-1,-1
|
||||
allreduce,67108864,4294967296,ring,simple,16,8,32,-1,-1
|
||||
# AR (allreduce), 2 PPN: 2 ranks per node (nRanks = 2 * nNodes)
|
||||
allreduce,2097152,4294967296,ring,simple,4,2,4,-1,-1
|
||||
allreduce,16777216,4294967296,ring,simple,4,4,8,-1,-1
|
||||
allreduce,33554432,4294967296,ring,simple,4,8,16,-1,-1
|
||||
# AR (allreduce), 1 PPN: 1 rank per node (nRanks = nNodes)
|
||||
allreduce,134217728,4294967296,ring,simple,4,4,4,-1,-1
|
||||
allreduce,67108864,4294967296,ring,simple,4,8,8,-1,-1
|
||||
|
||||
|
||||
# AG (allgather), 4 PPN: 4 ranks per node (nRanks = 4 * nNodes)
|
||||
allgather,8388608,4294967296,ring,simple,16,2,8,-1,-1
|
||||
allgather,16777216,4294967296,ring,simple,16,4,16,-1,-1
|
||||
allgather,16777216,4294967296,ring,simple,16,8,32,-1,-1
|
||||
# AG (allgather), 2 PPN: 2 ranks per node (nRanks = 2 * nNodes)
|
||||
allgather,262144,4294967296,ring,simple,4,2,4,-1,-1
|
||||
allgather,16777216,4294967296,ring,simple,4,4,8,-1,-1
|
||||
allgather,33554432,4294967296,ring,simple,4,8,16,-1,-1
|
||||
# AG (allgather), 1 PPN: 1 rank per node (nRanks = nNodes)
|
||||
allgather,262144,2097152,ring,simple,4,2,2,-1,-1
|
||||
allgather,262144,8388608,ring,simple,4,4,4,-1,-1
|
||||
allgather,67108864,4294967296,ring,simple,4,8,8,-1,-1
|
||||
|
||||
# RS (reducescatter), 4 PPN: 4 ranks per node (nRanks = 4 * nNodes)
|
||||
reducescatter,1048576,4294967296,ring,simple,16,2,8,-1,-1
|
||||
reducescatter,1048576,4294967296,ring,simple,16,4,16,-1,-1
|
||||
reducescatter,1048576,4294967296,ring,simple,16,8,32,-1,-1
|
||||
# RS (reducescatter), 2 PPN: 2 ranks per node (nRanks = 2 * nNodes)
|
||||
reducescatter,262144,33554432,ring,simple,4,2,4,-1,-1
|
||||
reducescatter,262144,4294967296,ring,simple,4,4,8,-1,-1
|
||||
reducescatter,262144,4294967296,ring,simple,4,8,16,-1,-1
|
||||
# RS (reducescatter), 1 PPN: 1 rank per node (nRanks = nNodes)
|
||||
reducescatter,131072,262144,ring,simple,4,2,2,-1,-1
|
||||
reducescatter,1048576,2097152,ring,simple,4,2,2,-1,-1
|
||||
reducescatter,131072,4194304,ring,simple,4,4,4,-1,-1
|
||||
reducescatter,262144,8388608,ring,simple,4,8,8,-1,-1
|
||||
Ссылка в новой задаче
Block a user