msccl algorithms tuning for allreduce on MI300 (#1088)

[ROCm/rccl commit: 5a0f9990a9]
This commit is contained in:
Pedram Alizadeh
2024-02-21 11:31:56 -05:00
committed by GitHub
parent 3cd03179cb
commit bf48d1bc4d
5 changed files with 11734 additions and 4 deletions
@@ -1,4 +1,4 @@
-<algo name="allreduce_pairs" proto="LL" nchannels="4" nchunksperloop="32" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="0" maxBytes="20480">
+<algo name="allreduce_pairs" proto="LL" nchannels="4" nchunksperloop="32" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="0" maxBytes="25599">
 <gpu id="0" i_chunks="32" o_chunks="0" s_chunks="224">
 <tb id="0" send="-1" recv="-1" chan="0">
 <step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="8" deps="0" hasdep="0"/>
@@ -1,4 +1,4 @@
-<algo name="allreduce_pairs" proto="LL" nchannels="4" nchunksperloop="256" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="20481" maxBytes="81919">
+<algo name="allreduce_pairs" proto="LL" nchannels="4" nchunksperloop="256" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="25600" maxBytes="65536">
 <gpu id="0" i_chunks="256" o_chunks="0" s_chunks="224">
 <tb id="0" send="-1" recv="-1" chan="0">
 <step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="8" deps="1" hasdep="0"/>
@@ -1,4 +1,4 @@
-<algo name="allreduce_pairs" proto="LL" nchannels="8" nchunksperloop="512" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="81920" maxBytes="1048575">
+<algo name="allreduce_pairs" proto="LL" nchannels="8" nchunksperloop="512" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="65537" maxBytes="524287">
 <gpu id="0" i_chunks="512" o_chunks="0" s_chunks="448">
 <tb id="0" send="-1" recv="-1" chan="0">
 <step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="16" deps="1" hasdep="0"/>
@@ -1,4 +1,4 @@
-<algo name="allreduce_pairs" proto="Simple" nchannels="8" nchunksperloop="512" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="1048576" maxBytes="11534336">
+<algo name="allreduce_pairs" proto="Simple" nchannels="8" nchunksperloop="512" ngpus="8" coll="allreduce" inplace="1" outofplace="0" minBytes="524288" maxBytes="11534336">
 <gpu id="0" i_chunks="512" o_chunks="0" s_chunks="448">
 <tb id="0" send="-1" recv="-1" chan="0">
 <step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="16" deps="1" hasdep="0"/>