Optimize NCHANNELS and MSCCL config for gfx942 80CUs (#1195)

* Optimize NCHANNELS and MSCCL config for gfx942 80CUs

Set the channel configuration (the equivalent of NCCL_MIN_NCHANNELS) and the MSCCL
config appropriately, potentially improving communication performance on the MI300X with 80 CUs.

* Delete tools/msccl-algorithms/allreduce_1step_mccl_8_2_16777216_LL.xml

* Change the factor of gfx94 and update msccl config

[ROCm/rccl commit: cab25f919e]
This commit is contained in:
ClementLinCF
2024-06-01 22:07:46 +08:00
committed by GitHub
parent 7ca67f1cb9
commit 4f56aa5f8c
2 changed files with 319 additions and 1 deletions
+13 -1
View File
@@ -1390,6 +1390,18 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
if (ringGraph.nChannels > MAXCHANNELS/2)
allGather3Data[rank].nc = 1;
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) {
if (nranks == 2)
// NCCL_MIN_NCHANNELS=32
allGather3Data[rank].nc = 16;
else if (nranks == 4)
// NCCL_MIN_NCHANNELS=24
allGather3Data[rank].nc = 6;
else if (nranks == 8)
// NCCL_MIN_NCHANNELS=56
allGather3Data[rank].nc = 2;
}
allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable();
allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled;
@@ -2788,4 +2800,4 @@ exit:
return ret;
fail:
goto exit;
}
}
@@ -0,0 +1,306 @@
<algo name="allreduce_pairs" proto="LL" nchannels="2" nchunksperloop="8" ngpus="4" coll="allreduce" inplace="1" outofplace="0" minBytes="2048" maxBytes="13312">
<gpu id="0" i_chunks="8" o_chunks="0" s_chunks="24">
<tb id="0" send="-1" recv="-1" chan="0">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="0" dstbuf="i" dstoff="0" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="4" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="8" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="-1" chan="1">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="12" dstbuf="i" dstoff="4" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="16" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="20" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="1" recv="1" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="1" dstbuf="i" dstoff="1" cnt="1" depid="4" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="5" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="9" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="1" recv="1" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="13" dstbuf="i" dstoff="5" cnt="1" depid="5" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="17" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="21" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="2" recv="2" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="2" dstbuf="i" dstoff="2" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="6" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="10" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="2" recv="2" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="14" dstbuf="i" dstoff="6" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="18" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="22" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="3" recv="3" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="3" dstbuf="i" dstoff="3" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="7" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="11" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="3" recv="3" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="15" dstbuf="i" dstoff="7" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="19" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="23" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="1" i_chunks="8" o_chunks="0" s_chunks="24">
<tb id="0" send="-1" recv="-1" chan="0">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="1" dstbuf="i" dstoff="1" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="5" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="9" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="-1" chan="1">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="13" dstbuf="i" dstoff="5" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="17" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="21" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="0" recv="0" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="0" dstbuf="i" dstoff="0" cnt="1" depid="4" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="4" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="8" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="0" recv="0" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="12" dstbuf="i" dstoff="4" cnt="1" depid="5" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="16" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="20" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="2" recv="2" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="2" dstbuf="i" dstoff="2" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="6" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="10" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="2" recv="2" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="14" dstbuf="i" dstoff="6" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="18" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="22" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="3" recv="3" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="3" dstbuf="i" dstoff="3" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="7" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="11" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="3" recv="3" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="15" dstbuf="i" dstoff="7" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="19" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="23" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="2" i_chunks="8" o_chunks="0" s_chunks="24">
<tb id="0" send="-1" recv="-1" chan="0">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="2" dstbuf="i" dstoff="2" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="6" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="10" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="-1" chan="1">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="14" dstbuf="i" dstoff="6" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="18" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="22" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="0" recv="0" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="0" dstbuf="i" dstoff="0" cnt="1" depid="6" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="4" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="8" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="0" recv="0" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="12" dstbuf="i" dstoff="4" cnt="1" depid="7" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="16" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="20" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="1" recv="1" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="1" dstbuf="i" dstoff="1" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="5" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="9" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="1" recv="1" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="13" dstbuf="i" dstoff="5" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="17" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="21" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="3" recv="3" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="3" dstbuf="i" dstoff="3" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="7" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="11" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="3" recv="3" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="15" dstbuf="i" dstoff="7" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="19" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="23" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="3" i_chunks="8" o_chunks="0" s_chunks="24">
<tb id="0" send="-1" recv="-1" chan="0">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="3" dstbuf="i" dstoff="3" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="7" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="11" dstbuf="i" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="-1" chan="1">
<step s="0" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="1" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="15" dstbuf="i" dstoff="7" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="19" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="23" dstbuf="i" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="0" recv="0" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="0" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="0" dstbuf="i" dstoff="0" cnt="1" depid="4" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="4" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="8" dstbuf="i" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="0" recv="0" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="0"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="12" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="12" dstbuf="i" dstoff="4" cnt="1" depid="5" deps="0" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="16" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="7" type="re" srcbuf="s" srcoff="20" dstbuf="i" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="1" recv="1" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="4" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="6" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="1" dstbuf="i" dstoff="1" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="5" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="9" dstbuf="i" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="1" recv="1" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="16" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="7" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="13" dstbuf="i" dstoff="5" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="17" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="21" dstbuf="i" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="2" recv="2" chan="0">
<step s="0" type="s" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="0" dstbuf="s" dstoff="8" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="4" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="2" dstbuf="i" dstoff="2" cnt="1" depid="2" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="6" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="10" dstbuf="i" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="2" recv="2" chan="1">
<step s="0" type="s" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="1" type="r" srcbuf="i" srcoff="4" dstbuf="s" dstoff="20" cnt="4" depid="-1" deps="-1" hasdep="1"/>
<step s="2" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="0" hasdep="0"/>
<step s="3" type="nop" srcbuf="i" srcoff="-1" dstbuf="o" dstoff="-1" cnt="0" depid="5" deps="1" hasdep="0"/>
<step s="4" type="re" srcbuf="s" srcoff="14" dstbuf="i" dstoff="6" cnt="1" depid="3" deps="1" hasdep="0"/>
<step s="5" type="re" srcbuf="s" srcoff="18" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
<step s="6" type="re" srcbuf="s" srcoff="22" dstbuf="i" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
</algo>