Move MSCCL algorithm loading to initialization to work around HIP graph conflict (#982)

* MSCCL: pre-specify channels and pre-load algorithms

* add mutex

* fix bug

* clean include

* disable all-gathers temporarily

[ROCm/rccl commit: 4bb0b4a380]
This commit is contained in:
Ziyue Yang
2023-12-01 01:47:20 +08:00
committed by GitHub
parent 5efe13655d
commit f0c47d085e
12 changed files with 85 additions and 4504 deletions
-8
View File
@@ -56,14 +56,6 @@ ncclResult_t mscclRunAlgo(
NCCLCHECK(mscclSetupSyncFlags(stream));
if (status.connectedAlgos[comm].find(mscclAlgoHandle) == status.connectedAlgos[comm].end()) {
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
NCCLCHECK(mscclSetupConnections(hostAlgo, comm));
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
status.connectedAlgos[comm].insert(mscclAlgoHandle);
}
NCCLCHECK(mscclSetupProxy(hostAlgo, comm, stream));
NCCLCHECK(mscclSetupKernel(sendBuff, recvBuff, count, dataType, op, hostAlgo, devAlgo, comm, stream));
+11 -2
View File
@@ -11,6 +11,8 @@
#include "rings.h"
#include "topo.h"
#include "msccl/msccl_lifecycle.h"
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
@@ -576,15 +578,22 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
int minNchannels = ncclMinNchannels();
if (mscclEnabled()) {
int mscclNumChannelsRequired = 0;
mscclSchedulerInit(comm, &mscclNumChannelsRequired);
minNchannels = std::max(minNchannels, mscclNumChannelsRequired);
}
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
if (comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(minNchannels, std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
} else {
nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(minNchannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
}
// Create rings array and check all is fine
@@ -18,6 +18,8 @@ bool mscclIsCaller();
bool mscclAvailable();
ncclResult_t mscclSchedulerInit(ncclComm_t comm, int* numChannelsRequired);
ncclResult_t mscclInit(ncclComm_t comm);
ncclResult_t mscclGroupStart();
@@ -91,6 +91,8 @@ struct mscclAlgoMeta {
std::string filePath;
// number of chunks of input/output in each MSCCL algorithm loop
int nChunksPerLoop;
// number of channels needed by MSCCL algorithm
int nChannels;
// number of ranks required by this algorithm
int nRanks;
// needs to be multiplied by nRanks for all-gather, reduce-scatter and all-to-all
+64 -19
View File
@@ -26,7 +26,6 @@ RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";
static std::atomic<bool> mscclInitialized;
static bool mscclSchedulerTriedLoadAlgo = false;
static std::mutex mscclLifecycleMutex;
bool mscclEnabled() {
@@ -78,8 +77,21 @@ static const char* mscclUnitTestAlgoDefaultDir = "msccl-unit-test-algorithms";
static const char* mscclAlgoShareDirPath = "../share/rccl/msccl-algorithms";
static const char* mscclUnitTestAlgoShareDirPath = "../share/rccl/msccl-unit-test-algorithms";
static ncclResult_t mscclInternalSchedulerInit() {
static ncclResult_t mscclInternalSchedulerInit(ncclComm_t comm, int* numChannelsRequired) {
static bool mscclAlgoMetaLoaded = false;
mscclStatus& status = mscclGetStatus();
*numChannelsRequired = 0;
// Query numChannelsRequired from loaded algorithm metas
if (mscclAlgoMetaLoaded) {
for (auto& m : status.algoMetas) {
if (comm->nRanks == m.nRanks) {
*numChannelsRequired = std::max(*numChannelsRequired, m.nChannels);
}
}
return ncclSuccess;
}
const char* mscclAlgoDir = getenv(mscclAlgoDirEnv);
const char* mscclAlgoShareDir = nullptr;
std::string mscclAlgoDirStr;
@@ -117,6 +129,7 @@ static ncclResult_t mscclInternalSchedulerInit() {
fullDirPath = mscclAlgoDir;
}
INFO(NCCL_INIT, "Using MSCCL files from %s", fullDirPath);
while ((entry = readdir(dp))) {
if (entry->d_type != DT_LNK && entry->d_type != DT_REG) {
continue;
@@ -126,16 +139,28 @@ static ncclResult_t mscclInternalSchedulerInit() {
fullPath += "/";
fullPath += entry->d_name;
NCCLCHECK(mscclGetAlgoMetaFromXmlFile(fullPath.c_str(), &(status.algoMetas.back())));
if (status.algoMetas.back().nRanks == comm->nRanks) {
*numChannelsRequired = std::max(*numChannelsRequired, status.algoMetas.back().nChannels);
}
}
if (closedir(dp)) {
WARN("MSCCL Internal Scheduler: closedir failed, error %d", errno);
return ncclInvalidUsage;
}
status.rankToAlgoHandles.resize(status.algoMetas.size());
mscclAlgoMetaLoaded = true;
return ncclSuccess;
}
static ncclResult_t mscclSchedulerInit() {
ncclResult_t mscclSchedulerInit(ncclComm_t comm, int* numChannelsRequired) {
*numChannelsRequired = 0;
comm->mscclCompatible = mscclCommCompatible(comm);
if (!comm->mscclCompatible) {
return ncclSuccess;
}
std::lock_guard<std::mutex> lock(mscclLifecycleMutex);
mscclStatus& status = mscclGetStatus();
bool useInternalScheduler = false;
@@ -155,11 +180,14 @@ static ncclResult_t mscclSchedulerInit() {
useInternalScheduler = true;
}
}
if (useInternalScheduler) {
NCCLCHECK(mscclInternalSchedulerInit());
NCCLCHECK(mscclInternalSchedulerInit(comm, numChannelsRequired));
} else {
NCCLCHECK(status.mscclSchedulerPtr->init());
*numChannelsRequired = MAXCHANNELS;
}
return ncclSuccess;
}
@@ -170,30 +198,53 @@ ncclResult_t mscclInit(ncclComm_t comm) {
threadLocalStatus.groupDepth = 0;
threadLocalStatus.captureId = ULLONG_MAX;
threadLocalStatus.captureStatus = mscclNoCapture;
comm->mscclCompatible = mscclCommCompatible(comm);
{
std::lock_guard<std::mutex> lock(mscclLifecycleMutex);
mscclStatus& status = mscclGetStatus();
// Free algorithm handles are initialized globally once and before algorithm pre-processing
if (!mscclInitialized.load(std::memory_order_acquire)) {
status.freeAlgoHandles.resize(MSCCL_MAX_NUM_ALGOS);
for (int i = 0; i < MSCCL_MAX_NUM_ALGOS; i++) {
status.freeAlgoHandles[i] = MSCCL_MAX_NUM_ALGOS - i - 1;
}
}
// Pre-process all algorithms for internal scheduler and for different comms.
// This is a temporary fix to bypass the issue that a stream cannot be synchronized during HIP graph capture;
// the dynamic loading approach should be used again once that issue is fixed.
if (comm->mscclCompatible && !status.mscclSchedulerPtr) {
for (size_t i = 0; i < status.algoMetas.size(); i++) {
auto &m = status.algoMetas[i];
mscclAlgoHandle_t mscclAlgoHandle;
if (m.nRanks == comm->nRanks) {
// Load algorithms
if (status.rankToAlgoHandles[i].find(comm->rank) == status.rankToAlgoHandles[i].end()) {
NCCLCHECK(mscclLoadAlgo(m.filePath.c_str(), &mscclAlgoHandle, comm->rank));
status.rankToAlgoHandles[i][comm->rank] = mscclAlgoHandle;
}
// Connect algorithms
if (status.connectedAlgos[comm].find(mscclAlgoHandle) == status.connectedAlgos[comm].end()) {
NCCLCHECK(mscclSetupConnections(status.hostAlgos[mscclAlgoHandle], comm));
status.connectedAlgos[comm].insert(mscclAlgoHandle);
}
}
}
}
if (mscclInitialized.load(std::memory_order_acquire)) {
return ncclSuccess;
}
mscclStatus& status = mscclGetStatus();
status.scratchBuffer = nullptr;
status.scratchBufferSize = 0;
status.workIndex = 1;
status.freeAlgoHandles.resize(MSCCL_MAX_NUM_ALGOS);
for (int i = 0; i < MSCCL_MAX_NUM_ALGOS; i++) {
status.freeAlgoHandles[i] = MSCCL_MAX_NUM_ALGOS - i - 1;
}
NCCLCHECK(ncclCudaCalloc(&status.syncFlags, MSCCL_MAX_NUM_THREAD_BLOCKS));
status.lastStream = nullptr;
status.needsProxy = false;
NCCLCHECK(mscclInitWorkFifoStatus(&(status.defaultWorkFifoStatus)));
mscclSchedulerTriedLoadAlgo = false;
NCCLCHECK(mscclSchedulerInit());
mscclInitialized.store(true, std::memory_order_release);
}
@@ -248,12 +299,6 @@ static ncclResult_t mscclInternalSchedulerSelectAlgo(struct mscclSchedulerParam*
m.nRanks == param->nRanks &&
m.func == param->func &&
(isInPlace ? m.inPlace : m.outOfPlace)) {
// If not loaded for current rank, load it
if (status.rankToAlgoHandles[i].find(param->rank) == status.rankToAlgoHandles[i].end()) {
mscclAlgoHandle_t algoHandle;
NCCLCHECK(mscclLoadAlgo(m.filePath.c_str(), &algoHandle, param->rank));
status.rankToAlgoHandles[i][param->rank] = algoHandle;
}
param->handle = status.rankToAlgoHandles[i][param->rank];
param->scheduled = true;
return ncclSuccess;
@@ -731,6 +731,10 @@ ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta*
NCCLCHECK(mscclXmlGetAttrInt(node, "nchunksperloop", &nChunksPerLoop));
algoMeta->nChunksPerLoop = nChunksPerLoop;
int nChannels;
NCCLCHECK(mscclXmlGetAttrInt(node, "nchannels", &nChannels));
algoMeta->nChannels = nChannels;
int nGpus;
NCCLCHECK(mscclXmlGetAttrInt(node, "ngpus", &nGpus));
algoMeta->nRanks = nGpus;
+2 -7
View File
@@ -95,14 +95,9 @@ ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm)
mscclStatus& status = mscclGetStatus();
// Check whether there are enough channels
if (hostAlgo->nChannels > MAXCHANNELS) {
WARN("MSCCL: max number of channels available (%d) less than required (%d)", MAXCHANNELS, hostAlgo->nChannels);
return ncclInvalidUsage;
}
if (hostAlgo->nChannels > comm->nChannels) {
for (int channelId = comm->nChannels; channelId < hostAlgo->nChannels; channelId++) {
NCCLCHECK(initChannel(comm, channelId));
}
WARN("MSCCL: number of channels available (%d) less than required (%d)", comm->nChannels, hostAlgo->nChannels);
return ncclInvalidUsage;
}
// Flag MSCCL connections
@@ -1,356 +0,0 @@
<!-- Copyright (c) Microsoft Corporation. -->
<!-- Licensed under the MIT License. -->
<algo name="all_gather_llm" proto="LL" nchannels="1" nchunksperloop="8" ngpus="8" coll="allgather" inplace="1" outofplace="0" minBytes="0" maxBytes="1023">
<gpu id="0" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="1" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="2" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="3" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="4" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="5" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="6" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="7" i_chunks="0" o_chunks="8" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
</algo>
@@ -1,692 +0,0 @@
<!-- Copyright (c) Microsoft Corporation. -->
<!-- Licensed under the MIT License. -->
<algo name="all_gather_llm" proto="LL" nchannels="2" nchunksperloop="16" ngpus="8" coll="allgather" inplace="1" outofplace="0" minBytes="0" maxBytes="4095">
<gpu id="0" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="2" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="3" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="1" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="1" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="2" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="3" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="2" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="3" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="3" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="4" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="5" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="4" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="6" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="4" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="5" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="7" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="4" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="4" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="5" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="5" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="6" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="6" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
</algo>
File diff suppressed because it is too large Load Diff
@@ -1,692 +0,0 @@
<!-- Copyright (c) Microsoft Corporation. -->
<!-- Licensed under the MIT License. -->
<algo name="all_gather_llm" proto="LL" nchannels="16" nchunksperloop="16" ngpus="8" coll="allgather" inplace="1" outofplace="0" minBytes="4096" maxBytes="8191">
<gpu id="0" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="1" chan="9">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="2">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="2" chan="10">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="3" chan="3">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="3" chan="11">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="4">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="12">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="5">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="13">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="6">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="14">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="7">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="15">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="1" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="1" recv="-1" chan="8">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="2" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="2" recv="-1" chan="8">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="3" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="3" recv="-1" chan="8">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="8">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="8">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="8">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="0">
<step s="0" type="s" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="8">
<step s="0" type="s" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="1" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="8">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="2" chan="2">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="2" chan="10">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="3" chan="3">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="3" chan="11">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="4">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="12">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="5">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="13">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="6">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="14">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="7">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="15">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="9">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="2" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="2" recv="-1" chan="9">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="3" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="3" recv="-1" chan="9">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="9">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="9">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="9">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="1">
<step s="0" type="s" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="9">
<step s="0" type="s" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="2" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="8">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="9">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="3" chan="3">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="3" chan="11">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="4">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="12">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="5">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="13">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="6">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="14">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="7">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="15">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="2">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="10">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="2">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="10">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="3" recv="-1" chan="2">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="3" recv="-1" chan="10">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="2">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="10">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="2">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="10">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="2">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="10">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="2">
<step s="0" type="s" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="10">
<step s="0" type="s" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="3" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="8">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="9">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="2">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="10">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="4" chan="4">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="4" chan="12">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="5">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="13">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="6">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="14">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="7">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="15">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="3">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="11">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="3">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="11">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="3">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="11">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="4" recv="-1" chan="3">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="4" recv="-1" chan="11">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="3">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="11">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="3">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="11">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="3">
<step s="0" type="s" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="11">
<step s="0" type="s" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="4" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="8">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="9">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="2">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="10">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="3">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="11">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="5" chan="5">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="5" chan="13">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="6">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="14">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="7">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="15">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="4">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="12">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="4">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="12">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="4">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="12">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="4">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="12">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="5" recv="-1" chan="4">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="5" recv="-1" chan="12">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="4">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="12">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="4">
<step s="0" type="s" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="12">
<step s="0" type="s" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="5" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="8">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="9">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="2">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="10">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="3">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="11">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="4" chan="4">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="4" chan="12">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="6" chan="6">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="6" chan="14">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="7">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="15">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="5">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="13">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="5">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="13">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="5">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="13">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="5">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="13">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="4" recv="-1" chan="5">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="4" recv="-1" chan="13">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="6" recv="-1" chan="5">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="6" recv="-1" chan="13">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="5">
<step s="0" type="s" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="13">
<step s="0" type="s" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="6" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="8">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="9">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="2">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="10">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="3">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="11">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="4" chan="4">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="4" chan="12">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="5" chan="5">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="5" chan="13">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="7" chan="7">
<step s="0" type="r" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="7" chan="15">
<step s="0" type="r" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="6">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="14">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="6">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="14">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="6">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="14">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="6">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="14">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="4" recv="-1" chan="6">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="4" recv="-1" chan="14">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="5" recv="-1" chan="6">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="5" recv="-1" chan="14">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="7" recv="-1" chan="6">
<step s="0" type="s" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="7" recv="-1" chan="14">
<step s="0" type="s" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
<gpu id="7" i_chunks="0" o_chunks="16" s_chunks="0">
<tb id="0" send="-1" recv="0" chan="0">
<step s="0" type="r" srcbuf="o" srcoff="0" dstbuf="o" dstoff="0" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="1" send="-1" recv="0" chan="8">
<step s="0" type="r" srcbuf="o" srcoff="1" dstbuf="o" dstoff="1" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="2" send="-1" recv="1" chan="1">
<step s="0" type="r" srcbuf="o" srcoff="2" dstbuf="o" dstoff="2" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="3" send="-1" recv="1" chan="9">
<step s="0" type="r" srcbuf="o" srcoff="3" dstbuf="o" dstoff="3" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="4" send="-1" recv="2" chan="2">
<step s="0" type="r" srcbuf="o" srcoff="4" dstbuf="o" dstoff="4" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="5" send="-1" recv="2" chan="10">
<step s="0" type="r" srcbuf="o" srcoff="5" dstbuf="o" dstoff="5" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="6" send="-1" recv="3" chan="3">
<step s="0" type="r" srcbuf="o" srcoff="6" dstbuf="o" dstoff="6" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="7" send="-1" recv="3" chan="11">
<step s="0" type="r" srcbuf="o" srcoff="7" dstbuf="o" dstoff="7" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="8" send="-1" recv="4" chan="4">
<step s="0" type="r" srcbuf="o" srcoff="8" dstbuf="o" dstoff="8" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="9" send="-1" recv="4" chan="12">
<step s="0" type="r" srcbuf="o" srcoff="9" dstbuf="o" dstoff="9" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="10" send="-1" recv="5" chan="5">
<step s="0" type="r" srcbuf="o" srcoff="10" dstbuf="o" dstoff="10" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="11" send="-1" recv="5" chan="13">
<step s="0" type="r" srcbuf="o" srcoff="11" dstbuf="o" dstoff="11" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="12" send="-1" recv="6" chan="6">
<step s="0" type="r" srcbuf="o" srcoff="12" dstbuf="o" dstoff="12" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="13" send="-1" recv="6" chan="14">
<step s="0" type="r" srcbuf="o" srcoff="13" dstbuf="o" dstoff="13" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="14" send="0" recv="-1" chan="7">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="15" send="0" recv="-1" chan="15">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="16" send="1" recv="-1" chan="7">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="17" send="1" recv="-1" chan="15">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="18" send="2" recv="-1" chan="7">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="19" send="2" recv="-1" chan="15">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="20" send="3" recv="-1" chan="7">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="21" send="3" recv="-1" chan="15">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="22" send="4" recv="-1" chan="7">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="23" send="4" recv="-1" chan="15">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="24" send="5" recv="-1" chan="7">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="25" send="5" recv="-1" chan="15">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="26" send="6" recv="-1" chan="7">
<step s="0" type="s" srcbuf="o" srcoff="14" dstbuf="o" dstoff="14" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
<tb id="27" send="6" recv="-1" chan="15">
<step s="0" type="s" srcbuf="o" srcoff="15" dstbuf="o" dstoff="15" cnt="1" depid="-1" deps="-1" hasdep="0"/>
</tb>
</gpu>
</algo>
File diff suppressed because it is too large Load Diff