2
0

[Transport] Fix IntraNet (#1582)

[ROCm/rccl commit: d88cca3098]
Este cometimento está contido em:
Bertan Dogancay
2025-03-04 13:30:36 -05:00
cometido por GitHub
ascendente 086fa823db
cometimento d1247bbf2a
2 ficheiros modificados com 17 adições e 7 eliminações
+6 -7
Ver ficheiro
@@ -805,6 +805,12 @@ static ncclResult_t scheduleCollTasksToPlan(
}
proxyOp->channelId = c;
proxyOp->opCount = proxyOpId;
proxyOp->connIndex = 0;
if (task->protocol == NCCL_PROTO_SIMPLE && task->algorithm == NCCL_ALGO_RING) {
if (comm->useIntraNet && nBytes > rcclParamIntraNetThreshold()) {
proxyOp->connIndex = NCCL_CONN_IDX_P2P_NET;
}
}
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
}
@@ -1992,13 +1998,6 @@ static ncclResult_t calcCollChunking(
}
}
proxyOp->connIndex = 0;
if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) {
if (comm->useIntraNet && nBytes > rcclParamIntraNetThreshold()) {
proxyOp->connIndex = NCCL_CONN_IDX_P2P_NET;
}
}
*outChunkSize = chunkSize;
return ncclSuccess;
}
+11
Ver ficheiro
@@ -1631,6 +1631,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
// Connect the NET transport between ring neighbors (prev/next) on every channel for intra-node use
if (comm->graphs[NCCL_ALGO_RING].nIntraChannels && rcclParamP2pNetDisable() == 0) {
comm->useIntraNet = 1;
for (int c = 0; c < comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], NCCL_CONN_IDX_P2P_NET), ret, fail);
}
// Connect Trees
NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);