From f3c5156bbf39c4c5c66014db8d2b0ecd5b3a7079 Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Tue, 20 Jan 2026 13:58:28 -0600 Subject: [PATCH] revert memcpy use for direct AG (#2146) Co-authored-by: Islam --- src/collectives.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/collectives.cc b/src/collectives.cc index 50a7a9297f..390045eccc 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -131,16 +131,14 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen dstBuf = recvbuff; } - if (!in_place) - CUDACHECK(cudaMemcpyAsync((char*)dstBuf + rank * rankOffset, srcBuf, rankOffset, cudaMemcpyDeviceToDevice, stream)); - NCCLCHECK(ncclGroupStart()); for (int r = 0; r < nRanks; r++) { - if (r != rank) { - NCCLCHECK(ncclSend(((char*)dstBuf) + rank * rankOffset, sendcount, datatype, r, comm, stream)); - NCCLCHECK(ncclRecv(((char*)dstBuf) + r * rankOffset, sendcount, datatype, r, comm, stream)); - } + if (r == rank && in_place) + continue; + + NCCLCHECK(ncclSend(((char*)srcBuf), sendcount, datatype, r, comm, stream)); + NCCLCHECK(ncclRecv(((char*)dstBuf) + r * rankOffset, sendcount, datatype, r, comm, stream)); } NCCLCHECK(ncclGroupEnd()); return ncclSuccess;