Revert memcpy use for direct AllGather (AG) (#2146)

Co-authored-by: Islam <nusislam@amd.com>
此提交包含在:
Nusrat Islam
2026-01-20 13:58:28 -06:00
提交者 GitHub
父節點 2fdcceaabb
當前提交 f3c5156bbf
+5 -7
查看文件
@@ -131,16 +131,14 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
dstBuf = recvbuff;
}
if (!in_place)
CUDACHECK(cudaMemcpyAsync((char*)dstBuf + rank * rankOffset, srcBuf, rankOffset, cudaMemcpyDeviceToDevice, stream));
NCCLCHECK(ncclGroupStart());
for (int r = 0; r < nRanks; r++) {
if (r != rank) {
NCCLCHECK(ncclSend(((char*)dstBuf) + rank * rankOffset, sendcount, datatype, r, comm, stream));
NCCLCHECK(ncclRecv(((char*)dstBuf) + r * rankOffset, sendcount, datatype, r, comm, stream));
}
if (r == rank && in_place)
continue;
NCCLCHECK(ncclSend(((char*)srcBuf), sendcount, datatype, r, comm, stream));
NCCLCHECK(ncclRecv(((char*)dstBuf) + r * rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;