From d15a2c6b65c7c789f6832ecd19aebdce327951bf Mon Sep 17 00:00:00 2001 From: Mustafa Abduljabbar Date: Tue, 16 Dec 2025 11:56:39 -0500 Subject: [PATCH] Keep P2P self-copy for batched ops to prevent >32N hang. (#2108) [ROCm/rccl commit: 596567ff959d330e38b0f4ca3079557bbe98676b] --- projects/rccl/src/enqueue.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index ade0c9e855..ca0fcf4c7a 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -1257,8 +1257,9 @@ static ncclResult_t scheduleP2pTasksToPlan( ssize_t recvBytes = recv ? recv->bytes : -1; void* sendBuff = send ? send->buff : nullptr; void* recvBuff = recv ? recv->buff : nullptr; - - if (sendRank == comm->rank && send->buff == recv->buff) { + // Only skip the in-place send-to-self when P2P batching is disabled. + // With batching enabled, skipping it is not currently supported and causes hangs, so keep it. + if (sendRank == comm->rank && send->buff == recv->buff && rcclParamP2pBatchEnable() == 0) { // Skip send to self in-place (we don't need to support this). ncclIntruQueueDequeue(&peers[sendRank].sendQueue); ncclIntruQueueDequeue(&peers[recvRank].recvQueue);