Keep P2P self-copy for batched ops to prevent >32N hang. (#2108)

[ROCm/rccl commit: 596567ff95]
Этот коммит содержится в:
Mustafa Abduljabbar
2025-12-16 11:56:39 -05:00
коммит произвёл GitHub
родитель ddfff6b705
Коммит d15a2c6b65
+3 -2
Просмотреть файл
@@ -1257,8 +1257,9 @@ static ncclResult_t scheduleP2pTasksToPlan(
ssize_t recvBytes = recv ? recv->bytes : -1;
void* sendBuff = send ? send->buff : nullptr;
void* recvBuff = recv ? recv->buff : nullptr;
if (sendRank == comm->rank && send->buff == recv->buff) {
// Only skip the in-place send-to-self when P2P batching is disabled.
// With batching enabled, skipping it is not currently supported and causes hangs.
if (sendRank == comm->rank && send->buff == recv->buff && rcclParamP2pBatchEnable() == 0) {
// Skip send to self in-place (we don't need to support this).
ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
ncclIntruQueueDequeue(&peers[recvRank].recvQueue);