From 83ffc82fa7c8c358cb9f5d272337c594d041bf65 Mon Sep 17 00:00:00 2001
From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com>
Date: Thu, 13 Nov 2025 09:38:09 -0500
Subject: [PATCH] [Launch] Move cudaEventRecord call to capturing stream only (#2050)

---
 src/enqueue.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/enqueue.cc b/src/enqueue.cc
index ceb6bd24c7..beda2feaf2 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -1919,7 +1919,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
     cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
     cudaStream_t deviceStream, launchOrder;
     cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
-    CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
 
     if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
       comm->workFifoProducedLastRecorded = comm->workFifoProduced;
@@ -1933,7 +1932,8 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
       CUDACHECK(cudaEventCreateWithFlags(&comm->sharedRes->scratchEvent, cudaEventDisableTiming));
     }
 
-    if (capturing || planner->numStreams != 1) {
+    if (capturing || planner->numStreams != 1 || ncclParamLaunchOrderImplicit()) {
+      CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
       // deviceStream waits on userStream[0]
       NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
 