diff --git a/src/enqueue.cc b/src/enqueue.cc index ceb6bd24c7..beda2feaf2 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -1919,7 +1919,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch cudaStream_t deviceStream, launchOrder; cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent; - CUDACHECK(cudaEventRecord(finishedEvent, launchStream)); if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) { comm->workFifoProducedLastRecorded = comm->workFifoProduced; @@ -1933,7 +1932,8 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { CUDACHECK(cudaEventCreateWithFlags(&comm->sharedRes->scratchEvent, cudaEventDisableTiming)); } - if (capturing || planner->numStreams != 1) { + if (capturing || planner->numStreams != 1 || ncclParamLaunchOrderImplicit()) { + CUDACHECK(cudaEventRecord(finishedEvent, launchStream)); // deviceStream waits on userStream[0] NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));