[Launch] Move cudaEventRecord call to capturing stream only (#2050)

This commit is contained in:
Bertan Dogancay
2025-11-13 09:38:09 -05:00
committed by GitHub
parent 46b032b760
commit 83ffc82fa7
+2 -2
View File
@@ -1919,7 +1919,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
cudaStream_t deviceStream, launchOrder;
cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
- CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
comm->workFifoProducedLastRecorded = comm->workFifoProduced;
@@ -1933,7 +1932,8 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
CUDACHECK(cudaEventCreateWithFlags(&comm->sharedRes->scratchEvent, cudaEventDisableTiming));
}
- if (capturing || planner->numStreams != 1) {
+ if (capturing || planner->numStreams != 1 || ncclParamLaunchOrderImplicit()) {
+ CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
// deviceStream waits on userStream[0]
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));