[Launch] Move cudaEventRecord call to capturing stream only (#2050)
This commit is contained in:
+2
-2
@@ -1919,7 +1919,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
|
||||
cudaStream_t deviceStream, launchOrder;
|
||||
cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
|
||||
- CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
|
||||
|
||||
if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
|
||||
comm->workFifoProducedLastRecorded = comm->workFifoProduced;
|
||||
@@ -1933,7 +1932,8 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
CUDACHECK(cudaEventCreateWithFlags(&comm->sharedRes->scratchEvent, cudaEventDisableTiming));
|
||||
}
|
||||
|
||||
- if (capturing || planner->numStreams != 1) {
|
||||
+ if (capturing || planner->numStreams != 1 || ncclParamLaunchOrderImplicit()) {
|
||||
+ CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
|
||||
// deviceStream waits on userStream[0]
|
||||
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user