From a9bb7e9807ce44331490bf28ea51278c3e50704c Mon Sep 17 00:00:00 2001 From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:29:53 -0500 Subject: [PATCH] [Launch] Enable Implicit order launch with serial mode (#2033) --- src/enqueue.cc | 8 +++----- src/include/rocmwrap.h | 12 ++++++++++++ src/misc/rocmwrap.cc | 2 ++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/enqueue.cc b/src/enqueue.cc index b2f3ba4f7b..594474be1c 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -1586,7 +1586,7 @@ static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturin if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } *mode = 12030 <= std::min(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial; #else - *mode = ncclImplicitOrderNone; + *mode = ncclImplicitOrderSerial; #endif return ncclSuccess; } @@ -1900,10 +1900,10 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { ncclIntruQueueConstruct(&planner->planQueue); bool capturing = ncclCudaGraphValid(planner->capturingGraph); - //cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch cudaStream_t deviceStream, launchOrder; - cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent; + CUDACHECK(cudaEventRecord(finishedEvent, launchStream)); if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) { comm->workFifoProducedLastRecorded = comm->workFifoProduced; @@ -1918,8 +1918,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { } if (capturing || planner->numStreams != 1) { - // CUDACHECK(cudaEventRecord(finishedEvent, launchStream)); - // deviceStream waits on userStream[0] NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); diff --git a/src/include/rocmwrap.h b/src/include/rocmwrap.h index 27301bac82..381d0c5a7a 100644 --- a/src/include/rocmwrap.h +++ b/src/include/rocmwrap.h @@ -9,6 +9,7 @@ #define NCCL_ROCMWRAP_H_ #include +#include "checks.h" typedef hsa_status_t (*PFN_hsa_init)(); typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value); @@ -85,6 +86,17 @@ extern CUmemAllocationHandleType ncclCuMemHandleType; ncclResult_t rocmLibraryInit(void); +extern int ncclCudaDriverVersionCache; extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() +inline ncclResult_t ncclCudaDriverVersion(int* driver) { + int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); + if (version == -1) { + CUDACHECK(cudaDriverGetVersion(&version)); + __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); + } + *driver = version; + return ncclSuccess; +} + #endif diff --git a/src/misc/rocmwrap.cc b/src/misc/rocmwrap.cc index 98239fe320..9f23fe95f9 100644 --- a/src/misc/rocmwrap.cc +++ b/src/misc/rocmwrap.cc @@ -28,6 +28,8 @@ DECLARE_ROCM_PFN(hsa_status_string); static void *hsaLib; static uint16_t version_major, version_minor; + +int ncclCudaDriverVersionCache = -1; bool ncclCudaLaunchBlocking = false; static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;