[Launch] Enable Implicit order launch with serial mode (#2033)

[ROCm/rccl commit: a9bb7e9807]
Этот коммит содержится в:
Bertan Dogancay
2025-11-07 13:29:53 -05:00
коммит произвёл GitHub
родитель 5ca67dc803
Коммит 524453baea
3 изменённых файлов: 17 добавлений и 5 удалений
+3 -5
Просмотреть файл
@@ -1586,7 +1586,7 @@ static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturin
if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
*mode = 12030 <= std::min<int>(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial;
#else
*mode = ncclImplicitOrderNone;
*mode = ncclImplicitOrderSerial;
#endif
return ncclSuccess;
}
@@ -1900,10 +1900,10 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
ncclIntruQueueConstruct(&planner->planQueue);
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
//cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
cudaStream_t deviceStream, launchOrder;
cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
comm->workFifoProducedLastRecorded = comm->workFifoProduced;
@@ -1918,8 +1918,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
}
if (capturing || planner->numStreams != 1) {
// CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
// deviceStream waits on userStream[0]
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
+12
Просмотреть файл
@@ -9,6 +9,7 @@
#define NCCL_ROCMWRAP_H_
#include <hsa/hsa.h>
#include "checks.h"
typedef hsa_status_t (*PFN_hsa_init)();
typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
@@ -85,6 +86,17 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
ncclResult_t rocmLibraryInit(void);
extern int ncclCudaDriverVersionCache;
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
// Reports the driver version through `driver`, querying the runtime only when
// the process-wide cache (ncclCudaDriverVersionCache) still holds -1.
//
// driver: output pointer; receives the cached or freshly queried version.
// Returns ncclSuccess once `*driver` has been written.
// NOTE(review): CUDACHECK is defined outside this view (presumably checks.h);
// it presumably returns an error code early if the query fails -- confirm.
inline ncclResult_t ncclCudaDriverVersion(int* driver) {
// Relaxed atomic read of the shared cache; -1 is treated as "not yet queried".
int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
if (version == -1) {
// Cache miss: ask the runtime for the driver version.
CUDACHECK(cudaDriverGetVersion(&version));
// Publish the result for later callers. There is no lock here, so concurrent
// first callers may each run the query and each store a result.
__atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED);
}
*driver = version;
return ncclSuccess;
}
#endif
+2
Просмотреть файл
@@ -28,6 +28,8 @@ DECLARE_ROCM_PFN(hsa_status_string);
static void *hsaLib;
static uint16_t version_major, version_minor;
int ncclCudaDriverVersionCache = -1;
bool ncclCudaLaunchBlocking = false;
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;