[Launch] Enable Implicit order launch with serial mode (#2033)
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
85baa0d113
Коммит
a9bb7e9807
@@ -1586,7 +1586,7 @@ static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturin
|
||||
if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
|
||||
*mode = 12030 <= std::min<int>(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial;
|
||||
#else
|
||||
*mode = ncclImplicitOrderNone;
|
||||
*mode = ncclImplicitOrderSerial;
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1900,10 +1900,10 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
ncclIntruQueueConstruct(&planner->planQueue);
|
||||
|
||||
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
|
||||
//cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
|
||||
cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
|
||||
cudaStream_t deviceStream, launchOrder;
|
||||
|
||||
cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
|
||||
CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
|
||||
|
||||
if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
|
||||
comm->workFifoProducedLastRecorded = comm->workFifoProduced;
|
||||
@@ -1918,8 +1918,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
if (capturing || planner->numStreams != 1) {
|
||||
// CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
|
||||
|
||||
// deviceStream waits on userStream[0]
|
||||
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#define NCCL_ROCMWRAP_H_
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include "checks.h"
|
||||
|
||||
typedef hsa_status_t (*PFN_hsa_init)();
|
||||
typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
|
||||
@@ -85,6 +86,17 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
|
||||
ncclResult_t rocmLibraryInit(void);
|
||||
|
||||
extern int ncclCudaDriverVersionCache;
|
||||
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
|
||||
|
||||
// Reports the CUDA driver version through *driver, using the process-wide
// cache variable ncclCudaDriverVersionCache so the driver only has to be
// queried while the cache still holds -1.
//
// driver: output pointer; written unconditionally on the success path
//         (it is dereferenced without a null check).
// Returns ncclSuccess once *driver has been written.
// NOTE(review): CUDACHECK is defined outside this view; presumably it returns
// an error code early when the CUDA call fails — confirm against checks.h.
inline ncclResult_t ncclCudaDriverVersion(int* driver) {
|
||||
// Relaxed atomic read of the cached value; no ordering with other memory is requested.
int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
|
||||
// -1 is treated as "not cached yet".
if (version == -1) {
|
||||
CUDACHECK(cudaDriverGetVersion(&version));
|
||||
// Publish the queried value for later callers. Nothing here prevents two
// threads from both querying and storing; presumably they would store the
// same version, which would make that harmless — TODO confirm.
__atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED);
|
||||
}
|
||||
*driver = version;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -28,6 +28,8 @@ DECLARE_ROCM_PFN(hsa_status_string);
|
||||
|
||||
static void *hsaLib;
|
||||
static uint16_t version_major, version_minor;
|
||||
|
||||
int ncclCudaDriverVersionCache = -1;
|
||||
bool ncclCudaLaunchBlocking = false;
|
||||
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
|
||||
Ссылка в новой задаче
Block a user