From 47b2fc3a3097bfdfd7c937941e03a75bdf174522 Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Tue, 6 Sep 2022 10:29:46 -0600 Subject: [PATCH] Adding opt-in hipGraph support for RCCL via RCCL_ENABLE_HIPGRAPH (#608) Adding opt-in hipGraph support via RCCL_ENABLE_HIPGRAPH --- CHANGELOG.md | 4 +++- src/enqueue.cc | 43 +++++++++++++++++++++++++++-------------- src/include/alloc.h | 34 ++++++++++++++++++++------------ src/include/param.h | 3 +++ src/include/rccl_vars.h | 30 ++++++++++++++++++++++++++++ src/init.cc | 22 +++++++++++---------- 6 files changed, 99 insertions(+), 37 deletions(-) create mode 100644 src/include/rccl_vars.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 189452d206..93180498c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: - When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll (e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options to resolve the massive port usage issue - - Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1 +- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1 +- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH + ### Removed - Removed experimental clique-based kernels diff --git a/src/enqueue.cc b/src/enqueue.cc index 1356477384..e641dfbd4f 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -14,7 +14,7 @@ #include "gdrwrap.h" #include "bootstrap.h" #include "channel.h" - +#include "rccl_vars.h" #include // std::memcpy // Only generate inline kernels for LL @@ -293,7 +293,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) { if (comm->launchMode == ncclComm::GROUP) { NCCLCHECK(ncclCpuBarrierOut(comm)); } else { - if (!comm->usingCudaGraph) + if (!rcclParamEnableHipGraph() || !comm->usingCudaGraph) CUDACHECK(hipExtLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream, NULL, comm->doneEvent, 0)); else CUDACHECK(hipLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); @@ -328,7 +328,10 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) { // Enqueue event after NCCL kernel (only in non-graph mode) // [RCCL] move event record into hipExtLaunchKernel - // if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream)); + if (rcclParamEnableHipGraph()) + { + if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream)); + } // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || @@ -1231,33 +1234,40 @@ void* graphHelperFunc(void *args) { } } } +RCCL_PARAM(EnableHipGraph, "ENABLE_HIPGRAPH", 0); // Check if we are in CUDA Graph capture mode ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) { comm->usingCudaGraph = 0; // Feature requires CUDA 11.3/R465 or above -#if CUDART_VERSION >= 11030 - cudaStreamCaptureStatus captureStatus; +#if HIP_VERSION >= 50322000 + hipStreamCaptureStatus captureStatus; unsigned long long cudaGraphId; ncclResult_t ret = ncclSuccess; - if (comm->driverVersion < 11030) { + if (comm->driverVersion < 50322000) { // Runtime driver version older than compiler version // Enhanced compat fallback goto enh_compat_end; } // Get CUDA Graph handle - CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end); - if (captureStatus == cudaStreamCaptureStatusActive) { + CUDACHECKGOTO(hipStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end); + if (captureStatus == hipStreamCaptureStatusActive) { if (cudaGraphId != comm->lastCudaGraphId) { INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId); // We are in a new graph, hence need to forget the last setup node so that // the first setup node in the new graph will not have a dependency - comm->lastCudaGraphId = hipGraphId; + comm->lastCudaGraphId = cudaGraphId; comm->lastSetupNode = NULL; } if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH; comm->usingCudaGraph = 1; + if (!rcclParamEnableHipGraph()) + { + WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero in order to support hipGraph usage"); + return ncclInvalidUsage; + } + // Create helper thread that closes IPC handles during graph destruction // Only create this thread when buffer registration is enabled if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) { @@ -1276,9 +1286,9 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) { enh_compat_end: // Enhanced compat fallback (void)ret; - CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus)); - if (captureStatus != cudaStreamCaptureStatusNone) { - WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); + CUDACHECK(hipStreamIsCapturing(comm->userStream, &captureStatus)); + if (captureStatus != hipStreamCaptureStatusNone) { + WARN("The installed ROCm driver is older than the minimum version (50322000) required for RCCL's HIP Graphs support"); return ncclInvalidUsage; } // If we are not in capture mode, we can ignore the driver being lower @@ -1288,7 +1298,12 @@ enh_compat_end: // Enhanced compat fallback // Create host setup node in CUDA Graph ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) { -#if CUDART_VERSION >= 11030 +#if HIP_VERSION >= 50322000 + if (!rcclParamEnableHipGraph()) + { + WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero to enable HIP graph feature"); + return ncclInternalError; + } struct ncclQueueInfo* eqInfo = comm->enqueueInfo; // Create a CUDA object to wrap around the argument space // which CUDA graph would manage lifetime of @@ -1309,7 +1324,7 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) { comm->lastSetupNode = setupNode; return ncclSuccess; #else - WARN("NCCL does not support this CUDA version for CUDA graph feature"); + WARN("RCCL does not support this ROCm version for HIP graph feature"); return ncclInternalError; #endif } diff --git a/src/include/alloc.h b/src/include/alloc.h index e9e07f7ae2..e17016f327 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -15,6 +15,7 @@ #include #include #include +#include "rccl_vars.h" template static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { @@ -78,23 +79,32 @@ extern struct allocationTracker allocTracker[]; template static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) { -#if CUDART_VERSION >= 11030 + // Need async stream for P2P pre-connect + CUDA Graph - hipStream_t stream; - CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); -#endif + static bool streamCreated = false; + static hipStream_t stream; + if (rcclParamEnableHipGraph() && !streamCreated) + { + // Create stream only once to avoid performance penalty + CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + streamCreated = true; + } + if (isFineGrain) CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained)); else CUDACHECK(hipMalloc(ptr, nelem*sizeof(T))); -#if CUDART_VERSION >= 11030 - CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); - CUDACHECK(hipStreamSynchronize(stream)); - CUDACHECK(hipStreamDestroy(stream)); -#else - CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T))); - CUDACHECK(hipStreamSynchronize(NULL)); -#endif + + if (rcclParamEnableHipGraph()) { + CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); + CUDACHECK(hipStreamSynchronize(stream)); + // NOTE: Currently the re-used stream is not destroyed + //CUDACHECK(hipStreamDestroy(stream)); + } else { + CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T))); + CUDACHECK(hipStreamSynchronize(NULL)); + } + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); int dev; CUDACHECK(hipGetDevice(&dev)); diff --git a/src/include/param.h b/src/include/param.h index ca243ca6fb..52f521da28 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -27,6 +27,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6 return cache; \ } +#define RCCL_PARAM_DECLARE(name) \ +int64_t rcclParam##name() + #define RCCL_PARAM(name, env, default_value) \ pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \ int64_t rcclParam##name() { \ diff --git a/src/include/rccl_vars.h b/src/include/rccl_vars.h new file mode 100644 index 0000000000..a9b1b45815 --- /dev/null +++ b/src/include/rccl_vars.h @@ -0,0 +1,30 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef RCCL_VARS_H_ +#define RCCL_VARS_H_ + +#include "param.h" + +RCCL_PARAM_DECLARE(EnableHipGraph); // Opt-in environment variable for enabling hipGraph + +#endif diff --git a/src/init.cc b/src/init.cc index 8238acab41..8f9092de7d 100644 --- a/src/init.cc +++ b/src/init.cc @@ -34,6 +34,7 @@ // [RCCL] #include "git_version.h" +#include "rccl_vars.h" //#include "clique/CliqueManager.h" //#include // [/RCCL] @@ -417,13 +418,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua comm->lastCudaGraphId = -1; comm->disableGraphHelper = ncclParamDisableGraphHelper(); comm->graphRegister = ncclParamGraphRegister(); -#if CUDART_VERSION >= 11030 - NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1)); - comm->graphHelperResources->comm = comm; - if (comm->driverVersion >= 11030) - // hipGetDriverEntryPoint requires R465 or above (enhanced compat need) - CUDACHECK(hipGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, hipEnableDefault)); -#endif + + if (rcclParamEnableHipGraph()) + { + NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1)); + comm->graphHelperResources->comm = comm; + comm->pfnCuMemGetAddressRange = hipMemGetAddressRange; + } static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); @@ -1358,9 +1359,10 @@ static ncclResult_t commDestroy(ncclComm_t comm) { CUDACHECK(hipStreamSynchronize(comm->groupStream)); ncclDestroyQueueInfo(comm->enqueueInfo); -#if CUDART_VERSION >= 11030 - NCCLCHECK(ncclGraphHelperDestroy(comm)); -#endif + + if (rcclParamEnableHipGraph()) + NCCLCHECK(ncclGraphHelperDestroy(comm)); + INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed); NCCLCHECK(commFree(comm));