From 47b2fc3a3097bfdfd7c937941e03a75bdf174522 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Tue, 6 Sep 2022 10:29:46 -0600
Subject: [PATCH] Adding opt-in hipGraph support for RCCL via
 RCCL_ENABLE_HIPGRAPH (#608)

Adding opt-in hipGraph support via RCCL_ENABLE_HIPGRAPH
---
 CHANGELOG.md            |  4 +++-
 src/enqueue.cc          | 43 +++++++++++++++++++++++++++--------------
 src/include/alloc.h     | 34 ++++++++++++++++++++------------
 src/include/param.h     |  3 +++
 src/include/rccl_vars.h | 30 ++++++++++++++++++++++++++++
 src/init.cc             | 22 +++++++++++----------
 6 files changed, 99 insertions(+), 37 deletions(-)
 create mode 100644 src/include/rccl_vars.h
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 189452d206..93180498c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,7 +18,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
   - When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
     (e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
     to resolve the massive port usage issue
-  - Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
+- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
+- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
+
 ### Removed
 - Removed experimental clique-based kernels
 
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 1356477384..e641dfbd4f 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -14,7 +14,7 @@
 #include "gdrwrap.h"
 #include "bootstrap.h"
 #include "channel.h"
-
+#include "rccl_vars.h"
 #include <cstring> // std::memcpy
 
 // Only generate inline kernels for LL
@@ -293,7 +293,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
   if (comm->launchMode == ncclComm::GROUP) {
     NCCLCHECK(ncclCpuBarrierOut(comm));
   } else {
-    if (!comm->usingCudaGraph)
+    if (!rcclParamEnableHipGraph() || !comm->usingCudaGraph)
       CUDACHECK(hipExtLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream, NULL, comm->doneEvent, 0));
     else
       CUDACHECK(hipLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
@@ -328,7 +328,10 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) {
 
   // Enqueue event after NCCL kernel (only in non-graph mode)
   // [RCCL] move event record into hipExtLaunchKernel
-  // if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
+  if (rcclParamEnableHipGraph())
+  {
+    if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
+  }
   // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
   if (comm->launchMode == ncclComm::GROUP &&
       (comm->groupCudaStream ||
@@ -1231,33 +1234,40 @@ void* graphHelperFunc(void *args) {
     }
   }
 }
+RCCL_PARAM(EnableHipGraph, "ENABLE_HIPGRAPH", 0);
 
 // Check if we are in CUDA Graph capture mode
 ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
   comm->usingCudaGraph = 0;
   // Feature requires CUDA 11.3/R465 or above
-#if CUDART_VERSION >= 11030
-  cudaStreamCaptureStatus captureStatus;
+#if HIP_VERSION >= 50322000
+  hipStreamCaptureStatus captureStatus;
   unsigned long long cudaGraphId;
   ncclResult_t ret = ncclSuccess;
-  if (comm->driverVersion < 11030) {
+  if (comm->driverVersion < 50322000) {
     // Runtime driver version older than compiler version
     // Enhanced compat fallback
     goto enh_compat_end;
   }
   // Get CUDA Graph handle
-  CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
-  if (captureStatus == cudaStreamCaptureStatusActive) {
+  CUDACHECKGOTO(hipStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
+  if (captureStatus == hipStreamCaptureStatusActive) {
     if (cudaGraphId != comm->lastCudaGraphId) {
       INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
       // We are in a new graph, hence need to forget the last setup node so that
       // the first setup node in the new graph will not have a dependency
-      comm->lastCudaGraphId = hipGraphId;
+      comm->lastCudaGraphId = cudaGraphId;
       comm->lastSetupNode = NULL;
     }
     if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH;
     comm->usingCudaGraph = 1;
 
+    if (!rcclParamEnableHipGraph())
+    {
+      WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero in order to support hipGraph usage");
+      return ncclInvalidUsage;
+    }
+
     // Create helper thread that closes IPC handles during graph destruction
     // Only create this thread when buffer registration is enabled
     if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
@@ -1276,9 +1286,9 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
 
 enh_compat_end: // Enhanced compat fallback
   (void)ret;
-  CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
-  if (captureStatus != cudaStreamCaptureStatusNone) {
-    WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+  CUDACHECK(hipStreamIsCapturing(comm->userStream, &captureStatus));
+  if (captureStatus != hipStreamCaptureStatusNone) {
+    WARN("The installed ROCm driver is older than the minimum version (50322000) required for RCCL's HIP Graphs support");
     return ncclInvalidUsage;
   }
   // If we are not in capture mode, we can ignore the driver being lower
@@ -1288,7 +1298,12 @@ enh_compat_end: // Enhanced compat fallback
 
 // Create host setup node in CUDA Graph
 ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50322000
+  if (!rcclParamEnableHipGraph())
+  {
+    WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero to enable HIP graph feature");
+    return ncclInternalError;
+  }
   struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
   // Create a CUDA object to wrap around the argument space
   // which CUDA graph would manage lifetime of
@@ -1309,7 +1324,7 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
   comm->lastSetupNode = setupNode;
   return ncclSuccess;
 #else
-  WARN("NCCL does not support this CUDA version for CUDA graph feature");
+  WARN("RCCL does not support this ROCm version for HIP graph feature");
   return ncclInternalError;
 #endif
 }
diff --git a/src/include/alloc.h b/src/include/alloc.h
index e9e07f7ae2..e17016f327 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -15,6 +15,7 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
+#include "rccl_vars.h"
 
 template <typename T>
 static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
@@ -78,23 +79,32 @@ extern struct allocationTracker allocTracker[];
 
 template <typename T>
 static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
-#if CUDART_VERSION >= 11030
+
   // Need async stream for P2P pre-connect + CUDA Graph
-  hipStream_t stream;
-  CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-#endif
+  static bool streamCreated = false;
+  static hipStream_t stream;
+  if (rcclParamEnableHipGraph() && !streamCreated)
+  {
+    // Create stream only once to avoid performance penalty
+    CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+    streamCreated = true;
+  }
+
   if (isFineGrain)
     CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
   else
     CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
-#if CUDART_VERSION >= 11030
-  CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
-  CUDACHECK(hipStreamSynchronize(stream));
-  CUDACHECK(hipStreamDestroy(stream));
-#else
-  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
-  CUDACHECK(hipStreamSynchronize(NULL));
-#endif
+
+  if (rcclParamEnableHipGraph()) {
+    CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
+    CUDACHECK(hipStreamSynchronize(stream));
+    // NOTE: Currently the re-used stream is not destroyed
+    //CUDACHECK(hipStreamDestroy(stream));
+  } else {
+    CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
+    CUDACHECK(hipStreamSynchronize(NULL));
+  }
+
   INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
   int dev;
   CUDACHECK(hipGetDevice(&dev));
diff --git a/src/include/param.h b/src/include/param.h
index ca243ca6fb..52f521da28 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -27,6 +27,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
     return cache; \
   }
 
+#define RCCL_PARAM_DECLARE(name) \
+int64_t rcclParam##name()
+
 #define RCCL_PARAM(name, env, default_value) \
 pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
 int64_t rcclParam##name() { \
diff --git a/src/include/rccl_vars.h b/src/include/rccl_vars.h
new file mode 100644
index 0000000000..a9b1b45815
--- /dev/null
+++ b/src/include/rccl_vars.h
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RCCL_VARS_H_
+#define RCCL_VARS_H_
+
+#include "param.h"
+
+RCCL_PARAM_DECLARE(EnableHipGraph);  // Opt-in environment variable for enabling hipGraph
+
+#endif
diff --git a/src/init.cc b/src/init.cc
index 8238acab41..8f9092de7d 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -34,6 +34,7 @@
 
 // [RCCL]
 #include "git_version.h"
+#include "rccl_vars.h"
 //#include "clique/CliqueManager.h"
 //#include <hsa/hsa_ext_amd.h>
 // [/RCCL]
@@ -417,13 +418,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
   comm->lastCudaGraphId = -1;
   comm->disableGraphHelper = ncclParamDisableGraphHelper();
   comm->graphRegister = ncclParamGraphRegister();
-#if CUDART_VERSION >= 11030
-  NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
-  comm->graphHelperResources->comm = comm;
-  if (comm->driverVersion >= 11030)
-    // hipGetDriverEntryPoint requires R465 or above (enhanced compat need)
-    CUDACHECK(hipGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, hipEnableDefault));
-#endif
+
+  if (rcclParamEnableHipGraph())
+  {
+    NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
+    comm->graphHelperResources->comm = comm;
+    comm->pfnCuMemGetAddressRange = hipMemGetAddressRange;
+  }
 
   static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
   static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
@@ -1358,9 +1359,10 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
   CUDACHECK(hipStreamSynchronize(comm->groupStream));
 
   ncclDestroyQueueInfo(comm->enqueueInfo);
-#if CUDART_VERSION >= 11030
-  NCCLCHECK(ncclGraphHelperDestroy(comm));
-#endif
+
+  if (rcclParamEnableHipGraph())
+    NCCLCHECK(ncclGraphHelperDestroy(comm));
+
   INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);
 
   NCCLCHECK(commFree(comm));