Adding opt-in hipGraph support for RCCL via RCCL_ENABLE_HIPGRAPH (#608)

Adding opt-in hipGraph support via RCCL_ENABLE_HIPGRAPH
2022-09-06 10:29:46 -06:00
parent 06bce9d0c9
commit 47b2fc3a30
6 changed files with 99 additions and 37 deletions
@@ -18,7 +18,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
  - When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
    (e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
    to resolve the massive port usage issue
-  - Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
+- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
+- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
+
 ### Removed
 - Removed experimental clique-based kernels

@@ -14,7 +14,7 @@
 #include "gdrwrap.h"
 #include "bootstrap.h"
 #include "channel.h"
-
+#include "rccl_vars.h"
 #include <cstring> // std::memcpy

 // Only generate inline kernels for LL
@@ -293,7 +293,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
  if (comm->launchMode == ncclComm::GROUP) {
    NCCLCHECK(ncclCpuBarrierOut(comm));
  } else {
-    if (!comm->usingCudaGraph)
+    if (!rcclParamEnableHipGraph() || !comm->usingCudaGraph)
      CUDACHECK(hipExtLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream, NULL, comm->doneEvent, 0));
    else
      CUDACHECK(hipLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
@@ -328,7 +328,10 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) {

  // Enqueue event after NCCL kernel (only in non-graph mode)
  // [RCCL] move event record into hipExtLaunchKernel
-  // if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
+  if (rcclParamEnableHipGraph())
+  {
+    if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
+  }
  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
  if (comm->launchMode == ncclComm::GROUP &&
      (comm->groupCudaStream ||
@@ -1231,33 +1234,40 @@ void* graphHelperFunc(void *args) {
    }
  }
 }
+RCCL_PARAM(EnableHipGraph, "ENABLE_HIPGRAPH", 0);

 // Check if we are in CUDA Graph capture mode
 ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
  comm->usingCudaGraph = 0;
  // Feature requires CUDA 11.3/R465 or above
-#if CUDART_VERSION >= 11030
-  cudaStreamCaptureStatus captureStatus;
+#if HIP_VERSION >= 50322000
+  hipStreamCaptureStatus captureStatus;
  unsigned long long cudaGraphId;
  ncclResult_t ret = ncclSuccess;
-  if (comm->driverVersion < 11030) {
+  if (comm->driverVersion < 50322000) {
    // Runtime driver version older than compiler version
    // Enhanced compat fallback
    goto enh_compat_end;
  }
  // Get CUDA Graph handle
-  CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
-  if (captureStatus == cudaStreamCaptureStatusActive) {
+  CUDACHECKGOTO(hipStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
+  if (captureStatus == hipStreamCaptureStatusActive) {
    if (cudaGraphId != comm->lastCudaGraphId) {
      INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
      // We are in a new graph, hence need to forget the last setup node so that
      // the first setup node in the new graph will not have a dependency
-      comm->lastCudaGraphId = hipGraphId;
+      comm->lastCudaGraphId = cudaGraphId;
      comm->lastSetupNode = NULL;
    }
    if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH;
    comm->usingCudaGraph = 1;

+    if (!rcclParamEnableHipGraph())
+    {
+      WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero in order to support hipGraph usage");
+      return ncclInvalidUsage;
+    }
+
    // Create helper thread that closes IPC handles during graph destruction
    // Only create this thread when buffer registration is enabled
    if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
@@ -1276,9 +1286,9 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {

 enh_compat_end: // Enhanced compat fallback
  (void)ret;
-  CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
-  if (captureStatus != cudaStreamCaptureStatusNone) {
-    WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+  CUDACHECK(hipStreamIsCapturing(comm->userStream, &captureStatus));
+  if (captureStatus != hipStreamCaptureStatusNone) {
+    WARN("The installed ROCm driver is older than the minimum version (50322000) required for RCCL's HIP Graphs support");
    return ncclInvalidUsage;
  }
  // If we are not in capture mode, we can ignore the driver being lower
@@ -1288,7 +1298,12 @@ enh_compat_end: // Enhanced compat fallback

 // Create host setup node in CUDA Graph
 ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
-#if CUDART_VERSION >= 11030
+#if HIP_VERSION >= 50322000
+  if (!rcclParamEnableHipGraph())
+  {
+    WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero to enable HIP graph feature");
+    return ncclInternalError;
+  }
  struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
  // Create a CUDA object to wrap around the argument space
  // which CUDA graph would manage lifetime of
@@ -1309,7 +1324,7 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
  comm->lastSetupNode = setupNode;
  return ncclSuccess;
 #else
-  WARN("NCCL does not support this CUDA version for CUDA graph feature");
+  WARN("RCCL does not support this ROCm version for HIP graph feature");
  return ncclInternalError;
 #endif
 }
@@ -15,6 +15,7 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
+#include "rccl_vars.h"

 template <typename T>
 static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
@@ -78,23 +79,32 @@ extern struct allocationTracker allocTracker[];

 template <typename T>
 static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
-#if CUDART_VERSION >= 11030
+
  // Need async stream for P2P pre-connect + CUDA Graph
-  hipStream_t stream;
-  CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-#endif
+  static bool streamCreated = false;
+  static hipStream_t stream;
+  if (rcclParamEnableHipGraph() && !streamCreated)
+  {
+    // Create stream only once to avoid performance penalty
+    CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+    streamCreated = true;
+  }
+
  if (isFineGrain)
    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
  else
    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
-#if CUDART_VERSION >= 11030
-  CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
-  CUDACHECK(hipStreamSynchronize(stream));
-  CUDACHECK(hipStreamDestroy(stream));
-#else
-  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
-  CUDACHECK(hipStreamSynchronize(NULL));
-#endif
+
+  if (rcclParamEnableHipGraph()) {
+    CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
+    CUDACHECK(hipStreamSynchronize(stream));
+    // NOTE: Currently the re-used stream is not destroyed
+    //CUDACHECK(hipStreamDestroy(stream));
+  } else {
+    CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
+    CUDACHECK(hipStreamSynchronize(NULL));
+  }
+
  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
  int dev;
  CUDACHECK(hipGetDevice(&dev));
@@ -27,6 +27,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
    return cache; \
  }

+#define RCCL_PARAM_DECLARE(name) \
+int64_t rcclParam##name()
+
 #define RCCL_PARAM(name, env, default_value) \
 pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
 int64_t rcclParam##name() { \
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RCCL_VARS_H_
+#define RCCL_VARS_H_
+
+#include "param.h"
+
+RCCL_PARAM_DECLARE(EnableHipGraph);  // Opt-in environment variable for enabling hipGraph
+
+#endif
@@ -34,6 +34,7 @@

 // [RCCL]
 #include "git_version.h"
+#include "rccl_vars.h"
 //#include "clique/CliqueManager.h"
 //#include <hsa/hsa_ext_amd.h>
 // [/RCCL]
@@ -417,13 +418,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
  comm->lastCudaGraphId = -1;
  comm->disableGraphHelper = ncclParamDisableGraphHelper();
  comm->graphRegister = ncclParamGraphRegister();
-#if CUDART_VERSION >= 11030
-  NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
-  comm->graphHelperResources->comm = comm;
-  if (comm->driverVersion >= 11030)
-    // hipGetDriverEntryPoint requires R465 or above (enhanced compat need)
-    CUDACHECK(hipGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, hipEnableDefault));
-#endif
+
+  if (rcclParamEnableHipGraph())
+  {
+    NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
+    comm->graphHelperResources->comm = comm;
+    comm->pfnCuMemGetAddressRange = hipMemGetAddressRange;
+  }

  static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
  static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
@@ -1358,9 +1359,10 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
  CUDACHECK(hipStreamSynchronize(comm->groupStream));

  ncclDestroyQueueInfo(comm->enqueueInfo);
-#if CUDART_VERSION >= 11030
-  NCCLCHECK(ncclGraphHelperDestroy(comm));
-#endif
+
+  if (rcclParamEnableHipGraph())
+    NCCLCHECK(ncclGraphHelperDestroy(comm));
+
  INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);

  NCCLCHECK(commFree(comm));