Adding opt-in hipGraph support for RCCL via RCCL_ENABLE_HIPGRAPH (#608)

Adding opt-in hipGraph support via RCCL_ENABLE_HIPGRAPH
This commit is contained in:
gilbertlee-amd
2022-09-06 10:29:46 -06:00
committed by GitHub
parent 06bce9d0c9
commit 47b2fc3a30
6 changed files with 99 additions and 37 deletions
+3 -1
View File
@@ -18,7 +18,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
- When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
(e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
to resolve the massive port usage issue
- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
### Removed
- Removed experimental clique-based kernels
+29 -14
View File
@@ -14,7 +14,7 @@
#include "gdrwrap.h"
#include "bootstrap.h"
#include "channel.h"
#include "rccl_vars.h"
#include <cstring> // std::memcpy
// Only generate inline kernels for LL
@@ -293,7 +293,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
if (comm->launchMode == ncclComm::GROUP) {
NCCLCHECK(ncclCpuBarrierOut(comm));
} else {
if (!comm->usingCudaGraph)
if (!rcclParamEnableHipGraph() || !comm->usingCudaGraph)
CUDACHECK(hipExtLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream, NULL, comm->doneEvent, 0));
else
CUDACHECK(hipLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
@@ -328,7 +328,10 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) {
// Enqueue event after NCCL kernel (only in non-graph mode)
// [RCCL] move event record into hipExtLaunchKernel
// if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
if (rcclParamEnableHipGraph())
{
if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
}
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP &&
(comm->groupCudaStream ||
@@ -1231,33 +1234,40 @@ void* graphHelperFunc(void *args) {
}
}
}
RCCL_PARAM(EnableHipGraph, "ENABLE_HIPGRAPH", 0);
// Check if we are in CUDA Graph capture mode
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
comm->usingCudaGraph = 0;
// Feature requires CUDA 11.3/R465 or above
#if CUDART_VERSION >= 11030
cudaStreamCaptureStatus captureStatus;
#if HIP_VERSION >= 50322000
hipStreamCaptureStatus captureStatus;
unsigned long long cudaGraphId;
ncclResult_t ret = ncclSuccess;
if (comm->driverVersion < 11030) {
if (comm->driverVersion < 50322000) {
// Runtime driver version older than compiler version
// Enhanced compat fallback
goto enh_compat_end;
}
// Get CUDA Graph handle
CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
if (captureStatus == cudaStreamCaptureStatusActive) {
CUDACHECKGOTO(hipStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
if (captureStatus == hipStreamCaptureStatusActive) {
if (cudaGraphId != comm->lastCudaGraphId) {
INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
// We are in a new graph, hence need to forget the last setup node so that
// the first setup node in the new graph will not have a dependency
comm->lastCudaGraphId = hipGraphId;
comm->lastCudaGraphId = cudaGraphId;
comm->lastSetupNode = NULL;
}
if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH;
comm->usingCudaGraph = 1;
if (!rcclParamEnableHipGraph())
{
WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero in order to support hipGraph usage");
return ncclInvalidUsage;
}
// Create helper thread that closes IPC handles during graph destruction
// Only create this thread when buffer registration is enabled
if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
@@ -1276,9 +1286,9 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
enh_compat_end: // Enhanced compat fallback
(void)ret;
CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
if (captureStatus != cudaStreamCaptureStatusNone) {
WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
CUDACHECK(hipStreamIsCapturing(comm->userStream, &captureStatus));
if (captureStatus != hipStreamCaptureStatusNone) {
WARN("The installed ROCm driver is older than the minimum version (50322000) required for RCCL's HIP Graphs support");
return ncclInvalidUsage;
}
// If we are not in capture mode, we can ignore the driver being lower
@@ -1288,7 +1298,12 @@ enh_compat_end: // Enhanced compat fallback
// Create host setup node in CUDA Graph
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
#if CUDART_VERSION >= 11030
#if HIP_VERSION >= 50322000
if (!rcclParamEnableHipGraph())
{
WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero to enable HIP graph feature");
return ncclInternalError;
}
struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
// Create a CUDA object to wrap around the argument space
// which CUDA graph would manage lifetime of
@@ -1309,7 +1324,7 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
comm->lastSetupNode = setupNode;
return ncclSuccess;
#else
WARN("NCCL does not support this CUDA version for CUDA graph feature");
WARN("RCCL does not support this ROCm version for HIP graph feature");
return ncclInternalError;
#endif
}
+22 -12
View File
@@ -15,6 +15,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include "rccl_vars.h"
template <typename T>
static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
@@ -78,23 +79,32 @@ extern struct allocationTracker allocTracker[];
template <typename T>
static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
#if CUDART_VERSION >= 11030
// Need async stream for P2P pre-connect + CUDA Graph
hipStream_t stream;
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
#endif
static bool streamCreated = false;
static hipStream_t stream;
if (rcclParamEnableHipGraph() && !streamCreated)
{
// Create stream only once to avoid performance penalty
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
streamCreated = true;
}
if (isFineGrain)
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
#if CUDART_VERSION >= 11030
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
CUDACHECK(hipStreamSynchronize(stream));
CUDACHECK(hipStreamDestroy(stream));
#else
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
CUDACHECK(hipStreamSynchronize(NULL));
#endif
if (rcclParamEnableHipGraph()) {
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
CUDACHECK(hipStreamSynchronize(stream));
// NOTE: Currently the re-used stream is not destroyed
//CUDACHECK(hipStreamDestroy(stream));
} else {
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
CUDACHECK(hipStreamSynchronize(NULL));
}
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
int dev;
CUDACHECK(hipGetDevice(&dev));
+3
View File
@@ -27,6 +27,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
return cache; \
}
#define RCCL_PARAM_DECLARE(name) \
int64_t rcclParam##name()
#define RCCL_PARAM(name, env, default_value) \
pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
int64_t rcclParam##name() { \
+30
View File
@@ -0,0 +1,30 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RCCL_VARS_H_
#define RCCL_VARS_H_
#include "param.h"
RCCL_PARAM_DECLARE(EnableHipGraph); // Opt-in environment variable for enabling hipGraph
#endif
+12 -10
View File
@@ -34,6 +34,7 @@
// [RCCL]
#include "git_version.h"
#include "rccl_vars.h"
//#include "clique/CliqueManager.h"
//#include <hsa/hsa_ext_amd.h>
// [/RCCL]
@@ -417,13 +418,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
comm->lastCudaGraphId = -1;
comm->disableGraphHelper = ncclParamDisableGraphHelper();
comm->graphRegister = ncclParamGraphRegister();
#if CUDART_VERSION >= 11030
NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
comm->graphHelperResources->comm = comm;
if (comm->driverVersion >= 11030)
// hipGetDriverEntryPoint requires R465 or above (enhanced compat need)
CUDACHECK(hipGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, hipEnableDefault));
#endif
if (rcclParamEnableHipGraph())
{
NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
comm->graphHelperResources->comm = comm;
comm->pfnCuMemGetAddressRange = hipMemGetAddressRange;
}
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
@@ -1358,9 +1359,10 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
CUDACHECK(hipStreamSynchronize(comm->groupStream));
ncclDestroyQueueInfo(comm->enqueueInfo);
#if CUDART_VERSION >= 11030
NCCLCHECK(ncclGraphHelperDestroy(comm));
#endif
if (rcclParamEnableHipGraph())
NCCLCHECK(ncclGraphHelperDestroy(comm));
INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);
NCCLCHECK(commFree(comm));