Adding opt-in hipGraph support for RCCL via RCCL_ENABLE_HIPGRAPH (#608)
Adding opt-in hipGraph support via RCCL_ENABLE_HIPGRAPH
This commit is contained in:
+3
-1
@@ -18,7 +18,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
|
||||
- When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
|
||||
(e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
|
||||
to resolve the massive port usage issue
|
||||
- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
|
||||
- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
|
||||
- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
|
||||
|
||||
### Removed
|
||||
- Removed experimental clique-based kernels
|
||||
|
||||
|
||||
+29
-14
@@ -14,7 +14,7 @@
|
||||
#include "gdrwrap.h"
|
||||
#include "bootstrap.h"
|
||||
#include "channel.h"
|
||||
|
||||
#include "rccl_vars.h"
|
||||
#include <cstring> // std::memcpy
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
@@ -293,7 +293,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
|
||||
if (comm->launchMode == ncclComm::GROUP) {
|
||||
NCCLCHECK(ncclCpuBarrierOut(comm));
|
||||
} else {
|
||||
if (!comm->usingCudaGraph)
|
||||
if (!rcclParamEnableHipGraph() || !comm->usingCudaGraph)
|
||||
CUDACHECK(hipExtLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream, NULL, comm->doneEvent, 0));
|
||||
else
|
||||
CUDACHECK(hipLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
|
||||
@@ -328,7 +328,10 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) {
|
||||
|
||||
// Enqueue event after NCCL kernel (only in non-graph mode)
|
||||
// [RCCL] move event record into hipExtLaunchKernel
|
||||
// if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
|
||||
if (rcclParamEnableHipGraph())
|
||||
{
|
||||
if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
|
||||
}
|
||||
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
|
||||
if (comm->launchMode == ncclComm::GROUP &&
|
||||
(comm->groupCudaStream ||
|
||||
@@ -1231,33 +1234,40 @@ void* graphHelperFunc(void *args) {
|
||||
}
|
||||
}
|
||||
}
|
||||
RCCL_PARAM(EnableHipGraph, "ENABLE_HIPGRAPH", 0);
|
||||
|
||||
// Check if we are in CUDA Graph capture mode
|
||||
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
|
||||
comm->usingCudaGraph = 0;
|
||||
// Feature requires CUDA 11.3/R465 or above
|
||||
#if CUDART_VERSION >= 11030
|
||||
cudaStreamCaptureStatus captureStatus;
|
||||
#if HIP_VERSION >= 50322000
|
||||
hipStreamCaptureStatus captureStatus;
|
||||
unsigned long long cudaGraphId;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (comm->driverVersion < 11030) {
|
||||
if (comm->driverVersion < 50322000) {
|
||||
// Runtime driver version older than compiler version
|
||||
// Enhanced compat fallback
|
||||
goto enh_compat_end;
|
||||
}
|
||||
// Get CUDA Graph handle
|
||||
CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
|
||||
if (captureStatus == cudaStreamCaptureStatusActive) {
|
||||
CUDACHECKGOTO(hipStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
|
||||
if (captureStatus == hipStreamCaptureStatusActive) {
|
||||
if (cudaGraphId != comm->lastCudaGraphId) {
|
||||
INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
|
||||
// We are in a new graph, hence need to forget the last setup node so that
|
||||
// the first setup node in the new graph will not have a dependency
|
||||
comm->lastCudaGraphId = hipGraphId;
|
||||
comm->lastCudaGraphId = cudaGraphId;
|
||||
comm->lastSetupNode = NULL;
|
||||
}
|
||||
if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH;
|
||||
comm->usingCudaGraph = 1;
|
||||
|
||||
if (!rcclParamEnableHipGraph())
|
||||
{
|
||||
WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero in order to support hipGraph usage");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
// Create helper thread that closes IPC handles during graph destruction
|
||||
// Only create this thread when buffer registration is enabled
|
||||
if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
|
||||
@@ -1276,9 +1286,9 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) {
|
||||
|
||||
enh_compat_end: // Enhanced compat fallback
|
||||
(void)ret;
|
||||
CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
|
||||
if (captureStatus != cudaStreamCaptureStatusNone) {
|
||||
WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
|
||||
CUDACHECK(hipStreamIsCapturing(comm->userStream, &captureStatus));
|
||||
if (captureStatus != hipStreamCaptureStatusNone) {
|
||||
WARN("The installed ROCm driver is older than the minimum version (50322000) required for RCCL's HIP Graphs support");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
// If we are not in capture mode, we can ignore the driver being lower
|
||||
@@ -1288,7 +1298,12 @@ enh_compat_end: // Enhanced compat fallback
|
||||
|
||||
// Create host setup node in CUDA Graph
|
||||
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
#if HIP_VERSION >= 50322000
|
||||
if (!rcclParamEnableHipGraph())
|
||||
{
|
||||
WARN("RCCL_ENABLE_HIPGRAPH must be set to non-zero to enable HIP graph feature");
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
|
||||
// Create a CUDA object to wrap around the argument space
|
||||
// which CUDA graph would manage lifetime of
|
||||
@@ -1309,7 +1324,7 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) {
|
||||
comm->lastSetupNode = setupNode;
|
||||
return ncclSuccess;
|
||||
#else
|
||||
WARN("NCCL does not support this CUDA version for CUDA graph feature");
|
||||
WARN("RCCL does not support this ROCm version for HIP graph feature");
|
||||
return ncclInternalError;
|
||||
#endif
|
||||
}
|
||||
|
||||
+22
-12
@@ -15,6 +15,7 @@
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "rccl_vars.h"
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
@@ -78,23 +79,32 @@ extern struct allocationTracker allocTracker[];
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
// Need async stream for P2P pre-connect + CUDA Graph
|
||||
hipStream_t stream;
|
||||
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
#endif
|
||||
static bool streamCreated = false;
|
||||
static hipStream_t stream;
|
||||
if (rcclParamEnableHipGraph() && !streamCreated)
|
||||
{
|
||||
// Create stream only once to avoid performance penalty
|
||||
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
streamCreated = true;
|
||||
}
|
||||
|
||||
if (isFineGrain)
|
||||
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
#if CUDART_VERSION >= 11030
|
||||
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||
CUDACHECK(hipStreamSynchronize(stream));
|
||||
CUDACHECK(hipStreamDestroy(stream));
|
||||
#else
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
CUDACHECK(hipStreamSynchronize(NULL));
|
||||
#endif
|
||||
|
||||
if (rcclParamEnableHipGraph()) {
|
||||
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||
CUDACHECK(hipStreamSynchronize(stream));
|
||||
// NOTE: Currently the re-used stream is not destroyed
|
||||
//CUDACHECK(hipStreamDestroy(stream));
|
||||
} else {
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
CUDACHECK(hipStreamSynchronize(NULL));
|
||||
}
|
||||
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
|
||||
@@ -27,6 +27,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
|
||||
return cache; \
|
||||
}
|
||||
|
||||
#define RCCL_PARAM_DECLARE(name) \
|
||||
int64_t rcclParam##name()
|
||||
|
||||
#define RCCL_PARAM(name, env, default_value) \
|
||||
pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
|
||||
int64_t rcclParam##name() { \
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef RCCL_VARS_H_
|
||||
#define RCCL_VARS_H_
|
||||
|
||||
#include "param.h"
|
||||
|
||||
RCCL_PARAM_DECLARE(EnableHipGraph); // Opt-in environment variable for enabling hipGraph
|
||||
|
||||
#endif
|
||||
+12
-10
@@ -34,6 +34,7 @@
|
||||
|
||||
// [RCCL]
|
||||
#include "git_version.h"
|
||||
#include "rccl_vars.h"
|
||||
//#include "clique/CliqueManager.h"
|
||||
//#include <hsa/hsa_ext_amd.h>
|
||||
// [/RCCL]
|
||||
@@ -417,13 +418,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
|
||||
comm->lastCudaGraphId = -1;
|
||||
comm->disableGraphHelper = ncclParamDisableGraphHelper();
|
||||
comm->graphRegister = ncclParamGraphRegister();
|
||||
#if CUDART_VERSION >= 11030
|
||||
NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
|
||||
comm->graphHelperResources->comm = comm;
|
||||
if (comm->driverVersion >= 11030)
|
||||
// hipGetDriverEntryPoint requires R465 or above (enhanced compat need)
|
||||
CUDACHECK(hipGetDriverEntryPoint("cuMemGetAddressRange", (void**)&comm->pfnCuMemGetAddressRange, hipEnableDefault));
|
||||
#endif
|
||||
|
||||
if (rcclParamEnableHipGraph())
|
||||
{
|
||||
NCCLCHECK(ncclCalloc(&comm->graphHelperResources, 1));
|
||||
comm->graphHelperResources->comm = comm;
|
||||
comm->pfnCuMemGetAddressRange = hipMemGetAddressRange;
|
||||
}
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||
@@ -1358,9 +1359,10 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
CUDACHECK(hipStreamSynchronize(comm->groupStream));
|
||||
|
||||
ncclDestroyQueueInfo(comm->enqueueInfo);
|
||||
#if CUDART_VERSION >= 11030
|
||||
NCCLCHECK(ncclGraphHelperDestroy(comm));
|
||||
#endif
|
||||
|
||||
if (rcclParamEnableHipGraph())
|
||||
NCCLCHECK(ncclGraphHelperDestroy(comm));
|
||||
|
||||
INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);
|
||||
|
||||
NCCLCHECK(commFree(comm));
|
||||
|
||||
Reference in New Issue
Block a user