Add support for using contiguous for GPU direct RDMA (#1096)

Enabled by env var RCCL_NET_CONTIGUOUS_MEM=1

[ROCm/rccl commit: cbd955627e]
This commit is contained in:
Wenkai Du
2024-02-29 10:06:43 -08:00
committed by GitHub
parent 058886cb20
commit e5aedb153e
7 changed files with 75 additions and 38 deletions
+6
View File
@@ -148,6 +148,9 @@ check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVE
### Check for hipDeviceMallocUncached support
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
### Check for hipDeviceMallocContiguous support
check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
### Check for indirect function call support
if(ENABLE_IFC)
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -575,6 +578,9 @@ endif()
if(${HIP_EVENT_DISABLE_FENCE})
target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
endif()
if(${HIP_CONTIGUOUS_MEMORY})
target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY)
endif()
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.7.31920")
target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
message(STATUS "HIP_UNCACHED_MEMORY enabled")
+2 -2
View File
@@ -190,7 +190,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
hipIpcMemHandle_t handle;
// Allocate fine-grained device memory on rank 0 and get IPC handle for it
// Re-usable barrier consists of (globalCount / globalSense) pair of integers
NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback);
NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback);
if (hipIpcGetMemHandle(&handle, m_fineGrainBarrierMem) != hipSuccess)
{
WARN("Unable to get IPC handle for barrier memory");
@@ -228,7 +228,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
// First rank prepares fine-grained memory shared across ranks used for the two barrier variables
if (m_rank == 0)
{
NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback);
NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback);
// Prepare all barriers
for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
{
+7 -28
View File
@@ -160,19 +160,12 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
#endif
template <typename T>
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, unsigned int flags = hipDeviceMallocDefault) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
#else
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
@@ -182,7 +175,7 @@ finish:
#define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)
template <typename T>
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, bool isFineGrain = false) {
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, unsigned int flags = hipDeviceMallocDefault) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
@@ -190,15 +183,8 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
// Need a side stream so as not to interfere with graph capture.
cudaStream_t stream = sideStream;
if (stream == nullptr)
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
if (isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
#else
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
if (sideStream == nullptr)
@@ -218,19 +204,12 @@ finish:
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
template <typename T>
ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, unsigned int flags = hipDeviceMallocDefault) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
#else
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
int dev;
CUDACHECK(hipGetDevice(&dev));
+5 -2
View File
@@ -176,8 +176,11 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true));
#if defined(HIP_UNCACHED_MEMORY)
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocUncached));
#else
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocFinegrained));
#endif
gdr_mem_desc_t* md;
NCCLCHECK(ncclCalloc(&md, 1));
md->gdrDevMem = devMem;
+5 -1
View File
@@ -378,7 +378,11 @@ static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int
*size = collNet->size;
if (cuda && collNet->cudaBuff == NULL) {
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda));
#if defined(HIP_UNCACHED_MEMORY)
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
#else
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
#endif
}
if (!cuda && collNet->hostBuff == NULL) {
NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
+40 -3
View File
@@ -166,6 +166,10 @@ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
#if defined(HIP_CONTIGUOUS_MEMORY)
RCCL_PARAM(NetContiguousMem, "NET_CONTIGUOUS_MEM", 0);
#endif
struct setupReq {
int tpRank;
int tpLocalRank;
@@ -521,7 +525,18 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
if (sameProcess == 0 || ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
} else {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, cuda));
#if defined(HIP_UNCACHED_MEMORY)
#if defined(HIP_CONTIGUOUS_MEMORY)
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
cuda ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
#else
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
#endif
#else
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
#endif
}
}
if (!cuda && state->hostBuff == NULL) {
@@ -750,7 +765,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
#if defined(HIP_UNCACHED_MEMORY)
#if defined(HIP_CONTIGUOUS_MEMORY)
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
#else
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
#endif
#else
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
#endif
}
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
}
@@ -911,7 +937,18 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
#if defined(HIP_UNCACHED_MEMORY)
#if defined(HIP_CONTIGUOUS_MEMORY)
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
#else
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
#endif
#else
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
#endif
}
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
}
+10 -2
View File
@@ -218,7 +218,11 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
#endif
} else {
// Allocate a CUDA buffer and generate an IPC handle for it
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, true));
#if defined(HIP_UNCACHED_MEMORY)
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocUncached));
#else
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocFinegrained));
#endif
cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr);
if (res != cudaSuccess) {
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
@@ -593,7 +597,11 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
connection->transportResources = proxyInfo;
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, true));
#if defined(HIP_UNCACHED_MEMORY)
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocUncached));
#else
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocFinegrained));
#endif
char shmPath[PATH_MAX];
shmPath[0] = '\0';