Add support for using contiguous for GPU direct RDMA (#1096)
Enabled by env var RCCL_NET_CONTIGUOUS_MEM=1
[ROCm/rccl commit: cbd955627e]
This commit is contained in:
@@ -148,6 +148,9 @@ check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVE
|
||||
### Check for hipDeviceMallocUncached support
|
||||
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
|
||||
|
||||
### Check for hipDeviceMallocContiguous support
|
||||
check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
|
||||
|
||||
### Check for indirect function call support
|
||||
if(ENABLE_IFC)
|
||||
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
|
||||
@@ -575,6 +578,9 @@ endif()
|
||||
if(${HIP_EVENT_DISABLE_FENCE})
|
||||
target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
|
||||
endif()
|
||||
if(${HIP_CONTIGUOUS_MEMORY})
|
||||
target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY)
|
||||
endif()
|
||||
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.7.31920")
|
||||
target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
|
||||
message(STATUS "HIP_UNCACHED_MEMORY enabled")
|
||||
|
||||
@@ -190,7 +190,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
|
||||
hipIpcMemHandle_t handle;
|
||||
// Allocate fine-grained device memory on rank 0 and get IPC handle for it
|
||||
// Re-usable barrier consists of (globalCount / globalSense) pair of integers
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback);
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback);
|
||||
if (hipIpcGetMemHandle(&handle, m_fineGrainBarrierMem) != hipSuccess)
|
||||
{
|
||||
WARN("Unable to get IPC handle for barrier memory");
|
||||
@@ -228,7 +228,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
|
||||
// First rank prepares fine-grained memory shared across ranks used for the two barrier variables
|
||||
if (m_rank == 0)
|
||||
{
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback);
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback);
|
||||
// Prepare all barriers
|
||||
for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
|
||||
{
|
||||
|
||||
@@ -160,19 +160,12 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, unsigned int flags = hipDeviceMallocDefault) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (isFineGrain) {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
|
||||
#else
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
#endif
|
||||
} else
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
|
||||
@@ -182,7 +175,7 @@ finish:
|
||||
#define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, bool isFineGrain = false) {
|
||||
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, unsigned int flags = hipDeviceMallocDefault) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
@@ -190,15 +183,8 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream = sideStream;
|
||||
if (stream == nullptr)
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
if (isFineGrain) {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
|
||||
#else
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
#endif
|
||||
} else
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
if (sideStream == nullptr)
|
||||
@@ -218,19 +204,12 @@ finish:
|
||||
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
|
||||
ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, unsigned int flags = hipDeviceMallocDefault) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (isFineGrain) {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
|
||||
#else
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
#endif
|
||||
} else
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
|
||||
@@ -176,8 +176,11 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
|
||||
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
|
||||
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
|
||||
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true));
|
||||
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocUncached));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
gdr_mem_desc_t* md;
|
||||
NCCLCHECK(ncclCalloc(&md, 1));
|
||||
md->gdrDevMem = devMem;
|
||||
|
||||
@@ -378,7 +378,11 @@ static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int
|
||||
*size = collNet->size;
|
||||
|
||||
if (cuda && collNet->cudaBuff == NULL) {
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda));
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
if (!cuda && collNet->hostBuff == NULL) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
|
||||
|
||||
@@ -166,6 +166,10 @@ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
|
||||
NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
|
||||
NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
|
||||
|
||||
#if defined(HIP_CONTIGUOUS_MEMORY)
|
||||
RCCL_PARAM(NetContiguousMem, "NET_CONTIGUOUS_MEM", 0);
|
||||
#endif
|
||||
|
||||
struct setupReq {
|
||||
int tpRank;
|
||||
int tpLocalRank;
|
||||
@@ -521,7 +525,18 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
|
||||
if (sameProcess == 0 || ncclCuMemEnable()) {
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, cuda));
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
#if defined(HIP_CONTIGUOUS_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
|
||||
cuda ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
|
||||
cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#endif
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
|
||||
cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if (!cuda && state->hostBuff == NULL) {
|
||||
@@ -750,7 +765,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
|
||||
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
#if defined(HIP_CONTIGUOUS_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#endif
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
|
||||
}
|
||||
@@ -911,7 +937,18 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
|
||||
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
#if defined(HIP_CONTIGUOUS_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#endif
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
|
||||
}
|
||||
|
||||
@@ -218,7 +218,11 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
|
||||
#endif
|
||||
} else {
|
||||
// Allocate a CUDA buffer and generate an IPC handle for it
|
||||
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, true));
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocUncached));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr);
|
||||
if (res != cudaSuccess) {
|
||||
WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
|
||||
@@ -593,7 +597,11 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
|
||||
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
|
||||
connection->transportResources = proxyInfo;
|
||||
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, true));
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocUncached));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
shmPath[0] = '\0';
|
||||
|
||||
Reference in New Issue
Block a user