diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index 4204dc5ed2..0e27118a94 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -148,6 +148,9 @@ check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVE ### Check for hipDeviceMallocUncached support check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY) +### Check for hipDeviceMallocContiguous support +check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY) + ### Check for indirect function call support if(ENABLE_IFC) if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201") @@ -575,6 +578,9 @@ endif() if(${HIP_EVENT_DISABLE_FENCE}) target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE) endif() +if(${HIP_CONTIGUOUS_MEMORY}) + target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY) +endif() if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.7.31920") target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY) message(STATUS "HIP_UNCACHED_MEMORY enabled") diff --git a/projects/rccl/src/clique/CliqueManager.cc b/projects/rccl/src/clique/CliqueManager.cc index 7abfc67fda..368d99a797 100644 --- a/projects/rccl/src/clique/CliqueManager.cc +++ b/projects/rccl/src/clique/CliqueManager.cc @@ -190,7 +190,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix) hipIpcMemHandle_t handle; // Allocate fine-grained device memory on rank 0 and get IPC handle for it // Re-usable barrier consists of (globalCount / globalSense) pair of integers - NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback); + NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback); if (hipIpcGetMemHandle(&handle, m_fineGrainBarrierMem) != hipSuccess) { WARN("Unable to get IPC handle for barrier memory"); @@ -228,7 +228,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix) // First rank prepares fine-grained memory shared across ranks used for the two barrier variables if (m_rank == 0) { - NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback); + NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback); // Prepare all barriers for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++) { diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h index 3258202856..09f5386dda 100644 --- a/projects/rccl/src/include/alloc.h +++ b/projects/rccl/src/include/alloc.h @@ -160,19 +160,12 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { #endif template -ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) { +ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, unsigned int flags = hipDeviceMallocDefault) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (isFineGrain) { -#if defined(HIP_UNCACHED_MEMORY) - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish); -#else - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); -#endif - } else - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); @@ -182,7 +175,7 @@ finish: #define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__) template -ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, bool isFineGrain = false) { +ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, unsigned int flags = hipDeviceMallocDefault) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; @@ -190,15 +183,8 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t // Need a side stream so as not to interfere with graph capture. cudaStream_t stream = sideStream; if (stream == nullptr) - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - if (isFineGrain) { -#if defined(HIP_UNCACHED_MEMORY) - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish); -#else - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); -#endif - } else - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish); CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); if (sideStream == nullptr) @@ -218,19 +204,12 @@ finish: #define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__) template -ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) { +ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, unsigned int flags = hipDeviceMallocDefault) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (isFineGrain) { -#if defined(HIP_UNCACHED_MEMORY) - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish); -#else - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); -#endif - } else - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish); CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); int dev; CUDACHECK(hipGetDevice(&dev)); diff --git a/projects/rccl/src/include/gdrwrap.h b/projects/rccl/src/include/gdrwrap.h index f532a705e1..a791e050f0 100644 --- a/projects/rccl/src/include/gdrwrap.h +++ b/projects/rccl/src/include/gdrwrap.h @@ -176,8 +176,11 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too - NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true)); - +#if defined(HIP_UNCACHED_MEMORY) + NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocUncached)); +#else + NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocFinegrained)); +#endif gdr_mem_desc_t* md; NCCLCHECK(ncclCalloc(&md, 1)); md->gdrDevMem = devMem; diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc index 9f58017d48..fed72c6ff4 100644 --- a/projects/rccl/src/transport/coll_net.cc +++ b/projects/rccl/src/transport/coll_net.cc @@ -378,7 +378,11 @@ static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int *size = collNet->size; if (cuda && collNet->cudaBuff == NULL) { - NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda)); +#if defined(HIP_UNCACHED_MEMORY) + NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault)); +#else + NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); +#endif } if (!cuda && collNet->hostBuff == NULL) { NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size)); diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc index cd8b2e8c11..2cdac7efd9 100644 --- a/projects/rccl/src/transport/net.cc +++ b/projects/rccl/src/transport/net.cc @@ -166,6 +166,10 @@ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncc NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1); +#if defined(HIP_CONTIGUOUS_MEMORY) +RCCL_PARAM(NetContiguousMem, "NET_CONTIGUOUS_MEM", 0); +#endif + struct setupReq { int tpRank; int tpLocalRank; @@ -521,7 +525,18 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int if (sameProcess == 0 || ncclCuMemEnable()) { NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff)); } else { - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, cuda)); +#if defined(HIP_UNCACHED_MEMORY) +#if defined(HIP_CONTIGUOUS_MEMORY) + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, + cuda ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault)); +#else + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, + cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault)); +#endif +#else + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, + cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); +#endif } } if (!cuda && state->hostBuff == NULL) { @@ -750,7 +765,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr)); +#if defined(HIP_UNCACHED_MEMORY) +#if defined(HIP_CONTIGUOUS_MEMORY) + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault)); +#else + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault)); +#endif +#else + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); +#endif } map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } @@ -911,7 +937,18 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr)); +#if defined(HIP_UNCACHED_MEMORY) +#if defined(HIP_CONTIGUOUS_MEMORY) + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault)); +#else + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault)); +#endif +#else + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); +#endif } map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc index 5e5dc24b83..c412022276 100644 --- a/projects/rccl/src/transport/p2p.cc +++ b/projects/rccl/src/transport/p2p.cc @@ -218,7 +218,11 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v #endif } else { // Allocate a CUDA buffer and generate an IPC handle for it - NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, true)); +#if defined(HIP_UNCACHED_MEMORY) + NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocUncached)); +#else + NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocFinegrained)); +#endif cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr); if (res != cudaSuccess) { WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); @@ -593,7 +597,11 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st NCCLCHECK(ncclCalloc(&proxyInfo, 1)); connection->transportResources = proxyInfo; - NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, true)); +#if defined(HIP_UNCACHED_MEMORY) + NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocUncached)); +#else + NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocFinegrained)); +#endif char shmPath[PATH_MAX]; shmPath[0] = '\0';