diff --git a/src/include/alloc.h b/src/include/alloc.h index 8e7fe88de3..4d6b6f4daf 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -18,10 +18,6 @@ #include #include #include "rccl_vars.h" -#include -#include - -#define RCCL_HP_MIN_SIZE 2097152 #if CUDART_VERSION >= 11030 #include @@ -35,9 +31,6 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); } template<> constexpr size_t ncclSizeOfT() { return 1; } -extern std::unordered_map hugepageAllocs; -extern std::mutex hugepageAllocsMutex; - #if CUDART_VERSION >= 12020 static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) { @@ -112,100 +105,43 @@ static inline ncclResult_t ncclCuMemHostFree(void* ptr) { } #endif /* CUDART_VERSION >= 12020 */ + template -ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line, int hp_request=0 ) { +ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; - size_t size = nelem * ncclSizeOfT(); - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); int managed = 0; - int huge=0; CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0)); - if (nelem > 0) { if (managed) { #if defined(HIP_UNCACHED_MEMORY) - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, size, hipDeviceMallocUncached), result, finish); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT(), hipDeviceMallocUncached), result, finish); #else - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, size, hipDeviceMallocFinegrained), result, finish); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT(), hipDeviceMallocFinegrained), result, finish); #endif - } else { - if (hp_request) { - if (size < RCCL_HP_MIN_SIZE) { - WARN("small size : forcing back to hipHostMalloc"); + } else #if defined(HIP_HOST_UNCACHED_MEMORY) - CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish); + CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT(), cudaHostAllocMapped | hipHostMallocUncached), result, finish); #else - CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped), result, finish); + CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT(), cudaHostAllocMapped), result, finish); #endif - memset(*ptr, 0, size); - } else { - // Hugepage allocation via mmap - void* hostPtr = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (hostPtr == MAP_FAILED) { - WARN("Hugepage allocation failed. Falling back to hipHostMalloc"); -#if defined(HIP_HOST_UNCACHED_MEMORY) - CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish); -#else - CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped), result, finish); -#endif - memset(*ptr, 0, size); - } else { - memset(hostPtr, 0, size); - CUDACHECKGOTO(hipHostRegister(hostPtr, size, hipHostRegisterMapped), result, finish); - void* devPtr = nullptr; - CUDACHECKGOTO(hipHostGetDevicePointer(&devPtr, hostPtr, 0), result, finish); - *ptr = reinterpret_cast(hostPtr); - INFO(NCCL_ALLOC, "Cuda Host Alloc Size done using hugepages"); - huge=1; - std::lock_guard lock(hugepageAllocsMutex); - hugepageAllocs[hostPtr] = size; - for (auto &kv : hugepageAllocs) INFO(NCCL_ALLOC, "updated Hugepage alloc ptr %p size %zu", kv.first, kv.second); - } - } - } else { -#if defined(HIP_HOST_UNCACHED_MEMORY) - CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish); -#else - CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped), result, finish); -#endif - memset(*ptr, 0, size); - } - } + memset(*ptr, 0, nelem*ncclSizeOfT()); } - finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", size); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p hp_request %d managed %d hugepage_alloc %d", filefunc, line, size, *ptr, hp_request, managed, huge); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } - -static inline ncclResult_t ncclCudaHostFree(void* ptr, size_t alloc_size=0, int hp_request=0) { - if (hp_request) { - if (alloc_size > 0) { - std::lock_guard lock(hugepageAllocsMutex); - // for (auto &kv : hugepageAllocs) INFO(NCCL_ALLOC, "Hugepage alloc ptr %p size %zu", kv.first, kv.second); - auto it = hugepageAllocs.find(ptr); - if (it != hugepageAllocs.end()) { - // INFO(NCCL_ALLOC, "%s:%d Cuda Host HugePage unmap size %ld pointer %p app_tracked_size %ld", __FILE__, __LINE__, it->second, ptr, alloc_size); - hipHostUnregister(ptr); - munmap(ptr, it->second); - hugepageAllocs.erase(it); - return ncclSuccess; - } - } - INFO(NCCL_ALLOC, "Cudafree being done to %p, size=%ld", ptr,alloc_size); - } +static inline ncclResult_t ncclCudaHostFree(void* ptr) { CUDACHECK(cudaFreeHost(ptr)); return ncclSuccess; } -#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__, 0) +#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { diff --git a/src/init.cc b/src/init.cc index df5d0d6022..3dcfee0fa4 100644 --- a/src/init.cc +++ b/src/init.cc @@ -95,8 +95,6 @@ NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT); struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {}; static ncclResult_t commReclaim(ncclComm_t comm); -std::unordered_map hugepageAllocs; -std::mutex hugepageAllocsMutex; #ifdef ENABLE_MSCCLPP size_t std::hash::operator ()(const ncclUniqueId& uniqueId) const noexcept { diff --git a/src/transport/net.cc b/src/transport/net.cc index fab532a822..a7df9a155c 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -179,7 +179,6 @@ static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoG NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1); -RCCL_PARAM(NetHostBufferHugePageAlloc, "NET_HOST_BUFFER_HUGE_PAGE_ALLOC", 0); #if defined(HIP_CONTIGUOUS_MEMORY) RCCL_PARAM(NetContiguousMem, "NET_CONTIGUOUS_MEM", 0); #endif @@ -603,7 +602,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int } } if (!cuda && state->hostBuff == NULL) { - NCCLCHECK(ncclCudaHostCallocDebug(&state->hostBuff, state->size, __FILE__, __LINE__, rcclParamNetHostBufferHugePageAlloc())); + NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size)); } if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; if (gpuPtr) *gpuPtr = (cpuPtr && sameProcess) ? *cpuPtr : NULL; @@ -632,9 +631,7 @@ static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, i } NCCLCHECK(ncclCudaFree(state->cudaBuff)); } - if (state->hostBuff) { - NCCLCHECK(ncclCudaHostFree(state->hostBuff, (state->size)*(sizeof(int64_t)), rcclParamNetHostBufferHugePageAlloc())); - } + if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff)); } if (peer->send.refcount || peer->recv.refcount) return ncclSuccess; @@ -891,7 +888,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } } if (map->sameProcess) { - NCCLCHECK(ncclCudaHostCallocDebug(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size, __FILE__, __LINE__, rcclParamNetHostBufferHugePageAlloc())); + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; } else { NCCLCHECK(netCreateShm(proxyState, map->mems+NCCL_NET_MAP_HOSTMEM)); @@ -1093,7 +1090,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } } - NCCLCHECK(ncclCudaHostCallocDebug(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size, __FILE__, __LINE__, rcclParamNetHostBufferHugePageAlloc())); + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && map->sameProcess) { uint64_t *cpuPtr, *gpuPtr; @@ -1168,7 +1165,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct } struct connectMapMem* mems = resources->map.mems; if (resources->map.sameProcess) { - NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, (mems[NCCL_NET_MAP_HOSTMEM].size)*(sizeof(int)), rcclParamNetHostBufferHugePageAlloc())); + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); } else { NCCLCHECK(ncclShmIpcClose(&mems[NCCL_NET_MAP_HOSTMEM].createDesc)); } @@ -1212,7 +1209,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct } } struct connectMapMem* mems = resources->map.mems; - NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, (mems[NCCL_NET_MAP_HOSTMEM].size)*(sizeof(int)), rcclParamNetHostBufferHugePageAlloc())); + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (!resources->map.sameProcess || ncclCuMemEnable()) { // cuMem API support