Use one side stream per process (#2063)
* Use one side stream per process
* Handle multiple GPUs per process
* Reset stream when not found
* Address review comments
* Fix missing mutex initializer
[ROCm/rccl commit: 185e78a8f0]
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <map>
|
||||
#include "rccl_vars.h"
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
@@ -31,6 +32,78 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
|
||||
template<>
|
||||
constexpr size_t ncclSizeOfT<void>() { return 1; }
|
||||
|
||||
// Reference-counted record for one shared side stream. Instances are stored
// in the `sideStream` map and built positionally as `ncclSideStream{stream, 1}`,
// so the member order below must not change.
struct ncclSideStream {
|
||||
// Stream handle created in ncclCreateSideStream and handed out by getSideStream.
cudaStream_t stream;
|
||||
// Incremented by ncclCreateSideStream, decremented by ncclDestroySideStream;
// the stream is destroyed when this reaches zero.
uint64_t refCount;
|
||||
};
|
||||
|
||||
// Registry of side streams, keyed by the bus id that getBusId() reports for a
// device. Accessed only while sideStreamLock is held.
// NOTE(review): std::unordered_map needs <unordered_map>; only <map> is visibly
// included in this hunk — confirm it is not relied on transitively.
inline std::unordered_map<int64_t, ncclSideStream> sideStream;
|
||||
// Mutex serialising every lookup, insertion and removal on `sideStream`.
inline pthread_mutex_t sideStreamLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
// Defined elsewhere; used here to turn a device ordinal into the map key.
extern ncclResult_t getBusId(int cudaDev, int64_t *busId);
|
||||
|
||||
/*
 * Take one reference on the side stream registered for `cudaDev`.
 *
 * The device is translated to a bus id via getBusId(). If no stream is
 * registered under that id yet, a non-blocking stream is created and stored
 * with a reference count of 1; otherwise the existing count is incremented.
 *
 * Returns ncclSuccess, the error from getBusId(), or the error produced by
 * CUDACHECKGOTO when stream creation fails. The registry lock is always
 * released before returning.
 */
static inline ncclResult_t ncclCreateSideStream(int cudaDev) {
  int64_t busId;
  NCCLCHECK(getBusId(cudaDev, &busId));

  ncclResult_t ret = ncclSuccess;
  pthread_mutex_lock(&sideStreamLock);
  {
    auto entry = sideStream.find(busId);
    if (entry == sideStream.end()) {
      // First user of this device: create the stream and register it.
      cudaStream_t created;
      CUDACHECKGOTO(cudaStreamCreateWithFlags(&created, cudaStreamNonBlocking), ret, fail);
      sideStream.emplace(busId, ncclSideStream{created, 1});
      INFO(NCCL_ALLOC, "Created side stream %p of dev %d busid %lx",
           created, cudaDev, busId);
    } else {
      // Stream already exists: just record one more user.
      entry->second.refCount++;
      INFO(NCCL_ALLOC, "Side stream %p of dev %d busid %lx inc count to %ld",
           entry->second.stream, cudaDev, busId, entry->second.refCount);
    }
  }
fail:
  pthread_mutex_unlock(&sideStreamLock);
  return ret;
}
|
||||
|
||||
/*
 * Drop one reference on the side stream registered for `cudaDev`.
 *
 * The device is translated to a bus id via getBusId(). When other references
 * remain, the count is decremented. When this is the last reference, the
 * entry is removed from the registry and the stream is destroyed. A missing
 * entry is reported with WARN and is not treated as an error.
 *
 * Returns ncclSuccess, the error from getBusId(), or the error produced by
 * CUDACHECKGOTO when cudaStreamDestroy fails. The registry lock is always
 * released before returning.
 */
static inline ncclResult_t ncclDestroySideStream(int cudaDev) {
  ncclResult_t res = ncclSuccess;
  int64_t busId;
  NCCLCHECK(getBusId(cudaDev, &busId));
  pthread_mutex_lock(&sideStreamLock);
  if (auto it = sideStream.find(busId); it != sideStream.end()) {
    if (it->second.refCount > 1) {
      it->second.refCount--;
      INFO(NCCL_ALLOC, "Side stream %p of dev %d busid %lx dec count to %ld",
           it->second.stream, cudaDev, busId, it->second.refCount);
    } else {
      // Last reference. Unpublish the entry BEFORE destroying the stream so
      // that a failing cudaStreamDestroy cannot leave a zero-count entry with
      // a stale handle behind: getSideStream would keep returning it and the
      // next destroy would wrap the unsigned counter.
      cudaStream_t stream = it->second.stream;
      sideStream.erase(it);
      CUDACHECKGOTO(cudaStreamDestroy(stream), res, fail);
      // Log only once the destroy call has actually succeeded.
      INFO(NCCL_ALLOC, "Destroyed side stream %p of dev %d busid %lx",
           stream, cudaDev, busId);
    }
  } else {
    WARN("Side stream of dev %d busid %lx was not found for destroy", cudaDev, busId);
  }
fail:
  pthread_mutex_unlock(&sideStreamLock);
  return res;
}
|
||||
|
||||
/*
 * Look up the side stream registered for the calling thread's current device.
 *
 * On a hit, `*stream` receives the registered handle. On a miss, `*stream` is
 * set to null and a warning is emitted; the function still returns
 * ncclSuccess so that callers can fall back to a stream of their own.
 *
 * Returns an error only if cudaGetDevice() or getBusId() fails, in which case
 * `*stream` is left untouched.
 */
static inline ncclResult_t getSideStream(cudaStream_t *stream) {
  int cudaDev;
  CUDACHECK(cudaGetDevice(&cudaDev));

  int64_t busId;
  NCCLCHECK(getBusId(cudaDev, &busId));

  pthread_mutex_lock(&sideStreamLock);
  auto entry = sideStream.find(busId);
  if (entry == sideStream.end()) {
    // Nothing registered for this device: report it and hand back null.
    *stream = nullptr;
    WARN("Side stream of dev %d busid %lx was not found", cudaDev, busId);
  } else {
    *stream = entry->second.stream;
    INFO(NCCL_ALLOC, "Found side stream %p of dev %d busid %lx count %ld",
         entry->second.stream, cudaDev, busId, entry->second.refCount);
  }
  pthread_mutex_unlock(&sideStreamLock);

  return ncclSuccess;
}
|
||||
|
||||
#if CUDART_VERSION >= 12020
|
||||
|
||||
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
|
||||
@@ -362,7 +435,7 @@ finish:
|
||||
#define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, unsigned int flags = hipDeviceMallocDefault) {
|
||||
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, unsigned int flags = hipDeviceMallocDefault) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
@@ -370,13 +443,15 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
|
||||
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream = sideStream;
|
||||
if (stream == nullptr)
|
||||
cudaStream_t stream, sidestream;
|
||||
NCCLCHECK(getSideStream(&sidestream));
|
||||
stream = sidestream;
|
||||
if (sidestream == nullptr)
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT<T>(), flags), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
if (sideStream == nullptr)
|
||||
if (sidestream == nullptr)
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
@@ -428,11 +503,15 @@ ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
|
||||
cudaStream_t stream, sidestream;
|
||||
NCCLCHECK(getSideStream(&sidestream));
|
||||
stream = sidestream;
|
||||
if (sidestream == nullptr)
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
if (sidestream == nullptr)
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
|
||||
@@ -629,8 +629,6 @@ struct ncclComm {
|
||||
|
||||
struct ncclKernelPlanner planner;
|
||||
|
||||
hipStream_t sideStream; // [RCCL] Cached non-captured stream
|
||||
|
||||
cudaMemPool_t memPool;
|
||||
// Queue of events and associated callbacks for cleaning up asynchronous work.
|
||||
// Using this is preferable to using CUDA host callbacks because host callbacks
|
||||
|
||||
@@ -180,7 +180,7 @@ static gdr_t ncclGdrInit() {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
|
||||
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
|
||||
// gdr_info_t info; // unused variable - compiler warning
|
||||
size_t mapSize;
|
||||
// gdr_mh_t mh; // unused variable - compiler warning
|
||||
@@ -193,9 +193,9 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
|
||||
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
|
||||
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocUncached));
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, hipDeviceMallocUncached));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocFinegrained));
|
||||
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
gdr_mem_desc_t* md;
|
||||
NCCLCHECK(ncclCalloc(&md, 1));
|
||||
|
||||
@@ -94,7 +94,7 @@ NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT);
|
||||
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT);
|
||||
|
||||
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
|
||||
static ncclResult_t commReclaim(ncclComm_t comm);
|
||||
ncclResult_t commReclaim(ncclComm_t comm);
|
||||
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
size_t std::hash<ncclUniqueId>::operator ()(const ncclUniqueId& uniqueId) const noexcept {
|
||||
@@ -521,7 +521,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
NCCLCHECK(dtor->fn(dtor));
|
||||
dtor = dtor->next;
|
||||
}
|
||||
CUDACHECK(hipStreamDestroy(comm->sideStream));
|
||||
|
||||
ncclMemoryStackDestruct(&comm->memScoped);
|
||||
ncclMemoryStackDestruct(&comm->memPermanent);
|
||||
@@ -544,6 +543,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
NCCLCHECK(ncclNvlsSymmetricFinalize(comm));
|
||||
NCCLCHECK(ncclIpcSymmetricFinalize(comm));
|
||||
}
|
||||
NCCLCHECK(ncclDestroySideStream(comm->cudaDev));
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy");
|
||||
|
||||
commPoison(comm); // poison comm before free to avoid comm reuse.
|
||||
@@ -650,6 +650,9 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
comm->lastStream = nullptr;
|
||||
CUDACHECK(cudaGetDevice(&comm->cudaDev));
|
||||
|
||||
// RCCL: create persistent stream for calloc
|
||||
NCCLCHECK(ncclCreateSideStream(comm->cudaDev));
|
||||
|
||||
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
|
||||
// but can be enabled via environment variable
|
||||
if (rcclParamEnableContextTracking() == 1) {
|
||||
@@ -666,9 +669,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
comm->compCap = ncclCudaCompCap();
|
||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
|
||||
|
||||
// RCCL: create persistent stream for calloc
|
||||
CUDACHECK(hipStreamCreateWithFlags(&comm->sideStream, hipStreamNonBlocking));
|
||||
|
||||
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
|
||||
comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
|
||||
|
||||
@@ -819,7 +819,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
|
||||
if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
|
||||
// The workFifoBuf lives in GDR mapped CUDA memory.
|
||||
NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle, comm->sideStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle), ret, fail);
|
||||
ncclCommPushCudaGdrFree(comm, comm->workFifoBufGdrHandle);
|
||||
} else {
|
||||
// The workFifoBuf lives in cudaHost memory.
|
||||
@@ -876,7 +876,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
NCCLCHECK(ncclCudaCalloc(&tmpCommAndChans.comm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES, comm->sideStream));
|
||||
NCCLCHECK(ncclCudaCalloc(&tmpCommAndChans.comm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES));
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_FAULT_INJECTION
|
||||
|
||||
@@ -409,9 +409,9 @@ static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int
|
||||
|
||||
if (cuda && collNet->cudaBuff == NULL) {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
if (!cuda && collNet->hostBuff == NULL) {
|
||||
@@ -504,7 +504,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
|
||||
if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
|
||||
uint64_t *cpuPtr, *gpuPtr;
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr));
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
|
||||
|
||||
resources->gdcSync = cpuPtr;
|
||||
struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
|
||||
@@ -582,7 +582,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
|
||||
if (ncclGdrCopy) {
|
||||
uint64_t *cpuPtr, *gpuPtr;
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr));
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
|
||||
|
||||
if (ncclParamGdrCopySyncEnable()) {
|
||||
resources->gdcSync = cpuPtr;
|
||||
|
||||
@@ -597,14 +597,14 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
|
||||
} else {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
#if defined(HIP_CONTIGUOUS_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size,
|
||||
cuda ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size,
|
||||
cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#endif
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size,
|
||||
cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
@@ -888,14 +888,14 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
} else {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
#if defined(HIP_CONTIGUOUS_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size,
|
||||
resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size,
|
||||
resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#endif
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size,
|
||||
resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
@@ -914,7 +914,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
}
|
||||
if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
|
||||
uint64_t *cpuPtr, *gpuPtr;
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr));
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
|
||||
|
||||
resources->gdcSync = cpuPtr;
|
||||
struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
|
||||
@@ -1091,14 +1091,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
} else {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
#if defined(HIP_CONTIGUOUS_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size,
|
||||
resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size,
|
||||
resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
|
||||
#endif
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size,
|
||||
resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
|
||||
#endif
|
||||
}
|
||||
@@ -1109,7 +1109,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
|
||||
if (ncclGdrCopy && map->sameProcess) {
|
||||
uint64_t *cpuPtr, *gpuPtr;
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr));
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
|
||||
|
||||
if (ncclParamGdrCopySyncEnable()) {
|
||||
resources->gdcSync = cpuPtr;
|
||||
|
||||
@@ -1855,9 +1855,9 @@ ib_recv:
|
||||
if (rComm->flushEnabled) {
|
||||
if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) {
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocUncached), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), hipDeviceMallocUncached), ret, fail);
|
||||
#else
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocFinegrained), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), hipDeviceMallocFinegrained), ret, fail);
|
||||
#endif
|
||||
if (useDmaBuf)
|
||||
{
|
||||
|
||||
@@ -247,9 +247,9 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe
|
||||
} else {
|
||||
// Allocate a CUDA buffer and generate an IPC handle for it
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocUncached));
|
||||
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, hipDeviceMallocUncached));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocFinegrained));
|
||||
NCCLCHECK(ncclCudaCalloc((char **)ptr, size, hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr);
|
||||
if (res != cudaSuccess) {
|
||||
@@ -667,9 +667,9 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
|
||||
connection->transportResources = proxyInfo;
|
||||
|
||||
#if defined(HIP_UNCACHED_MEMORY)
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocUncached));
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], hipDeviceMallocUncached));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocFinegrained));
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], hipDeviceMallocFinegrained));
|
||||
#endif
|
||||
|
||||
// Create a SHM segment for the peer to attach to
|
||||
|
||||
Reference in new issue
Block a user