src/include/alloc.h

/*************************************************************************
 * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_ALLOC_H_
#define NCCL_ALLOC_H_

#include "nccl.h"
#include "checks.h"
#include "align.h"
#include <sys/mman.h>

// Allocates room for nelem elements of T with cudaHostAlloc (cudaHostAllocMapped
// flag), zero-fills the buffer from the host and stores its address in *ptr.
// filefunc/line identify the call site in the NCCL_ALLOC log line.
// Returns ncclSuccess on the normal path. Failure handling of the CUDA call is
// left to CUDACHECK (defined outside this file; presumably an early error
// return -- confirm in checks.h).
template <typename T>
static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
  const size_t nbytes = nelem*sizeof(T);
  CUDACHECK(cudaHostAlloc(ptr, nbytes, cudaHostAllocMapped));
  T* hostBuf = *ptr;
  memset(hostBuf, 0, nbytes);
  INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nbytes, hostBuf);
  return ncclSuccess;
}
// Call-site wrapper: forwards its arguments and appends the caller's file and line.
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

// Releases host memory by passing ptr to cudaFreeHost.
// Returns ncclSuccess when execution reaches the end of the function; failure
// handling is left to CUDACHECK (defined outside this file; presumably an
// early error return -- confirm in checks.h).
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
  CUDACHECK(cudaFreeHost(ptr));
  return ncclSuccess;
}

// Allocates a zero-initialized host array of nelem elements of type T and
// stores its address in *ptr.
//   ptr      - out: receives the new buffer; left untouched on failure.
//   nelem    - number of elements (not bytes).
//   filefunc - call-site file name, used only for the NCCL_ALLOC log line.
//   line     - call-site line number, used only for the NCCL_ALLOC log line.
// Returns ncclSuccess, or ncclSystemError when the allocation fails.
template <typename T>
static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
  // calloc zero-fills the block and, unlike malloc(nelem*sizeof(T)), reports
  // failure instead of silently under-allocating when nelem*sizeof(T) would
  // overflow size_t.
  void* p = calloc(nelem, sizeof(T));
  if (p == NULL) {
    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
    return ncclSystemError;
  }
  *ptr = (T*)p;
  INFO(NCCL_ALLOC, "%s:%d Mem Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
  return ncclSuccess;
}
// Call-site wrapper: forwards its arguments and appends the caller's file and line.
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

// Allocates nelem elements of type T with cudaMalloc, zero-fills them with
// cudaMemsetAsync on a temporary non-blocking stream, waits for that stream,
// and stores the device pointer in *ptr.
//   ptr      - out: receives the pointer written by cudaMalloc.
//   nelem    - number of elements (not bytes).
//   filefunc - call-site file name, used only for the NCCL_ALLOC log line.
//   line     - call-site line number, used only for the NCCL_ALLOC log line.
// Returns ncclSuccess on the normal path; CUDA failures are reported through
// CUDACHECK (defined outside this file; presumably an early error return --
// confirm in checks.h).
template <typename T>
static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
  // Need async stream for P2P pre-connect + CUDA Graph
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // Collect the status of the allocate/zero/sync sequence by hand rather than
  // wrapping each call in CUDACHECK: an early return here would skip
  // cudaStreamDestroy below and leak the stream on every failure.
  const size_t nbytes = nelem*sizeof(T);
  cudaError_t zeroStatus = cudaMalloc(ptr, nbytes);
  if (zeroStatus == cudaSuccess) zeroStatus = cudaMemsetAsync(*ptr, 0, nbytes, stream);
  if (zeroStatus == cudaSuccess) zeroStatus = cudaStreamSynchronize(stream);

  // Destroy the stream whether or not the steps above succeeded, then report
  // the first failure (if any) through the usual CUDACHECK path.
  cudaError_t destroyStatus = cudaStreamDestroy(stream);
  CUDACHECK(zeroStatus);
  CUDACHECK(destroyStatus);

  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nbytes, *ptr);
  return ncclSuccess;
}
// Call-site wrapper: forwards its arguments and appends the caller's file and line.
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

// Copies nelem elements of type T from src to dst with a blocking cudaMemcpy.
// The copy kind is cudaMemcpyDefault, so the direction is not specified here.
// Returns ncclSuccess on the normal path; failure handling is left to
// CUDACHECK (defined outside this file; presumably an early error return --
// confirm in checks.h).
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
  const size_t nbytes = nelem*sizeof(T);
  CUDACHECK(cudaMemcpy(dst, src, nbytes, cudaMemcpyDefault));
  return ncclSuccess;
}

// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
// allocated on separate pages as those pages will be marked DONTFORK
// and if they are shared, that could cause a crash in a child process
//   ptr      - out: receives the page-aligned buffer; left untouched on failure.
//   size     - requested size in bytes; the allocation is rounded up to a
//              whole number of pages, and the first `size` bytes are zeroed.
//   filefunc - call-site file name, used only for the NCCL_ALLOC log line.
//   line     - call-site line number, used only for the NCCL_ALLOC log line.
// Returns ncclSuccess, or ncclSystemError when posix_memalign fails.
static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
  size_t page_size = sysconf(_SC_PAGESIZE);
  void* p;
  // Keep the rounded size in size_t: storing it in an int truncated requests
  // of 2 GiB or more, so posix_memalign received a wrong (possibly negative,
  // then huge) size while the memset below still wrote `size` bytes.
  size_t size_aligned = ROUNDUP(size, page_size);
  int ret = posix_memalign(&p, page_size, size_aligned);
  if (ret != 0) return ncclSystemError;
  memset(p, 0, size);
  *ptr = p;
  INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
  return ncclSuccess;
}
// Call-site wrapper: forwards its arguments and appends the caller's file and line.
#define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)

#endif
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`/*************************************************************************`
2.9.6-1 2021-04-12 16:00:11 -07:00			`* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`*`
			`* See LICENSE.txt for license information`
			`************************************************************************/`

			`#ifndef NCCL_ALLOC_H_`
			`#define NCCL_ALLOC_H_`

			`#include "nccl.h"`
			`#include "checks.h"`
2.6.4-1 2020-01-16 16:02:42 -08:00			`#include "align.h"`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`#include <sys/mman.h>`

2.7.3-1 2020-05-12 14:40:18 -07:00			`template <typename T>`
2.10.3-1 2021-07-08 14:12:04 -07:00			`static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {`
2.7.3-1 2020-05-12 14:40:18 -07:00			`CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));`
			`memset(*ptr, 0, nelem*sizeof(T));`
2.10.3-1 2021-07-08 14:12:04 -07:00			`INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`return ncclSuccess;`
			`}`
2.10.3-1 2021-07-08 14:12:04 -07:00			`#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
			`static inline ncclResult_t ncclCudaHostFree(void* ptr) {`
			`CUDACHECK(cudaFreeHost(ptr));`
			`return ncclSuccess;`
			`}`

			`template <typename T>`
2.10.3-1 2021-07-08 14:12:04 -07:00			`static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`void* p = malloc(nelem*sizeof(T));`
			`if (p == NULL) {`
			`WARN("Failed to malloc %ld bytes", nelem*sizeof(T));`
			`return ncclSystemError;`
			`}`
			`memset(p, 0, nelem*sizeof(T));`
			`*ptr = (T*)p;`
2.10.3-1 2021-07-08 14:12:04 -07:00			`INFO(NCCL_ALLOC, "%s:%d Mem Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`return ncclSuccess;`
			`}`
2.10.3-1 2021-07-08 14:12:04 -07:00			`#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
			`template <typename T>`
2.10.3-1 2021-07-08 14:12:04 -07:00			`static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {`
2.9.6-1 2021-04-12 16:00:11 -07:00			`// Need async stream for P2P pre-connect + CUDA Graph`
			`cudaStream_t stream;`
			`CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));`
2.9.6-1 2021-04-12 16:00:11 -07:00			`CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));`
			`CUDACHECK(cudaStreamSynchronize(stream));`
			`CUDACHECK(cudaStreamDestroy(stream));`
2.10.3-1 2021-07-08 14:12:04 -07:00			`INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`return ncclSuccess;`
			`}`
2.10.3-1 2021-07-08 14:12:04 -07:00			`#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)`
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00
			`template <typename T>`
			`static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {`
			`CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));`
			`return ncclSuccess;`
			`}`

2.6.4-1 2020-01-16 16:02:42 -08:00			`// Allocate memory to be potentially ibv_reg_mr'd. This needs to be`
			`// allocated on separate pages as those pages will be marked DONTFORK`
			`// and if they are shared, that could cause a crash in a child process`
2.10.3-1 2021-07-08 14:12:04 -07:00			`static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {`
2.6.4-1 2020-01-16 16:02:42 -08:00			`size_t page_size = sysconf(_SC_PAGESIZE);`
			`void* p;`
			`int size_aligned = ROUNDUP(size, page_size);`
			`int ret = posix_memalign(&p, page_size, size_aligned);`
			`if (ret != 0) return ncclSystemError;`
			`memset(p, 0, size);`
			`*ptr = p;`
2.10.3-1 2021-07-08 14:12:04 -07:00			`INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);`
2.6.4-1 2020-01-16 16:02:42 -08:00			`return ncclSuccess;`
			`}`
2.10.3-1 2021-07-08 14:12:04 -07:00			`#define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)`
2.6.4-1 2020-01-16 16:02:42 -08:00
NCCL 2.4.6-1 2019-03-14 19:39:20 -07:00			`#endif`