Файли
rocm-systems/src/include/alloc.h
T

182 рядки
7.2 KiB
C++
Неформатований Звичайний вигляд Історія

2019-03-14 19:39:20 -07:00
/*************************************************************************
 * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
#ifndef NCCL_ALLOC_H_
#define NCCL_ALLOC_H_
#include "nccl.h"
#include "checks.h"
2020-01-16 16:02:42 -08:00
#include "align.h"
2022-05-24 02:02:31 -07:00
#include "utils.h"
2019-03-14 19:39:20 -07:00
#include <sys/mman.h>
2021-09-08 13:56:25 -07:00
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
2019-03-14 19:39:20 -07:00
2022-05-24 02:02:31 -07:00
// Forward declaration only: clockNano() comes from utils.h, with which this
// header has a circular dependency.
uint64_t clockNano(); // from utils.h with which we have a circular dependency
2020-05-12 14:40:18 -07:00
template <typename T>
2022-05-24 02:02:31 -07:00
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish);
2020-05-12 14:40:18 -07:00
memset(*ptr, 0, nelem*sizeof(T));
2022-05-24 02:02:31 -07:00
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
2022-08-18 02:53:17 -07:00
if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
2022-05-24 02:02:31 -07:00
return result;
2019-03-14 19:39:20 -07:00
}
2021-07-08 14:12:04 -07:00
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
2019-03-14 19:39:20 -07:00
2022-05-24 02:02:31 -07:00
// Releases a host buffer with cudaFreeHost().
//
//   ptr - buffer to release
//
// Returns ncclSuccess, or whatever CUDACHECK (see checks.h) returns when
// cudaFreeHost() reports an error.
inline ncclResult_t ncclCudaHostFree(void* ptr) {
  CUDACHECK(cudaFreeHost(ptr));
  return ncclSuccess;
}
template <typename T>
2022-05-24 02:02:31 -07:00
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
2019-03-14 19:39:20 -07:00
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
2022-01-07 06:39:55 -08:00
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
2019-03-14 19:39:20 -07:00
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
2022-01-07 06:39:55 -08:00
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
template <typename T>
2022-05-24 02:02:31 -07:00
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
2022-01-07 06:39:55 -08:00
if (nelem < oldNelem) return ncclInternalError;
if (nelem == oldNelem) return ncclSuccess;
T* oldp = *ptr;
T* p = (T*)malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
memcpy(p, oldp, oldNelem*sizeof(T));
free(oldp);
memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
*ptr = (T*)p;
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
return ncclSuccess;
}
2019-03-14 19:39:20 -07:00
template <typename T>
2022-05-24 02:02:31 -07:00
ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
2022-08-18 02:53:17 -07:00
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
2022-05-24 02:02:31 -07:00
return result;
}
#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// Allocates `nelem` elements of T with cudaMalloc() and clears them to zero,
// waiting for the clear to complete before returning.
//
// The clear is issued on a private non-blocking stream so that it does not
// interfere with graph capture. The calling thread's stream capture mode is
// exchanged for cudaStreamCaptureModeRelaxed for the duration of the call.
// Every exit path after the initial exchange destroys the private stream
// (when it was created) and attempts to exchange the capture mode back.
//
//   ptr      - receives the allocation; reset to nullptr before anything
//              else. NOTE: if cudaMalloc() succeeds but a later step fails,
//              *ptr still holds the allocation while an error is returned.
//   nelem    - number of elements of type T to allocate
//   filefunc - caller location string, only used in the log line
//   line     - caller line number, only used in the log line
//
// Returns ncclSuccess unless a CUDA call fails, in which case the error
// produced by CUDACHECK / CUDACHECKGOTO (see checks.h) is returned.
template <typename T>
ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  cudaStream_t stream;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  // Need a side stream so as not to interfere with graph capture.
  // If it cannot be created there is nothing to destroy, but the capture
  // mode must still be restored.
  CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, restore);
  CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, destroy);
  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, destroy);
  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, destroy);
destroy:
  // Reached on success and on every failure after the stream exists, so the
  // side stream is never leaked.
  CUDACHECKGOTO(cudaStreamDestroy(stream), result, restore);
restore:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
  return result;
}
// Convenience wrapper that appends the call site (__FILE__, __LINE__).
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
2019-03-14 19:39:20 -07:00
template <typename T>
2022-05-24 02:02:31 -07:00
ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
2022-08-18 02:53:17 -07:00
if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
2022-05-24 02:02:31 -07:00
return result;
}
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
// Copies `nelem` elements of T from `src` to `dst` and waits for the copy
// to complete.
//
// The copy is delegated to ncclCudaMemcpyAsync() on a private non-blocking
// stream so that it does not interfere with graph capture; the stream is
// synchronized and then destroyed. The calling thread's stream capture mode
// is exchanged for cudaStreamCaptureModeRelaxed for the duration of the
// call. Every exit path after the initial exchange destroys the private
// stream (when it was created) and attempts to exchange the mode back.
//
//   dst   - destination buffer
//   src   - source buffer
//   nelem - number of elements of type T to copy
//
// Returns ncclSuccess unless a step fails, in which case the error produced
// by CUDACHECK / CUDACHECKGOTO / NCCLCHECKGOTO (see checks.h) is returned.
template <typename T>
ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  // Need a side stream so as not to interfere with graph capture.
  cudaStream_t stream;
  // No stream to destroy if creation fails; only the capture mode needs
  // restoring.
  CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
  NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, destroy);
  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, destroy);
destroy:
  // Reached on success and on every failure after the stream exists, so the
  // side stream is never leaked.
  CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  return result;
}
// Enqueues a copy of `nelem` elements of T from `src` to `dst` on `stream`
// using cudaMemcpyAsync() with cudaMemcpyDefault.
//
// This function does not synchronize `stream`. The calling thread's stream
// capture mode is exchanged for cudaStreamCaptureModeRelaxed around the
// call and exchanged back before the function returns through the normal
// exit path.
//
//   dst    - destination buffer
//   src    - source buffer
//   nelem  - number of elements of type T to copy
//   stream - stream on which the copy is enqueued
//
// Returns ncclSuccess unless a CUDA call fails, in which case the error
// produced by CUDACHECK / CUDACHECKGOTO (see checks.h) is returned.
template <typename T>
ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
  const size_t bytes = nelem*sizeof(T);
  ncclResult_t status = ncclSuccess;
  cudaStreamCaptureMode captureMode = cudaStreamCaptureModeRelaxed;

  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&captureMode));
  CUDACHECKGOTO(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDefault, stream), status, restore);

restore:
  // Hand the previous capture mode back to the thread.
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&captureMode));
  return status;
}
// Releases an allocation with cudaFree().
//
// The calling thread's stream capture mode is exchanged for
// cudaStreamCaptureModeRelaxed around the call and exchanged back before
// the function returns through the normal exit path.
//
//   ptr - allocation to release
//
// Returns ncclSuccess unless a CUDA call fails, in which case the error
// produced by CUDACHECK / CUDACHECKGOTO (see checks.h) is returned.
template <typename T>
ncclResult_t ncclCudaFree(T* ptr) {
  ncclResult_t status = ncclSuccess;
  cudaStreamCaptureMode captureMode = cudaStreamCaptureModeRelaxed;

  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&captureMode));
  CUDACHECKGOTO(cudaFree(ptr), status, restore);

restore:
  // Hand the previous capture mode back to the thread.
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&captureMode));
  return status;
}
2020-01-16 16:02:42 -08:00
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
// allocated on separate pages as those pages will be marked DONTFORK
// and if they are shared, that could cause a crash in a child process
2022-05-24 02:02:31 -07:00
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
2020-01-16 16:02:42 -08:00
size_t page_size = sysconf(_SC_PAGESIZE);
void* p;
int size_aligned = ROUNDUP(size, page_size);
int ret = posix_memalign(&p, page_size, size_aligned);
if (ret != 0) return ncclSystemError;
memset(p, 0, size);
*ptr = p;
2021-07-08 14:12:04 -07:00
INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
2020-01-16 16:02:42 -08:00
return ncclSuccess;
}
2021-07-08 14:12:04 -07:00
#define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
2020-01-16 16:02:42 -08:00
2019-03-14 19:39:20 -07:00
#endif