a46ea10583
Add support for CUDA graphs. Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439. Fix bootstrap issue caused by connection reordering. Fix CPU locking block. Improve CollNet algorithm. Improve performance on DGX A100 for communicators with only one GPU per node.
252 lines
8.5 KiB
C++
252 lines
8.5 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_GDRWRAP_H_
|
|
#define NCCL_GDRWRAP_H_
|
|
|
|
#include "nccl.h"
|
|
#include <stdint.h> // for standard [u]intX_t types
|
|
#include <stdio.h>
|
|
|
|
// These can be used if the GDR library isn't thread safe
|
|
#include <pthread.h>
|
|
extern pthread_mutex_t gdrLock;
|
|
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
|
|
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
|
|
#define GDRLOCKCALL(cmd, ret) do { \
|
|
GDRLOCK(); \
|
|
ret = cmd; \
|
|
GDRUNLOCK(); \
|
|
} while(false)
|
|
|
|
#define GDRCHECK(cmd) do { \
|
|
int e; \
|
|
/* GDRLOCKCALL(cmd, e); */ \
|
|
e = cmd; \
|
|
if( e != 0 ) { \
|
|
WARN("GDRCOPY failure %d", e); \
|
|
return ncclSystemError; \
|
|
} \
|
|
} while(false)
|
|
|
|
// This is required as the GDR memory is mapped WC
|
|
#if !defined(__NVCC__)
|
|
#if defined(__PPC__)
|
|
static inline void wc_store_fence(void) { asm volatile("sync") ; }
|
|
#elif defined(__x86_64__)
|
|
#include <immintrin.h>
|
|
static inline void wc_store_fence(void) { _mm_sfence(); }
|
|
#elif defined(__aarch64__)
|
|
#ifdef __cplusplus
|
|
#include <atomic>
|
|
static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
|
|
#else
|
|
#include <stdatomic.h>
|
|
static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
|
|
#endif
|
|
#endif
|
|
#endif
|
|
|
|
//#define GDR_DIRECT 1
|
|
#ifdef GDR_DIRECT
|
|
// Call the GDR API library code directly rather than via
|
|
// dlopen() wrappers
|
|
#include <gdrapi.h>
|
|
|
|
static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
|
|
static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
|
|
static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
|
|
static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
|
|
GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
|
|
return ncclSuccess;
|
|
}
|
|
static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
|
|
GDRCHECK(gdr_unpin_buffer(g, handle));
|
|
return ncclSuccess;
|
|
}
|
|
static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
|
|
GDRCHECK(gdr_get_info(g, handle, info));
|
|
return ncclSuccess;
|
|
}
|
|
static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
|
|
GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size));
|
|
return ncclSuccess;
|
|
}
|
|
static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
|
|
GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size));
|
|
return ncclSuccess;
|
|
}
|
|
static void wrap_gdr_runtime_get_version(int *major, int *minor) {
|
|
gdr_runtime_get_version(major, minor);
|
|
return ncclSuccess;
|
|
}
|
|
static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
|
|
gdr_driver_get_version(g, major, minor);
|
|
return ncclSuccess;
|
|
}
|
|
static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
|
|
GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
|
|
return ncclSuccess;
|
|
}
|
|
static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
|
|
GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
#else
|
|
// Dynamically handle dependency the GDR API library
|
|
|
|
/* Extracted from gdrapi.h (v2.1 Nov 2020) */
|
|
|
|
#define GPU_PAGE_SHIFT 16
|
|
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
|
|
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
|
|
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
|
|
|
|
struct gdr;
|
|
typedef struct gdr *gdr_t;
|
|
|
|
typedef struct gdr_mh_s {
|
|
unsigned long h;
|
|
} gdr_mh_t;
|
|
|
|
struct gdr_info {
|
|
uint64_t va;
|
|
uint64_t mapped_size;
|
|
uint32_t page_size;
|
|
uint64_t tm_cycles;
|
|
uint32_t cycles_per_ms;
|
|
unsigned mapped:1;
|
|
unsigned wc_mapping:1;
|
|
};
|
|
typedef struct gdr_info gdr_info_t;
|
|
|
|
/* End of gdrapi.h */
|
|
|
|
ncclResult_t wrap_gdr_symbols(void);
|
|
|
|
gdr_t wrap_gdr_open(void);
|
|
ncclResult_t wrap_gdr_close(gdr_t g);
|
|
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
|
|
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
|
|
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
|
|
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
|
|
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
|
|
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
|
|
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
|
|
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
|
|
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
|
|
|
|
#endif // GDR_DIRECT
|
|
|
|
// Global GDR driver handle
|
|
extern gdr_t ncclGdrCopy;
|
|
|
|
#include "alloc.h"
|
|
|
|
typedef struct gdr_mem_desc {
|
|
void *gdrDevMem;
|
|
void *gdrMap;
|
|
size_t gdrOffset;
|
|
size_t gdrMapSize;
|
|
gdr_mh_t gdrMh;
|
|
} gdr_mem_desc_t;
|
|
|
|
static gdr_t ncclGdrInit() {
|
|
int libMajor, libMinor, drvMajor, drvMinor;
|
|
gdr_t handle = NULL;
|
|
// Dynamically load the GDRAPI library symbols
|
|
if (wrap_gdr_symbols() == ncclSuccess) {
|
|
handle = wrap_gdr_open();
|
|
|
|
if (handle != NULL) {
|
|
ncclResult_t res;
|
|
|
|
// Query the version of libgdrapi
|
|
NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
|
|
|
|
// Query the version of gdrdrv driver
|
|
NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
|
|
|
|
// Only support GDRAPI 2.1 and later
|
|
if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
|
|
goto error;
|
|
}
|
|
else
|
|
INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
|
|
}
|
|
}
|
|
return handle;
|
|
error:
|
|
if (handle != NULL) (void) wrap_gdr_close(handle);
|
|
return NULL;
|
|
}
|
|
|
|
template <typename T>
|
|
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
|
|
gdr_info_t info;
|
|
size_t mapSize;
|
|
gdr_mh_t mh;
|
|
char *devMem;
|
|
void *gdrMap;
|
|
|
|
mapSize = sizeof(T)*nelem;
|
|
|
|
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
|
|
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
|
|
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
|
|
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
|
|
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
|
|
size_t align = alignedAddr - (uint64_t)devMem;
|
|
|
|
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
|
|
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
|
|
|
|
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
|
|
//TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);
|
|
|
|
NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info));
|
|
|
|
// Will offset ever be non zero ?
|
|
ssize_t off = info.va - alignedAddr;
|
|
|
|
gdr_mem_desc_t* md;
|
|
NCCLCHECK(ncclCalloc(&md, 1));
|
|
md->gdrDevMem = devMem;
|
|
md->gdrMap = gdrMap;
|
|
md->gdrMapSize = mapSize;
|
|
md->gdrOffset = off+align;
|
|
md->gdrMh = mh;
|
|
*gdrHandle = md;
|
|
|
|
*ptr = (T *)((char *)gdrMap+off);
|
|
if (devPtr) *devPtr = (T *)(devMem+off+align);
|
|
|
|
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
|
|
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
|
|
|
|
return ncclSuccess;
|
|
}
|
|
|
|
template <typename T>
|
|
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
|
|
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
|
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
|
|
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
|
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
|
|
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
|
|
CUDACHECK(cudaFree(md->gdrDevMem));
|
|
free(md);
|
|
|
|
return ncclSuccess;
|
|
}
|
|
|
|
#endif // End include guard
|