Add tree algorithms for allreduce to improve performance at scale.
Add ncclCommAbort() and ncclCommGetAsyncError() to properly handle
network errors and be permit recover.
Detect initial CPU affinity and no longer escape it.
此提交包含在:
Sylvain Jeaugey
2018-12-13 15:56:12 -08:00
父節點 4861e197fd
當前提交 1450d42675
共有 66 個檔案被更改,包括 3746 行新增3251 行删除
+3 -4
查看文件
@@ -15,8 +15,7 @@ PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
@@ -36,14 +35,14 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
# Include Volta support if we're using CUDA9 or above
ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
else
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-sign-compare
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path