2.4.2-1
Add tree algorithms for allreduce to improve performance at scale. Add ncclCommAbort() and ncclCommGetAsyncError() to properly handle network errors and permit recovery. Detect initial CPU affinity and no longer escape it.
此提交包含在:
+3
-4
@@ -15,8 +15,7 @@ PROFAPI ?= 0
|
||||
NVCC = $(CUDA_HOME)/bin/nvcc
|
||||
|
||||
CUDA_LIB ?= $(CUDA_HOME)/lib64
|
||||
CUDA_INC ?= $(CUDA_HOME)/include
|
||||
CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
||||
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
||||
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
|
||||
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
@@ -36,14 +35,14 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
||||
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
||||
|
||||
# Include Volta support if we're using CUDA9 or above
|
||||
ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
|
||||
else
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
|
||||
endif
|
||||
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
|
||||
|
||||
CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
|
||||
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
|
||||
CXXFLAGS += -Wall -Wno-sign-compare
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
||||
# Use addprefix so that we can specify more than one path
|
||||
|
||||
新增問題並參考
封鎖使用者