Add support for inter-node communication using sockets and InfiniBand/RoCE.
Improve latency.
Add support for aggregation.
Improve LL/regular tuning.
Remove tests, as those are now at github.com/nvidia/nccl-tests.
This commit is contained in:
Sylvain Jeaugey
2018-09-24 16:06:59 -07:00
parent 286916a1a3
commit f93fe9bfd9
132 changed files with 12424 additions and 9415 deletions
+62
View File
@@ -0,0 +1,62 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
# Root of the build tree; callers may override it, and it is forwarded to every sub-make.
BUILDDIR ?= $(abspath ../../build)
# Staging directory for the generated packaging files, and the rpmbuild output tree.
RPMPREPDIR := $(BUILDDIR)/redhat
PKGDIR := $(BUILDDIR)/pkg/rpm/

# Every *.in template in this directory produces a staged file named without the ".in" suffix.
RPMGEN_IN := $(wildcard *.in)
RPMGEN := $(RPMGEN_IN:.in=)
RPMFILES := $(RPMGEN)
RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))

# Values substituted into the templates.
PKG_TIMESTAMP := $(shell date -R)
ARCH := $(shell uname -m)
# Defaults to the host architecture. Reuses ARCH so that `uname -m` is forked once
# at parse time rather than again on every expansion of PKG_ARCH.
PKG_ARCH ?= $(ARCH)
# Same "assign only when undefined" condition as `?=`, but with an immediate
# assignment so the compiler is queried once instead of on every expansion.
# stderr is discarded so a compiler that rejects the flag simply yields an empty value.
ifeq ($(origin PKG_MULTIARCH),undefined)
PKG_MULTIARCH := $(shell $(CXX) -print-multiarch 2>/dev/null)
endif
ifeq ($(strip $(PKG_MULTIARCH)),)
# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
PKG_MULTIARCH := $(ARCH)-linux-gnu
endif
# Stage every generated packaging file, then run the top-level `lic` target against
# the same build directory (presumably this stages the license file -- confirm in ../../Makefile).
# Declared phony so that a stray file named "prep" can never mask the target.
.PHONY: prep
prep : $(RPMTARGETS)
	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
# Build the binary RPMs: build the sources, build the txz archive that rpmbuild
# consumes as its source directory, then run rpmbuild on the staged spec.
# Declared phony so that a stray file named "build" can never mask the target.
.PHONY: build
build : prep
	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
	$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
	@printf "Building Redhat package\n"
	mkdir -p $(PKGDIR)
	# The spec is read from RPMPREPDIR, the directory `prep` stages it into, so the
	# two stay in agreement even when RPMPREPDIR is overridden.
	rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
	         --define "_rpmdir $(PKGDIR)" \
	         --define "_builddir $(PKGDIR)/build/" \
	         --define "_buildrootdir $(PKGDIR)/buildroot/" \
	         -bb $(RPMPREPDIR)/nccl.spec
# Remove the staged packaging files and everything rpmbuild produced.
# Declared phony: without it, a file named "clean" (the target has no
# prerequisites) would make this rule permanently "up to date".
.PHONY: clean
clean:
	rm -Rf $(RPMPREPDIR) $(PKGDIR)
# Generate a staged file from its "<name>.in" template by replacing the
# ${nccl:*}, ${cuda:*} and ${pkg:*} placeholders with the corresponding make variables.
#
# Escaping: make reduces `$$` to `$`, and the preceding backslash stops the shell
# from treating it as a parameter expansion inside the double quotes, so sed
# receives the literal placeholder text.
#
# The output is written to a temporary file and renamed only after sed succeeds;
# redirecting straight into $@ would leave a truncated but newer target behind on
# failure, which make would then consider up to date.
$(RPMPREPDIR)/% : %.in
	@printf "Generating %-35s > %s\n" $< $@
	mkdir -p $(@D)
	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
	    $< > $@.tmp
	mv -f $@.tmp $@
# Stage a file that needs no substitution by copying it verbatim.
# The directory is derived from the target itself, so a stem containing a
# sub-directory is created too instead of failing in cp.
$(RPMPREPDIR)/% : %
	@printf "Grabbing %-35s > %s\n" $< $@
	mkdir -p $(@D)
	cp -f $< $@
+73
View File
@@ -0,0 +1,73 @@
# Package metadata for the NCCL runtime and its devel and static subpackages.
# Placeholders are filled in when the packaging Makefile generates the final spec.
# The debug_package macro is set to nil at the end of this section
# (conventionally this turns off debuginfo subpackage generation).
Name:           libnccl
Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
Release:        ${pkg:Revision}
Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
Group:          Development/Libraries
License:        BSD
URL:            http://developer.nvidia.com/nccl
Source0:        nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz

%description
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.

%package devel
Summary:        NVIDIA Collectives Communication Library (NCCL) Development Files
Group:          Development/Libraries
# The unversioned libnccl.so symlink shipped in this subpackage points at the
# runtime library, so the matching runtime package must be installed with it.
Requires:       %{name} = %{version}-%{release}

%description devel
NCCL development files

%package static
Summary:        NVIDIA Collectives Communication Library (NCCL) Static Library
Group:          Development/Libraries
# The header needed to use the archive lives in the devel subpackage.
Requires:       %{name}-devel = %{version}-%{release}

%description static
NCCL static library

%define debug_package %{nil}
%prep
# Unpack the source archive quietly; -n names the directory it extracts into.
%setup -q -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}

%build
# Intentionally empty: the install step copies files straight from the unpacked archive.

%install
# Start from an empty build root.
rm -rf %{buildroot}
install -d -m 755 %{buildroot}
install -d -m 755 %{buildroot}/%{_libdir}

# runtime: the fully versioned shared library plus its major-version symlink
install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} %{buildroot}/%{_libdir}
ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} %{buildroot}/%{_libdir}/libnccl.so.${nccl:Major}

# devel: public header plus the unversioned link-time symlink
install -d -m 755 %{buildroot}/%{_includedir}
install -m 644 include/nccl.h %{buildroot}/%{_includedir}
ln -s libnccl.so.${nccl:Major} %{buildroot}/%{_libdir}/libnccl.so

# static: the static archive
install -m 644 lib/libnccl_static.a %{buildroot}/%{_libdir}

%clean
rm -rf %{buildroot}
# Refresh the dynamic linker cache whenever the shared library is installed or removed.
%post -p /sbin/ldconfig

%postun -p /sbin/ldconfig

%files devel
# defattr comes first so that it also covers the licence file listed below it.
%defattr(-,root,root,-)
%doc LICENSE.txt
%{_includedir}/nccl.h
%{_libdir}/libnccl.so

%files static
%defattr(-,root,root,-)
%doc LICENSE.txt
%{_libdir}/libnccl_static.a

%files
%defattr(-,root,root,-)
%doc LICENSE.txt
%{_libdir}/libnccl.so.${nccl:Major}
%{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}

%changelog