Merge branch 'master' into xgmi_bench

[ROCm/rccl commit: deea20d49c]
Этот коммит содержится в:
rpathani
2019-08-16 10:56:56 +05:30
коммит произвёл GitHub
родитель 2dbcb62caf 761a2d2274
Коммит eaa1cdb48c
128 изменённых файлов: 7416 добавлений и 6097 удалений
+49 -40
Просмотреть файл
@@ -55,8 +55,16 @@ else()
endif()
# Setup VERSION
set(VERSION_STRING "2.6.0")
rocm_setup_version(VERSION ${VERSION_STRING})
set(VERSION_STRING "2.6.0.")
# Pick the last version component. When the BUILD_NUMBER environment variable
# (as exported by a Jenkins job) holds a plain non-negative integer, append it
# to VERSION_STRING; in every other case (unset, empty, non-numeric) append "0".
# The value is quoted and pattern-matched instead of being handed bare to if(),
# so it can never be re-interpreted as a boolean constant or as the name of a
# CMake variable.
if("$ENV{BUILD_NUMBER}" MATCHES "^[0-9]+$")
  string(CONCAT BUILD_VERSION "${VERSION_STRING}" "$ENV{BUILD_NUMBER}")
else()
  string(CONCAT BUILD_VERSION "${VERSION_STRING}" "0")
endif()
rocm_setup_version(VERSION ${BUILD_VERSION} NO_GIT_TAG_VERSION)
list(APPEND CMAKE_PREFIX_PATH
/opt/rocm
@@ -79,27 +87,12 @@ include_directories(src/collectives)
include_directories(src/collectives/device)
set(CU_SOURCES
src/bootstrap.cu
src/collectives/all_gather.cu
src/collectives/all_reduce.cu
src/collectives/broadcast.cu
src/collectives/reduce.cu
src/collectives/reduce_scatter.cu
src/collectives/device/functions.cu
src/init.cu
src/misc/enqueue.cu
src/misc/group.cu
src/misc/ibvwrap.cu
src/misc/nvmlwrap_stub.cu
src/misc/rings.cu
src/misc/utils.cu
src/ring.cu
src/transport.cu
src/transport/net.cu
src/transport/net_ib.cu
src/transport/net_socket.cu
src/transport/p2p.cu
src/transport/shm.cu)
src/collectives/device/all_reduce.cu
src/collectives/device/all_gather.cu
src/collectives/device/reduce.cu
src/collectives/device/broadcast.cu
src/collectives/device/reduce_scatter.cu
src/collectives/device/functions.cu)
set(CPP_SOURCES)
foreach(filename ${CU_SOURCES})
@@ -111,20 +104,34 @@ foreach(filename ${CU_SOURCES})
list(APPEND CPP_SOURCES ${cpp_filename})
endforeach(filename)
list(APPEND CPP_SOURCES src/collectives/device/all_gather_0.cpp)
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_0.cpp)
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_1.cpp)
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_2.cpp)
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_3.cpp)
list(APPEND CPP_SOURCES src/collectives/device/broadcast_0.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_0.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_1.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_2.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_3.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_0.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_1.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_2.cpp)
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_3.cpp)
# Sources that already carry a .cc extension. They are added to CPP_SOURCES
# unchanged, in the order listed here.
set(CC_SOURCES
  src/init.cc
  src/collectives/all_reduce.cc
  src/collectives/all_gather.cc
  src/collectives/reduce.cc
  src/collectives/broadcast.cc
  src/collectives/reduce_scatter.cc
  src/channel.cc
  src/misc/trees.cc
  src/misc/rings.cc
  src/misc/argcheck.cc
  src/misc/group.cc
  src/misc/utils.cc
  src/misc/ibvwrap.cc
  src/misc/nvmlwrap_stub.cc
  src/misc/topo.cc
  src/transport/net.cc
  src/transport/net_ib.cc
  src/transport/net_socket.cc
  src/transport/p2p.cc
  src/transport/shm.cc
  src/transport.cc
  src/bootstrap.cc
  src/enqueue.cc)

# Append every entry of CC_SOURCES to CPP_SOURCES in a single call.
list(APPEND CPP_SOURCES ${CC_SOURCES})
add_library(rccl ${CPP_SOURCES})
@@ -132,18 +139,20 @@ if(TRACE)
add_definitions(-DENABLE_TRACE)
endif()
if(PROFILE)
add_definitions(-DENABLE_PROFILING)
endif()
target_link_libraries(rccl
PRIVATE --amdgpu-target=gfx803
PRIVATE --amdgpu-target=gfx900
PRIVATE --amdgpu-target=gfx906
PRIVATE --amdgpu-target=gfx908)
PRIVATE --amdgpu-target=gfx906)
if("${HIP_COMPILER}" MATCHES "clang")
target_compile_options(rccl
PRIVATE --amdgpu-target=gfx803
PRIVATE --amdgpu-target=gfx900
PRIVATE --amdgpu-target=gfx906
PRIVATE --amdgpu-target=gfx908
PRIVATE -fgpu-rdc)
target_link_libraries(rccl PRIVATE -fgpu-rdc)
target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
поставляемый
+1 -1
Просмотреть файл
@@ -80,7 +80,7 @@ rcclCI:
sudo dpkg -i package/*.deb
"""
//platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""")
}
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+63 -63
Просмотреть файл
@@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES
# will be relative from the directory where doxygen is started.
# This tag requires that the tag FULL_PATH_NAMES is set to YES.
STRIP_FROM_PATH =
STRIP_FROM_PATH =
# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
# path mentioned in the documentation of a class, which tells the reader which
@@ -171,7 +171,7 @@ STRIP_FROM_PATH =
# specify the list of include paths that are normally passed to the compiler
# using the -I flag.
STRIP_FROM_INC_PATH =
STRIP_FROM_INC_PATH =
# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
# less readable) file names. This can be useful if your file system doesn't
@@ -238,13 +238,13 @@ TAB_SIZE = 4
# "Side Effects:". You can put \n's in the value part of an alias to insert
# newlines.
ALIASES =
ALIASES =
# This tag can be used to specify a number of word-keyword mappings (TCL only).
# A mapping has the form "name=value". For example adding "class=itcl::class"
# will allow you to use the command class in the itcl::class meaning.
TCL_SUBST =
TCL_SUBST =
# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
@@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
# the files are not read by doxygen.
EXTENSION_MAPPING =
EXTENSION_MAPPING =
# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
# according to the Markdown format, which allows for more readable
@@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES
# sections, marked by \if <section_label> ... \endif and \cond <section_label>
# ... \endcond blocks.
ENABLED_SECTIONS =
ENABLED_SECTIONS =
# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
# initial value of a variable or macro / define can have for it to appear in the
@@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES
# by doxygen. Whatever the program writes to standard output is used as the file
# version. For an example see the documentation.
FILE_VERSION_FILTER =
FILE_VERSION_FILTER =
# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
# by doxygen. The layout file controls the global structure of the generated
@@ -696,7 +696,7 @@ FILE_VERSION_FILTER =
# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
# tag is left empty.
LAYOUT_FILE =
LAYOUT_FILE =
# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
# the reference definitions. This must be a list of .bib files. The .bib
@@ -706,7 +706,7 @@ LAYOUT_FILE =
# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
# search path. See also \cite for info how to create references.
CITE_BIB_FILES =
CITE_BIB_FILES =
#---------------------------------------------------------------------------
# Configuration options related to warning and progress messages
@@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text"
# messages should be written. If left blank the output is written to standard
# error (stderr).
WARN_LOGFILE =
WARN_LOGFILE =
#---------------------------------------------------------------------------
# Configuration options related to the input files
@@ -858,7 +858,7 @@ RECURSIVE = NO
# Note that relative paths are relative to the directory from which doxygen is
# run.
EXCLUDE =
EXCLUDE =
# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
# directories that are symbolic links (a Unix file system feature) are excluded
@@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS = NO
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories for example use the pattern */test/*
EXCLUDE_PATTERNS =
EXCLUDE_PATTERNS =
# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
# (namespaces, classes, functions, etc.) that should be excluded from the
@@ -885,13 +885,13 @@ EXCLUDE_PATTERNS =
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*
EXCLUDE_SYMBOLS =
EXCLUDE_SYMBOLS =
# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
# command).
EXAMPLE_PATH =
EXAMPLE_PATH =
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
@@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE = NO
# that contain images that are to be included in the documentation (see the
# \image command).
IMAGE_PATH =
IMAGE_PATH =
# The INPUT_FILTER tag can be used to specify a program that doxygen should
# invoke to filter for each input file. Doxygen will invoke the filter program
@@ -928,7 +928,7 @@ IMAGE_PATH =
# code is scanned, but not when the output code is generated. If lines are added
# or removed, the anchors will not be placed correctly.
INPUT_FILTER =
INPUT_FILTER =
# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
# basis. Doxygen will compare the file name with each pattern and apply the
@@ -937,7 +937,7 @@ INPUT_FILTER =
# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
# patterns match the file name, INPUT_FILTER is applied.
FILTER_PATTERNS =
FILTER_PATTERNS =
# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
# INPUT_FILTER) will also be used to filter the input files that are used for
@@ -952,7 +952,7 @@ FILTER_SOURCE_FILES = NO
# *.ext= (so without naming a filter).
# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
FILTER_SOURCE_PATTERNS =
FILTER_SOURCE_PATTERNS =
# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
# is part of the input, its contents will be placed on the main page
@@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO
# specified with INPUT and INCLUDE_PATH.
# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
CLANG_OPTIONS =
CLANG_OPTIONS =
#---------------------------------------------------------------------------
# Configuration options related to the alphabetical class index
@@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX = 5
# while generating the index headers.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
IGNORE_PREFIX =
IGNORE_PREFIX =
#---------------------------------------------------------------------------
# Configuration options related to the HTML output
@@ -1134,7 +1134,7 @@ HTML_FILE_EXTENSION = .html
# of the possible markers and block names see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_HEADER =
HTML_HEADER =
# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
# generated HTML page. If the tag is left blank doxygen will generate a standard
@@ -1144,7 +1144,7 @@ HTML_HEADER =
# that doxygen normally uses.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_FOOTER =
HTML_FOOTER =
# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
# sheet that is used by each HTML page. It can be used to fine-tune the look of
@@ -1156,7 +1156,7 @@ HTML_FOOTER =
# obsolete.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_STYLESHEET =
HTML_STYLESHEET =
# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# cascading style sheets that are included after the standard style sheets
@@ -1169,7 +1169,7 @@ HTML_STYLESHEET =
# list). For an example see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_EXTRA_STYLESHEET =
HTML_EXTRA_STYLESHEET =
# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the HTML output directory. Note
@@ -1179,7 +1179,7 @@ HTML_EXTRA_STYLESHEET =
# files will be copied as-is; there are no commands or markers available.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_EXTRA_FILES =
HTML_EXTRA_FILES =
# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
@@ -1308,7 +1308,7 @@ GENERATE_HTMLHELP = NO
# written to the html output directory.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
CHM_FILE =
CHM_FILE =
# The HHC_LOCATION tag can be used to specify the location (absolute path
# including file name) of the HTML help compiler (hhc.exe). If non-empty,
@@ -1316,7 +1316,7 @@ CHM_FILE =
# The file has to be specified with full path.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
HHC_LOCATION =
HHC_LOCATION =
# The GENERATE_CHI flag controls if a separate .chi index file is generated
# (YES) or that it should be included in the master .chm file (NO).
@@ -1329,7 +1329,7 @@ GENERATE_CHI = NO
# and project file content.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
CHM_INDEX_ENCODING =
CHM_INDEX_ENCODING =
# The BINARY_TOC flag controls whether a binary table of contents is generated
# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
@@ -1360,7 +1360,7 @@ GENERATE_QHP = NO
# the HTML output folder.
# This tag requires that the tag GENERATE_QHP is set to YES.
QCH_FILE =
QCH_FILE =
# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
@@ -1385,7 +1385,7 @@ QHP_VIRTUAL_FOLDER = doc
# filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_NAME =
QHP_CUST_FILTER_NAME =
# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
@@ -1393,21 +1393,21 @@ QHP_CUST_FILTER_NAME =
# filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_ATTRS =
QHP_CUST_FILTER_ATTRS =
# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. Qt Help Project / Filter Attributes (see:
# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_SECT_FILTER_ATTRS =
QHP_SECT_FILTER_ATTRS =
# The QHG_LOCATION tag can be used to specify the location of Qt's
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
# generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHG_LOCATION =
QHG_LOCATION =
# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
# generated, together with the HTML files, they form an Eclipse help plugin. To
@@ -1540,7 +1540,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_EXTENSIONS =
MATHJAX_EXTENSIONS =
# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
@@ -1548,7 +1548,7 @@ MATHJAX_EXTENSIONS =
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_CODEFILE =
MATHJAX_CODEFILE =
# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
# the HTML output. The underlying search engine uses javascript and DHTML and
@@ -1608,7 +1608,7 @@ EXTERNAL_SEARCH = NO
# Searching" for details.
# This tag requires that the tag SEARCHENGINE is set to YES.
SEARCHENGINE_URL =
SEARCHENGINE_URL =
# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
# search data is written to a file for indexing by an external tool. With the
@@ -1624,7 +1624,7 @@ SEARCHDATA_FILE = searchdata.xml
# projects and redirect the results back to the right project.
# This tag requires that the tag SEARCHENGINE is set to YES.
EXTERNAL_SEARCH_ID =
EXTERNAL_SEARCH_ID =
# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
# projects other than the one defined by this configuration file, but that are
@@ -1634,7 +1634,7 @@ EXTERNAL_SEARCH_ID =
# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
# This tag requires that the tag SEARCHENGINE is set to YES.
EXTRA_SEARCH_MAPPINGS =
EXTRA_SEARCH_MAPPINGS =
#---------------------------------------------------------------------------
# Configuration options related to the LaTeX output
@@ -1698,7 +1698,7 @@ PAPER_TYPE = a4
# If left blank no extra packages will be included.
# This tag requires that the tag GENERATE_LATEX is set to YES.
EXTRA_PACKAGES =
EXTRA_PACKAGES =
# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
# generated LaTeX document. The header should contain everything until the first
@@ -1714,7 +1714,7 @@ EXTRA_PACKAGES =
# to HTML_HEADER.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_HEADER =
LATEX_HEADER =
# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
# generated LaTeX document. The footer should contain everything after the last
@@ -1725,7 +1725,7 @@ LATEX_HEADER =
# Note: Only use a user-defined footer if you know what you are doing!
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_FOOTER =
LATEX_FOOTER =
# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# LaTeX style sheets that are included after the standard style sheets created
@@ -1736,7 +1736,7 @@ LATEX_FOOTER =
# list).
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_EXTRA_STYLESHEET =
LATEX_EXTRA_STYLESHEET =
# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the LATEX_OUTPUT output
@@ -1744,7 +1744,7 @@ LATEX_EXTRA_STYLESHEET =
# markers available.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_EXTRA_FILES =
LATEX_EXTRA_FILES =
# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
@@ -1844,14 +1844,14 @@ RTF_HYPERLINKS = NO
# default style sheet that doxygen normally uses.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_STYLESHEET_FILE =
RTF_STYLESHEET_FILE =
# Set optional variables used in the generation of an RTF document. Syntax is
# similar to doxygen's config file. A template extensions file can be generated
# using doxygen -e rtf extensionFile.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_EXTENSIONS_FILE =
RTF_EXTENSIONS_FILE =
# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
# with syntax highlighting in the RTF output.
@@ -1896,7 +1896,7 @@ MAN_EXTENSION = .3
# MAN_EXTENSION with the initial . removed.
# This tag requires that the tag GENERATE_MAN is set to YES.
MAN_SUBDIR =
MAN_SUBDIR =
# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
# will generate one additional man file for each entity documented in the real
@@ -1915,7 +1915,7 @@ MAN_LINKS = NO
# captures the structure of the code including all documentation.
# The default value is: NO.
GENERATE_XML = YES
GENERATE_XML = YES
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
@@ -2009,7 +2009,7 @@ PERLMOD_PRETTY = YES
# overwrite each other's variables.
# This tag requires that the tag GENERATE_PERLMOD is set to YES.
PERLMOD_MAKEVAR_PREFIX =
PERLMOD_MAKEVAR_PREFIX =
#---------------------------------------------------------------------------
# Configuration options related to the preprocessor
@@ -2050,7 +2050,7 @@ SEARCH_INCLUDES = YES
# preprocessor.
# This tag requires that the tag SEARCH_INCLUDES is set to YES.
INCLUDE_PATH =
INCLUDE_PATH =
# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
# patterns (like *.h and *.hpp) to filter out the header-files in the
@@ -2058,7 +2058,7 @@ INCLUDE_PATH =
# used.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
INCLUDE_FILE_PATTERNS =
INCLUDE_FILE_PATTERNS =
# The PREDEFINED tag can be used to specify one or more macro names that are
# defined before the preprocessor is started (similar to the -D option of e.g.
@@ -2068,7 +2068,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
PREDEFINED =
PREDEFINED =
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
@@ -2077,7 +2077,7 @@ PREDEFINED =
# definition found in the source code.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
EXPAND_AS_DEFINED =
EXPAND_AS_DEFINED =
# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
# remove all references to function-like macros that are alone on a line, have
@@ -2106,13 +2106,13 @@ SKIP_FUNCTION_MACROS = YES
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.
TAGFILES =
TAGFILES =
# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
# external documentation" for more information about the usage of tag files.
GENERATE_TAGFILE =
GENERATE_TAGFILE =
# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
# the class index. If set to NO, only the inherited external classes will be
@@ -2161,14 +2161,14 @@ CLASS_DIAGRAMS = NO
# the mscgen tool resides. If left empty the tool is assumed to be found in the
# default search path.
MSCGEN_PATH =
MSCGEN_PATH =
# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
# If left empty dia is assumed to be found in the default search path.
DIA_PATH =
DIA_PATH =
# If set to YES the inheritance and collaboration graphs will hide inheritance
# and usage relations if the target is undocumented or is not a class.
@@ -2217,7 +2217,7 @@ DOT_FONTSIZE = 10
# the path where dot can find it using this tag.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_FONTPATH =
DOT_FONTPATH =
# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
# each documented class showing the direct and indirect inheritance relations.
@@ -2361,26 +2361,26 @@ INTERACTIVE_SVG = NO
# found. If left blank, it is assumed the dot tool can be found in the path.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_PATH =
DOT_PATH =
# The DOTFILE_DIRS tag can be used to specify one or more directories that
# contain dot files that are included in the documentation (see the \dotfile
# command).
# This tag requires that the tag HAVE_DOT is set to YES.
DOTFILE_DIRS =
DOTFILE_DIRS =
# The MSCFILE_DIRS tag can be used to specify one or more directories that
# contain msc files that are included in the documentation (see the \mscfile
# command).
MSCFILE_DIRS =
MSCFILE_DIRS =
# The DIAFILE_DIRS tag can be used to specify one or more directories that
# contain dia files that are included in the documentation (see the \diafile
# command).
DIAFILE_DIRS =
DIAFILE_DIRS =
# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
# path where java can find the plantuml.jar file. If left blank, it is assumed
@@ -2388,12 +2388,12 @@ DIAFILE_DIRS =
# generate a warning when it encounters a \startuml command in this case and
# will not generate output for the diagram.
PLANTUML_JAR_PATH =
PLANTUML_JAR_PATH =
# When using plantuml, the specified paths are searched for files specified by
# the !include statement in a plantuml block.
PLANTUML_INCLUDE_PATH =
PLANTUML_INCLUDE_PATH =
# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
# that will be shown in the graph. If the number of nodes in a graph becomes
+2 -2
Просмотреть файл
@@ -1,5 +1,5 @@
.. toctree::
:maxdepth: 4
:maxdepth: 4
:caption: Contents:
=======
@@ -8,4 +8,4 @@ All API
.. doxygenindex::
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
.. toctree::
:maxdepth: 4
:maxdepth: 4
:caption: Contents:
===
+2 -2
Просмотреть файл
@@ -7,10 +7,10 @@ Welcome to RCCL's documentation!
==================================
.. toctree::
:maxdepth: 4
:maxdepth: 4
:caption: Contents:
library
library
api
allapi
+2 -2
Просмотреть файл
@@ -1,6 +1,6 @@
.. toctree::
:maxdepth: 4
:maxdepth: 4
:caption: Contents:
======
@@ -10,4 +10,4 @@ RCCL
Introduction
------------
The RCCL is an AMD port of NCCL.
The RCCL is an AMD port of NCCL.
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-112
Просмотреть файл
@@ -1,112 +0,0 @@
#!/bin/bash
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
FILES="
./src/nccl.h.in
./src/bootstrap.cu
./src/collectives/all_gather.cu
./src/collectives/all_reduce.cu
./src/collectives/broadcast.cu
./src/collectives/collectives.h
./src/collectives/device/all_gather.cu
./src/collectives/device/all_gather.h
./src/collectives/device/all_reduce.cu
./src/collectives/device/all_reduce.h
./src/collectives/device/broadcast.cu
./src/collectives/device/broadcast.h
./src/collectives/device/common.h
./src/collectives/device/common_kernel.h
./src/collectives/device/functions.cu
./src/collectives/device/ll_kernel.h
./src/collectives/device/primitives.h
./src/collectives/device/reduce.cu
./src/collectives/device/reduce.h
./src/collectives/device/reduce_kernel.h
./src/collectives/device/reduce_scatter.cu
./src/collectives/device/reduce_scatter.h
./src/collectives/reduce.cu
./src/collectives/reduce_scatter.cu
./src/include/bootstrap.h
./src/include/common_coll.h
./src/include/core.h
./src/include/debug.h
./src/include/enqueue.h
./src/include/group.h
./src/include/ibvwrap.h
./src/include/nccl_net.h
./src/include/net.h
./src/include/nvlink.h
./src/include/nvmlwrap.h
./src/include/param.h
./src/include/ring.h
./src/include/rings.h
./src/include/shm.h
./src/include/socket.h
./src/include/topo.h
./src/include/transport.h
./src/include/utils.h
./src/init.cu
./src/misc/enqueue.cu
./src/misc/group.cu
./src/misc/ibvwrap.cu
./src/misc/nvmlwrap.cu
./src/misc/rings.cu
./src/misc/utils.cu
./src/ring.cu
./src/transport.cu
./src/transport/net.cu
./src/transport/net_ib.cu
./src/transport/net_socket.cu
./src/transport/p2p.cu
./src/transport/shm.cu
"
# Rewrite CUDA runtime headers and identifiers to their HIP equivalents, in
# place, in every file listed in $FILES.
#
# sed applies the -e expressions in order to each line, so a rule for a longer
# identifier must precede any rule whose pattern is a prefix of it whenever
# the two replacements differ. cudaFreeHost -> hipHostFree is such a case: if
# the cudaFree rule ran first it would produce "hipFreeHost" and the
# cudaFreeHost rule would never match. The other prefix pairs (cudaHostAlloc*,
# cudaHostRegister*, cudaMemcpy*) give the same result in either order.
for f in $FILES
do
    sed -i \
        -e 's@cuda_runtime.h@hip/hip_runtime_api.h@g' \
        -e 's@cuda_fp16.h@hip/hip_fp16.h@g' \
        -e 's/cudaDeviceCanAccessPeer/hipDeviceCanAccessPeer/g' \
        -e 's/cudaDeviceEnablePeerAccess/hipDeviceEnablePeerAccess/g' \
        -e 's/cudaDeviceGetPCIBusId/hipDeviceGetPCIBusId/g' \
        -e 's/cudaErrorPeerAccessAlreadyEnabled/hipErrorPeerAccessAlreadyEnabled/g' \
        -e 's/cudaError_t/hipError_t/g' \
        -e 's/cudaEventCreateWithFlags/hipEventCreateWithFlags/g' \
        -e 's/cudaEventDestroy/hipEventDestroy/g' \
        -e 's/cudaEventDisableTiming/hipEventDisableTiming/g' \
        -e 's/cudaEventRecord/hipEventRecord/g' \
        -e 's/cudaEvent_t/hipEvent_t/g' \
        -e 's/cudaFreeHost/hipHostFree/g' \
        -e 's/cudaFree/hipFree/g' \
        -e 's/cudaGetDevice/hipGetDevice/g' \
        -e 's/cudaGetErrorString/hipGetErrorString/g' \
        -e 's/cudaGetLastError/hipGetLastError/g' \
        -e 's/cudaHostAlloc/hipHostMalloc/g' \
        -e 's/cudaHostAllocMapped/hipHostMallocMapped/g' \
        -e 's/cudaHostGetDevicePointer/hipHostGetDevicePointer/g' \
        -e 's/cudaHostRegister/hipHostRegister/g' \
        -e 's/cudaHostRegisterMapped/hipHostRegisterMapped/g' \
        -e 's/cudaHostUnregister/hipHostUnregister/g' \
        -e 's/cudaIpcCloseMemHandle/hipIpcCloseMemHandle/g' \
        -e 's/cudaIpcGetMemHandle/hipIpcGetMemHandle/g' \
        -e 's/cudaIpcMemHandle_t/hipIpcMemHandle_t/g' \
        -e 's/cudaIpcMemLazyEnablePeerAccess/hipIpcMemLazyEnablePeerAccess/g' \
        -e 's/cudaIpcOpenMemHandle/hipIpcOpenMemHandle/g' \
        -e 's/cudaMalloc/hipMalloc/g' \
        -e 's/cudaMemcpy/hipMemcpy/g' \
        -e 's/cudaMemcpyAsync/hipMemcpyAsync/g' \
        -e 's/cudaMemcpyDefault/hipMemcpyDefault/g' \
        -e 's/cudaMemcpyDeviceToDevice/hipMemcpyDeviceToDevice/g' \
        -e 's/cudaMemoryTypeDevice/hipMemoryTypeDevice/g' \
        -e 's/cudaMemset/hipMemset/g' \
        -e 's/cudaPointerAttributes/hipPointerAttribute_t/g' \
        -e 's/cudaPointerGetAttributes/hipPointerGetAttributes/g' \
        -e 's/cudaSetDevice/hipSetDevice/g' \
        -e 's/cudaStreamCreateWithFlags/hipStreamCreateWithFlags/g' \
        -e 's/cudaStreamDestroy/hipStreamDestroy/g' \
        -e 's/cudaStreamNonBlocking/hipStreamNonBlocking/g' \
        -e 's/cudaStreamWaitEvent/hipStreamWaitEvent/g' \
        -e 's/cudaStream_t/hipStream_t/g' \
        -e 's/cudaSuccess/hipSuccess/g' \
        "$f"
done
+7 -6
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -16,7 +16,7 @@ NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
@@ -36,15 +36,16 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
# Include Volta support if we're using CUDA9 or above
ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
else
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-sign-compare
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
CXXFLAGS += -I $(CUDA_INC)
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
@@ -68,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
else
.SILENT:
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+2 -2
Просмотреть файл
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 3
NCCL_PATCH := 7
NCCL_MINOR := 4
NCCL_PATCH := 8
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+2 -2
Просмотреть файл
@@ -1,6 +1,6 @@
Name: libnccl
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
Release: ${pkg:Revision}
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Group: Development/Libraries
+2 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
$< > $@
+3 -2
Просмотреть файл
@@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
NCCL_BUILD=${pkg:Revision}
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
tar --exclude build \
--exclude ".git*" \
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+1 -1
Просмотреть файл
@@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
+42 -19
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -9,41 +9,48 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
##### lib files
LIBNAME := libnccl.so
STATICLIBNAME := libnccl_static.a
##### pkgconfig files
PKGCONFIGFILE := nccl.pc
##### dirs
BUILDDIR ?= $(abspath ../build)
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
PKGDIR := $(BUILDDIR)/lib/pkgconfig
##### target files
CUDARTLIB ?= cudart_static
INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
PKGTARGET := $(PKGCONFIGFILE)
LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt
LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
##### rules
build : lib staticlib
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
devicelib: $(INCDIR)/nccl.h
$(DEVICELIB): ALWAYS_REBUILD
$(MAKE) -C collectives/device
# Empty target to force rebuild
ALWAYS_REBUILD:
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
@@ -51,7 +58,7 @@ $(INCDIR)/nccl.h : nccl.h.in
# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
mkdir -p $(INCDIR)
printf "Generating %-35s > %s\n" $< $@
@printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@@ -59,14 +66,14 @@ $(INCDIR)/nccl.h : nccl.h.in
-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
$< > $@
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
$(eval TMP := $(shell mktemp -d))
@@ -75,6 +82,15 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
ar cr $@ $(LIBOBJ) $(TMP)/*.o
rm -Rf $(TMP)
# Generate the pkg-config file from the nccl.pc.in template: sed substitutes
# the install prefix and the NCCL major/minor/patch version placeholders and
# writes the result into $(PKGDIR).
$(PKGDIR)/nccl.pc : nccl.pc.in
mkdir -p $(PKGDIR)
@printf "Generating %-35s > %s\n" $< $@
sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \
-e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
$< > $@
$(INCDIR)/%.h : %.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
@@ -85,27 +101,34 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : %.cu
# Pattern rule: copy an already-complete .pc file unchanged into $(PKGDIR),
# creating the directory first.
$(PKGDIR)/%.pc : %.pc
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(PKGDIR)
cp -f $< $@
$(OBJDIR)/%.o : %.cc
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@rm -f $(@:%.o=%.d.tmp)
clean :
rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR}
rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
$(MAKE) -C collectives/device clean
install : lib
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/lib/pkgconfig
mkdir -p $(PREFIX)/include
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.
+467
Просмотреть файл
@@ -0,0 +1,467 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
#include "socket.h"
#include <unistd.h>
#include <sys/types.h>
// Always use sockets for bootstrap
// Connection handle of the bootstrap network: the socket address that
// bootstrapNetListen fills in and that bootstrapNetConnect connects to.
struct bootstrapNetHandle {
union socketAddress connectAddr;
};
// Per-connection state of the bootstrap network; the same structure is used
// for listening, sending and receiving connections.
struct bootstrapNetComm {
// Socket file descriptor; bootstrapNetNewComm initializes it to -1.
int fd;
};
/* Init functions */
// Names of the interfaces reported by findInterfaces: MAX_IFS slots of
// MAX_IF_NAME_SIZE bytes each (slot i starts at i*MAX_IF_NAME_SIZE).
static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
// Socket address of each interface, indexed like bootstrapNetIfNames.
static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
// Number of interfaces found; -1 means bootstrapNetInit has not run yet.
static int bootstrapNetIfs = -1;
// Guards the one-time interface discovery performed by bootstrapNetInit.
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
// Discovers the socket interfaces usable for bootstrap, once per process.
//
// The interface count is checked without the lock first and re-checked after
// taking bootstrapNetLock, so that only one caller runs findInterfaces.
//
// Returns ncclSuccess when the interfaces are (already) known, or
// ncclInternalError when findInterfaces reports no usable interface.
ncclResult_t bootstrapNetInit() {
  if (bootstrapNetIfs == -1) {
    pthread_mutex_lock(&bootstrapNetLock);
    if (bootstrapNetIfs == -1) {
      bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
      if (bootstrapNetIfs <= 0) {
        WARN("Bootstrap : no socket interface found");
        // Release the lock before bailing out; returning with it held would
        // block every later caller that still sees bootstrapNetIfs == -1.
        pthread_mutex_unlock(&bootstrapNetLock);
        return ncclInternalError;
      } else {
        // Build a one-line summary " [i]name:address ..." for the log.
        char line[1024];
        char addrline[1024];
        line[0] = '\0';
        for (int i=0; i<bootstrapNetIfs; i++) {
          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
              socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
        }
        line[1023] = '\0';
        INFO(NCCL_INIT, "Bootstrap : Using%s", line);
      }
    }
    pthread_mutex_unlock(&bootstrapNetLock);
  }
  return ncclSuccess;
}
// Allocates one zero-initialized bootstrapNetComm into *comm and marks it as
// not yet attached to a socket (fd == -1).
static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
  NCCLCHECK(ncclCalloc(comm, 1));
  struct bootstrapNetComm* fresh = *comm;
  fresh->fd = -1;
  return ncclSuccess;
}
// Copies the socket address of interface number `dev` into *addr.
//
// Returns ncclInternalError when `dev` is not a valid index into the table of
// discovered interfaces (negative, or not below bootstrapNetIfs).
static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
  // Reject negative indices as well: they would read before the array.
  if (dev < 0 || dev >= bootstrapNetIfs) return ncclInternalError;
  memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
  return ncclSuccess;
}
/* Socket Interface Selection type */
// Negative pseudo-device values that bootstrapNetListen accepts in place of an
// interface index:
//   findSubnetIf : the handle holds a remote address; listen on a local
//                  interface that is in the same network as that address.
//   dontCareIf   : the handle already holds the local address to listen on.
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
// Creates a listening socket and returns it through *listenComm.
//
// dev          : interface index (>= 0), findSubnetIf, or any other negative
//                value meaning "the handle already holds the local address".
// opaqueHandle : bootstrapNetHandle; on input it may hold a remote or local
//                address (see above), on output it holds the address passed to
//                createListenSocket.
// listenComm   : receives the newly allocated bootstrapNetComm.
//
// Returns ncclSystemError when no interface matches the remote subnet, or the
// error of the failing helper.
static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) {
  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
  static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large");
  // if dev >= 0, listen based on dev
  if (dev >= 0) {
    NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr)));
  } else if (dev == findSubnetIf) {
    // handle stores a remote address
    // need to find a local addr that is in the same network as the remote addr
    union socketAddress localAddr;
    char ifName[MAX_IF_NAME_SIZE];
    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
      WARN("NET/Socket : No usable listening interface found");
      return ncclSystemError;
    }
    // pass the local address back
    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
  } // Otherwise, handle stores a local address
  struct bootstrapNetComm* comm;
  ncclResult_t res;
  NCCLCHECK(bootstrapNetNewComm(&comm));
  NCCLCHECKGOTO(createListenSocket(&comm->fd, &handle->connectAddr), res, fail);
  *listenComm = comm;
  return ncclSuccess;
fail:
  // The comm was never handed to the caller, so release it here.
  free(comm);
  return res;
}
// Connects to the address stored in opaqueHandle and returns the new
// connection through *sendComm. `dev` is not used by this implementation.
static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) {
  struct bootstrapNetComm* comm;
  ncclResult_t res;
  NCCLCHECK(bootstrapNetNewComm(&comm));
  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
  NCCLCHECKGOTO(connectAddress(&comm->fd, &handle->connectAddr), res, fail);
  *sendComm = comm;
  return ncclSuccess;
fail:
  // The comm was never handed to the caller, so release it here.
  free(comm);
  return res;
}
// Accepts one incoming connection on listenComm and returns it through
// *recvComm as a newly allocated bootstrapNetComm.
static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
  struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
  struct sockaddr_in sockaddr;
  socklen_t socklen = sizeof(struct sockaddr_in);
  // Accept before allocating, so a failed accept() leaves nothing to free.
  int fd = -1;
  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", fd);
  struct bootstrapNetComm* rComm;
  ncclResult_t res = bootstrapNetNewComm(&rComm);
  if (res != ncclSuccess) {
    // Do not leak the accepted socket when the allocation fails.
    close(fd);
    return res;
  }
  rComm->fd = fd;
  *recvComm = rComm;
  return ncclSuccess;
}
// Closes the socket of a bootstrap comm and frees the comm itself.
// A NULL comm is accepted and ignored. Always returns ncclSuccess.
static ncclResult_t bootstrapNetClose(void* opaqueComm) {
  struct bootstrapNetComm* c = (struct bootstrapNetComm*)opaqueComm;
  if (c == NULL) return ncclSuccess;
  close(c->fd);
  free(c);
  return ncclSuccess;
}
// Close a sending connection (forwards to bootstrapNetClose).
static ncclResult_t bootstrapNetCloseSend(void* sendComm) {
  NCCLCHECK(bootstrapNetClose(sendComm));
  return ncclSuccess;
}
// Close a receiving connection (forwards to bootstrapNetClose).
static ncclResult_t bootstrapNetCloseRecv(void* recvComm) {
  NCCLCHECK(bootstrapNetClose(recvComm));
  return ncclSuccess;
}
// Close a listening socket (forwards to bootstrapNetClose).
static ncclResult_t bootstrapNetCloseListen(void* listenComm) {
  NCCLCHECK(bootstrapNetClose(listenComm));
  return ncclSuccess;
}
// Additional sync functions
// Sends one message: first the payload length as an int, then `size` bytes
// taken from `data`.
static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
  const int fd = ((struct bootstrapNetComm*)sendComm)->fd;
  // Length header
  NCCLCHECK(socketSend(fd, &size, sizeof(int)));
  // Payload
  NCCLCHECK(socketSend(fd, data, size));
  return ncclSuccess;
}
// Receives one message into `data`: reads the int length header, then that
// many payload bytes. A message longer than `size` is rejected with
// ncclInternalError before any payload is read.
static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
  struct bootstrapNetComm* conn = (struct bootstrapNetComm*)recvComm;
  int announced;
  NCCLCHECK(socketReceive(conn->fd, &announced, sizeof(int)));
  if (announced <= size) {
    // announced <= size holds here, so it is already the smaller of the two.
    NCCLCHECK(socketReceive(conn->fd, data, announced));
    return ncclSuccess;
  }
  WARN("Message truncated : received %d bytes instead of %d\n", announced, size);
  return ncclInternalError;
}
// Fills the connect address of a bootstrap handle from the textual address
// `str`, as parsed by GetSocketAddrFromString.
ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) {
  union socketAddress* target = &((struct bootstrapNetHandle*)opaqueHandle)->connectAddr;
  NCCLCHECK(GetSocketAddrFromString(target, str));
  return ncclSuccess;
}
// Layout overlaid on the opaque ncclUniqueId (bootstrapGetUniqueId asserts
// that it fits).
struct extId {
// Address of the root's listening socket; every rank connects to it.
ncclNetHandle_t extHandleRoot;
// Root's listening comm, set by bootstrapCreateRoot and closed by bootstrapRoot.
void* extListenComm;
// Result of getHostHash(), stored by bootstrapCreateRoot.
uint64_t hostHash;
// getpid() of the creating process, or -1 when the id comes from NCCL_COMM_ID.
pid_t pid;
// Not referenced by the code in this file.
int fd;
// Handle of the thread running bootstrapRoot.
pthread_t boostrapThread;
};
// Message each rank sends to the root during bootstrapInit.
struct extInfo {
// Rank of the sender and total number of ranks it expects.
int rank;
int nranks;
// Address on which the sender waits for the root's reply.
ncclNetHandle_t extHandleListenRoot;
// Address on which the sender accepts connections from other ranks.
ncclNetHandle_t extHandleListen;
};
#include <sys/resource.h>
// Raises the soft limit on open file descriptors (RLIMIT_NOFILE) to the
// current hard limit.
static ncclResult_t setFilesLimit() {
  struct rlimit nofile;
  SYSCHECK(getrlimit(RLIMIT_NOFILE, &nofile), "getrlimit");
  // soft limit := hard limit
  nofile.rlim_cur = nofile.rlim_max;
  SYSCHECK(setrlimit(RLIMIT_NOFILE, &nofile), "setrlimit");
  return ncclSuccess;
}
// Thread body of the bootstrap root.
//
// Collects one extInfo message from every rank, then connects back to each
// rank r and sends it the listen handle of rank (r+1) % nranks so the ranks
// can form a ring.
//
// commId : heap copy of the unique id (see bootstrapCreateRoot); this function
//          frees it and closes the root listening socket before returning.
// Always returns NULL; errors are only reported through the log.
static void *bootstrapRoot(void* commId) {
  struct extInfo info;
  struct extId* id = (struct extId*)commId;
  ncclNetHandle_t *rankHandles = NULL;
  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
  ncclNetHandle_t zero = { 0 }; // for sanity checking
  void* tmpComm;
  ncclResult_t res;
  // Best effort: the result is deliberately ignored.
  setFilesLimit();

  TRACE(NCCL_INIT, "BEGIN");
  /* Receive addresses from all ranks */
  int nranks = 0, c = 0;
  do {
    NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);

    if (c == 0) {
      // The rank count comes from the network: refuse values that cannot be
      // used as an array size.
      if (info.nranks <= 0) {
        WARN("Bootstrap Root : invalid rank count %d", info.nranks);
        goto out;
      }
      nranks = info.nranks;
      NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
      NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
    }

    if (nranks != info.nranks) {
      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
      goto out;
    }

    // The rank is used as an index below; it must lie inside both arrays.
    if (info.rank < 0 || info.rank >= nranks) {
      WARN("Bootstrap Root : rank %d is out of range for %d ranks", info.rank, nranks);
      goto out;
    }

    // A non-zero slot means this rank has already registered.
    if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
      goto out;
    }

    // Save the connection handle for that rank
    memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));

    ++c;
  } while (c < nranks);
  TRACE(NCCL_INIT, "COLLECTED HANDLES");

  // Send the connect handle for the next rank in the AllGather ring
  for (int r=0; r<nranks; ++r) {
    int next = (r+1) % nranks;
    void *tmpSendComm;
    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
  }
  TRACE(NCCL_INIT, "SENT OUT HANDLES");

out:
  bootstrapNetCloseListen(id->extListenComm);
  free(commId);
  if (rankHandles) free(rankHandles);
  if (rankHandlesRoot) free(rankHandlesRoot);

  TRACE(NCCL_INIT, "DONE");
  return NULL;
}
// Opens the root listening socket and starts the bootstrapRoot thread.
//
// commId    : unique id to complete; receives the host hash, the root address
//             and the listening comm.
// idFromEnv : when true the handle already holds the address to listen on
//             (dontCareIf); otherwise interface 0 is used.
//
// Returns ncclSystemError when the thread cannot be created, or the error of
// the failing helper.
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
  struct extId* id = (struct extId*)commId;
  id->hostHash = getHostHash();
  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
  // The thread gets its own copy of the id and frees it when it is done.
  ncclUniqueId* threadIdCopy;
  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
  int err = pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
  if (err != 0) {
    // No thread exists to release these, so do it here instead of reporting
    // success for a root that will never answer.
    WARN("Bootstrap : failed to create root thread (error %d)", err);
    free(threadIdCopy);
    bootstrapNetCloseListen(id->extListenComm);
    return ncclSystemError;
  }
  return ncclSuccess;
}
// Builds a unique id for a new communicator.
//
// When the NCCL_COMM_ID environment variable is set, its value is parsed as
// the root address and pid is set to -1; otherwise this process becomes the
// root: pid is set to getpid() and bootstrapCreateRoot is called.
//
// Returns ncclInvalidArgument when NCCL_COMM_ID cannot be parsed.
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
  // The id is copied and sent around as raw bytes: start from a fully defined
  // value so that fields and padding not written below are not left
  // uninitialized.
  memset(out, 0, sizeof(*out));
  extId* id = (extId*)out;

  char* env = getenv("NCCL_COMM_ID");
  if (env) {
    if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) {
      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
      return ncclInvalidArgument;
    }
    id->pid = -1;
  } else {
    id->pid = getpid();
    NCCLCHECK(bootstrapCreateRoot(out, false));
  }

  return ncclSuccess;
}
// Node of the singly linked list of connections that bootstrapRecv accepted
// from a peer other than the one it was waiting for.
struct unexConn {
// Rank announced by the connecting peer.
int peer;
// Accepted connection, still open, with the payload not yet read.
void* comm;
// Next node, or NULL at the end of the list.
struct unexConn* next;
};
// Per-communicator bootstrap state, allocated by bootstrapInit and released by
// bootstrapClose.
struct extState {
// Listening comm on which connections from other ranks are accepted.
void* extBstrapListenComm;
// Ring connection accepted from the previous rank.
void* extBstrapRingRecvComm;
// Ring connection to the next rank.
void* extBstrapRingSendComm;
// Listen handle of every rank (nranks entries), gathered in bootstrapInit.
ncclNetHandle_t* peerBstrapHandles;
// Connections accepted by bootstrapRecv from peers it was not waiting for.
struct unexConn* unexpectedConnections;
int rank;
int nranks;
// Interface selector passed to listen/connect: findSubnetIf or 0.
int dev;
};
// Joins the bootstrap network on behalf of one rank.
//
// Steps: allocate the per-communicator state, open two listening sockets (one
// for the root's reply, one for other ranks), send both addresses to the root,
// receive the address of the next rank from the root, connect the ring, and
// all-gather the listen handles of all ranks.
//
// commId    : unique id shared by all ranks (holds the root address).
// rank      : rank of the caller.
// nranks    : total number of ranks.
// commState : receives the allocated extState; it is stored before the first
//             network operation.
//
// NOTE(review): presumably NCCLCHECK returns the error of the first failing
// step; the state and sockets opened so far are not released in that case.
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
struct extId* id = (struct extId*)commId;
// pid is -1 when the id was built from NCCL_COMM_ID (see bootstrapGetUniqueId)
bool idFromEnv = id->pid < 0;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
state->nranks = nranks;
*commState = state;
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
void *tmpSendComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
void* extBstrapListenCommRoot;
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
// stagger connection times to avoid an overload of the root at very high rank counts
if (nranks > 128) {
// each rank waits `rank` milliseconds
long msec = rank;
struct timespec tv;
tv.tv_sec = msec / 1000;
tv.tv_nsec = 1000000 * (msec % 1000);
TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
(void) nanosleep(&tv, NULL);
}
// send info on my listening socket to root
NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
// get info on my "next" rank in the bootstrap ring from root
ncclNetHandle_t extHandleNext;
NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
// the root-facing listening socket is only needed for that single reply
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
// AllGather all listen handlers
NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
// seed our own slot; the all-gather fills in the others
memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
// Ring all-gather over the bootstrap connections.
//
// allData holds nranks slices of `size` bytes; on entry the caller's own slice
// (index rank) is filled in. Over nranks-1 steps each rank forwards to its
// right neighbour the slice it obtained in the previous step and receives a
// new one from its left neighbour.
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
  struct extState* state = (struct extState*)commState;
  char* base = (char*)allData;
  int rank = state->rank;
  int nranks = state->nranks;

  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);

  for (int step=0; step<nranks-1; step++) {
    // Slice owned by (rank-step) goes out, slice owned by (rank-step-1) comes in.
    size_t outSlice = (rank - step + nranks) % nranks;
    size_t inSlice = (rank - step - 1 + nranks) % nranks;
    char* outPtr = base + outSlice*size;
    char* inPtr = base + inSlice*size;

    // Send slice to the right
    NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, outPtr, size));
    // Recv slice from the left
    NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, inPtr, size));
  }

  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
  return ncclSuccess;
}
// Sends `size` bytes to rank `peer` over a temporary connection.
//
// The message is preceded by the sender's rank so that the receiver
// (bootstrapRecv) can tell who connected.
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
  struct extState* state = (struct extState*)commState;
  void* tmpSendComm;
  ncclResult_t res;
  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
  NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)), res, fail);
  NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, data, size), res, fail);
  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
  return ncclSuccess;
fail:
  // Do not leak the temporary connection (socket and comm) when a send fails.
  bootstrapNetCloseSend(tmpSendComm);
  return res;
}
// Appends a connection accepted from `peer` to the end of the list of
// unexpected connections kept in `state`.
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
  // Build the new node
  struct unexConn* node;
  NCCLCHECK(ncclCalloc(&node, 1));
  node->peer = peer;
  node->comm = comm;

  // Walk to the terminating NULL link (the head pointer itself when the list
  // is empty) and hook the node in there.
  struct unexConn** tail = &state->unexpectedConnections;
  while (*tail != NULL) tail = &(*tail)->next;
  *tail = node;
  return ncclSuccess;
}
// Removes the first queued connection that came from `peer` and returns its
// comm; returns NULL when no such connection is queued.
void* unexpectedDequeue(struct extState* state, int peer) {
  // `link` always points at the pointer that references the current node, so
  // unlinking works the same way for the head and for inner nodes.
  for (struct unexConn** link = &state->unexpectedConnections; *link != NULL; link = &(*link)->next) {
    struct unexConn* node = *link;
    if (node->peer != peer) continue;
    *link = node->next;
    void* found = node->comm;
    free(node);
    return found;
  }
  return NULL;
}
// Receives `size` bytes from rank `peer`.
//
// Connections can arrive from any rank, so every connection accepted from
// another peer is parked in the unexpected list until a later call asks for it.
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
  struct extState* state = (struct extState*)commState;

  // A connection parked by an earlier call takes precedence.
  void* conn = unexpectedDequeue(state, peer);
  bool matched = (conn != NULL);

  // Otherwise accept new connections until the wanted peer shows up.
  while (!matched) {
    NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &conn));
    int sender;
    NCCLCHECK(bootstrapNetRecv(conn, &sender, sizeof(int)));
    if (sender == peer) {
      matched = true;
    } else {
      // Unexpected connection. Save for later.
      NCCLCHECK(unexpectedEnqueue(state, sender, conn));
    }
  }

  NCCLCHECK(bootstrapNetRecv(conn, ((char*)data), size));
  NCCLCHECK(bootstrapNetCloseRecv(conn));
  return ncclSuccess;
}
// Tears down the bootstrap state of one communicator: closes the listening
// socket and both ring connections, then frees the handle table and the state.
// Fails with ncclInternalError, releasing nothing, while unexpected
// connections are still queued.
ncclResult_t bootstrapClose(void* commState) {
  struct extState* st = (struct extState*)commState;
  const bool pending = (st->unexpectedConnections != NULL);
  if (pending) {
    WARN("Unexpected connections are not empty.\n");
    return ncclInternalError;
  }

  NCCLCHECK(bootstrapNetCloseListen(st->extBstrapListenComm));
  NCCLCHECK(bootstrapNetCloseSend(st->extBstrapRingSendComm));
  NCCLCHECK(bootstrapNetCloseRecv(st->extBstrapRingRecvComm));

  free(st->peerBstrapHandles);
  free(st);
  return ncclSuccess;
}
-249
Просмотреть файл
@@ -1,249 +0,0 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
#include <unistd.h>
#include <sys/types.h>
// Always use sockets for bootstrap
// Every bootstrap* wrapper below dispatches through this pointer.
ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
// Thin wrappers that forward each bootstrap operation to the matching entry of
// ncclBootstrapNet.
static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) {
  NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm));
  return ncclSuccess;
}
static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) {
  NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm));
  return ncclSuccess;
}
static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) {
  NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm));
  return ncclSuccess;
}
static ncclResult_t bootstrapTest(void* request, int* done, int* size) {
  NCCLCHECK(ncclBootstrapNet->test(request, done, size));
  return ncclSuccess;
}
static ncclResult_t bootstrapCloseSend(void* sendComm) {
  NCCLCHECK(ncclBootstrapNet->closeSend(sendComm));
  return ncclSuccess;
}
static ncclResult_t bootstrapCloseRecv(void* recvComm) {
  NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm));
  return ncclSuccess;
}
static ncclResult_t bootstrapCloseListen(void* listenComm) {
  NCCLCHECK(ncclBootstrapNet->closeListen(listenComm));
  return ncclSuccess;
}
// Blocking send built on the asynchronous network API: posts an isend from a
// host pointer, then polls the request until it reports completion.
static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
  void* req;
  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &req));
  int finished = 0;
  do {
    NCCLCHECK(bootstrapTest(req, &finished, NULL));
  } while (!finished);
  return ncclSuccess;
}
// Blocking receive built on the asynchronous network API: posts an irecv into
// a host pointer, then polls the request until it reports completion.
static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
  void* req;
  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &req));
  int finished = 0;
  do {
    NCCLCHECK(bootstrapTest(req, &finished, NULL));
  } while (!finished);
  return ncclSuccess;
}
// Layout overlaid on the opaque ncclUniqueId (bootstrapGetUniqueId asserts
// that it fits).
struct extId {
// Address of the root's listening socket; every rank connects to it.
ncclNetHandle_t extHandleRoot;
// Root's listening comm, set by bootstrapCreateRoot and closed by bootstrapRoot.
void* extListenComm;
// Result of getHostHash(), stored by bootstrapCreateRoot.
uint64_t hostHash;
// getpid() of the creating process, or -1 when the id comes from NCCL_COMM_ID.
pid_t pid;
// Not referenced by the code in this file.
int fd;
// Handle of the thread running bootstrapRoot.
pthread_t boostrapThread;
};
// Message each rank sends to the root during bootstrapInit.
struct extInfo {
// Rank of the sender and total number of ranks it expects.
int rank;
int nranks;
// Address on which the sender waits for the root's reply.
ncclNetHandle_t extHandleListenFromRoot;
// Address on which the sender accepts the ring connection from the previous rank.
ncclNetHandle_t extHandleRing;
};
#include <sys/resource.h>
// Raises the soft limit on open file descriptors (RLIMIT_NOFILE) to the
// current hard limit.
static ncclResult_t setFilesLimit() {
  struct rlimit nofile;
  SYSCHECK(getrlimit(RLIMIT_NOFILE, &nofile), "getrlimit");
  // soft limit := hard limit
  nofile.rlim_cur = nofile.rlim_max;
  SYSCHECK(setrlimit(RLIMIT_NOFILE, &nofile), "setrlimit");
  return ncclSuccess;
}
// Thread body of the bootstrap root.
//
// Collects one extInfo message from every rank, then connects back to each
// rank r and sends it the ring handle of rank (r+1) % nranks.
//
// commId : heap copy of the unique id (see bootstrapCreateRoot); this function
//          frees it and closes the root listening socket before returning.
// Always returns NULL; errors are only reported through the log.
static void *bootstrapRoot(void* commId) {
  struct extInfo info;
  struct extId* id = (struct extId*)commId;
  ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
  ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
  ncclNetHandle_t zero = { 0 }; // for sanity checking
  void* tmpComm;
  ncclResult_t res;
  // Best effort: the result is deliberately ignored.
  setFilesLimit();

  /* Receive addresses from all ranks */
  int nranks = 0, c = 0;
  do {
    NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
    NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
    NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);

    if (c == 0) {
      // The rank count comes from the network: refuse values that cannot be
      // used as an array size.
      if (info.nranks <= 0) {
        WARN("Bootstrap Root : invalid rank count %d", info.nranks);
        goto out;
      }
      extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
      extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
      if (extHandleBstrap == NULL || extHandleRing == NULL) {
        WARN("Bootstrap thread : failed to allocate memory");
        goto out;
      }
      nranks = info.nranks;
    }

    if (nranks != info.nranks) {
      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
      goto out;
    }

    // The rank is used as an index below; it must lie inside both arrays.
    if (info.rank < 0 || info.rank >= nranks) {
      WARN("Bootstrap Root : rank %d is out of range for %d ranks", info.rank, nranks);
      goto out;
    }

    // A non-zero slot means this rank has already registered.
    if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
      goto out;
    }

    // Save the connection handle for connecting back to the ranks
    memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
    // Save the connection handle for the AllGather ring
    memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));

    ++c;
  } while (c < nranks);

  // Send the connect handle for the next rank in the AllGather ring
  for (int r=0; r<nranks; ++r) {
    int next = (r+1) % nranks;
    void *tmpSendComm;
    NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
    NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
    NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
  }

out:
  bootstrapCloseListen(id->extListenComm);
  free(commId);
  free(extHandleBstrap);
  free(extHandleRing);
  return NULL;
}
// Opens the root listening socket and starts the bootstrapRoot thread.
//
// commId    : unique id to complete; receives the host hash, the root address
//             and the listening comm.
// idFromEnv : selects dontCareIf instead of interface 0 for the listen call.
//
// Returns ncclSystemError when the thread cannot be created, or the error of
// the failing helper.
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
  struct extId* id = (struct extId*)commId;
  id->hostHash = getHostHash();
  NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
  // The thread gets its own copy of the id and frees it when it is done.
  ncclUniqueId* threadIdCopy;
  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
  int err = pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
  if (err != 0) {
    // No thread exists to release these, so do it here instead of reporting
    // success for a root that will never answer.
    WARN("Bootstrap : failed to create root thread (error %d)", err);
    free(threadIdCopy);
    bootstrapCloseListen(id->extListenComm);
    return ncclSystemError;
  }
  return ncclSuccess;
}
// Builds a unique id for a new communicator.
//
// When the NCCL_COMM_ID environment variable is set, its value is parsed as
// the root address and pid is set to -1; otherwise this process becomes the
// root: pid is set to getpid() and bootstrapCreateRoot is called.
//
// Returns ncclInvalidArgument when NCCL_COMM_ID cannot be parsed.
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
  // The id is copied and sent around as raw bytes: start from a fully defined
  // value so that fields and padding not written below are not left
  // uninitialized.
  memset(out, 0, sizeof(*out));
  extId* id = (extId*)out;

  char* env = getenv("NCCL_COMM_ID");
  if (env) {
    if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) {
      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
      return ncclInvalidArgument;
    }
    id->pid = -1;
  } else {
    id->pid = getpid();
    NCCLCHECK(bootstrapCreateRoot(out, false));
  }

  return ncclSuccess;
}
// Per-communicator bootstrap state, allocated by bootstrapInit and released by
// bootstrapClose.
struct extState {
// Ring connection accepted from the previous rank.
void* extBstrapRingRecvComm;
// Ring connection to the next rank.
void* extBstrapRingSendComm;
// Copy of the root address taken from the unique id.
ncclNetHandle_t extBstrapRootHandle;
int rank;
int nranks;
// Interface selector passed to listen/connect: findSubnetIf or 0.
int dev;
};
// Joins the bootstrap network on behalf of one rank.
//
// Steps: allocate the per-communicator state, open two listening sockets (one
// for the root's reply, one for the ring), send both addresses to the root,
// receive the address of the next rank from the root, connect to it, accept
// the connection of the previous rank, and close both listening sockets.
//
// commId    : unique id shared by all ranks (holds the root address).
// rank      : rank of the caller.
// nranks    : total number of ranks.
// commState : receives the allocated extState; it is stored before the first
//             network operation.
//
// NOTE(review): presumably NCCLCHECK returns the error of the first failing
// step; the state and sockets opened so far are not released in that case.
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
struct extId* id = (struct extId*)commId;
// pid is -1 when the id was built from NCCL_COMM_ID (see bootstrapGetUniqueId)
bool idFromEnv = id->pid < 0;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
state->nranks = nranks;
*commState = state;
void* extBstrapRootListenComm; // comm on which we accept root's connections
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
// send info on my listening sockets to root
NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapCloseSend(tmpSendComm));
// get info on my "next" rank in the bootstrap ring from root
ncclNetHandle_t extHandleNext;
NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
// both listening sockets have served their single connection
NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm));
NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
return ncclSuccess;
}
// Ring all-gather over the bootstrap connections.
//
// allData holds nranks slices of `size` bytes; on entry the caller's own slice
// (index rank) is filled in. Over nranks-1 steps each rank forwards to its
// right neighbour the slice it obtained in the previous step and receives a
// new one from its left neighbour.
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
  struct extState* state = (struct extState*)commState;
  char* data = (char*)allData;
  int rank = state->rank;
  int nranks = state->nranks;

  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);

  /* Simple ring based AllGather
   * At each step i receive data from (rank-i-1) from left
   * and send previous step's data from (rank-i) to right
   */
  for (int i=0; i<nranks-1; i++) {
    // size_t so that the byte offset slice*size is computed without int
    // overflow when nranks*size exceeds INT_MAX.
    size_t rslice = (rank - i - 1 + nranks) % nranks;
    size_t sslice = (rank - i + nranks) % nranks;

    // Send slice to the right
    NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
    // Recv slice from the left
    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
  }

  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
  return ncclSuccess;
}
/**
 * Tear down the bootstrap ring connections and release the bootstrap state.
 *
 * Both ring endpoints are closed and the state is freed even if one of the
 * close calls fails, so a failing send-side close no longer leaks the
 * receive-side comm and the state allocation.
 *
 * @param commState  opaque bootstrap state (a struct extState*); invalid after this call
 * @return ncclSuccess, or the first close error (send side takes precedence)
 */
ncclResult_t bootstrapClose(void* commState) {
  struct extState* state = (struct extState*)commState;

  // Attempt both closes unconditionally; report errors only after cleanup.
  ncclResult_t sendCloseRes = bootstrapCloseSend(state->extBstrapRingSendComm);
  ncclResult_t recvCloseRes = bootstrapCloseRecv(state->extBstrapRingRecvComm);
  free(state);

  // NCCLCHECK is still used so a failure goes through the same reporting path as before.
  NCCLCHECK(sendCloseRes);
  NCCLCHECK(recvCloseRes);
  return ncclSuccess;
}
+57
Просмотреть файл
@@ -0,0 +1,57 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "channel.h"
#include "param.h"
NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
/**
 * Initialise channel `channelid` of communicator `comm`.
 *
 * Sets the channel id and buffer size, allocates the ring rank tables and the
 * per-peer structures (host and device copies), points every peer's send and
 * recv side back at `comm`, and allocates the per-channel collective
 * operation list.
 *
 * @param comm       communicator owning the channel array
 * @param channelid  index into comm->channels
 * @return ncclSuccess, or the first allocation error
 */
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* channel = comm->channels+channelid;
  channel->id = channelid;

  // Setup intermediate buffering (value comes from the Buffsize NCCL_PARAM).
  channel->buffSize = ncclParamBuffsize();

  // Ring index to user rank table.
  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));

  // Communication structures with peers.
  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
  // Signed index: nRanks is handled as an int elsewhere (e.g. freeChannel), so
  // this avoids a signed/unsigned comparison against a size_t counter.
  for (int i=0; i<comm->nRanks; ++i) {
    channel->peers[i].send.comm = comm;
    channel->peers[i].recv.comm = comm;
  }

  // Per-channel operation list (one allocation, host and device pointers).
  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
  return ncclSuccess;
}
/**
 * Release everything owned by a channel: the collective operation list, the
 * ring rank tables, each peer's transport resources, and the peer arrays.
 *
 * @param channel  channel to tear down
 * @param nRanks   number of entries in channel->peers
 * @return ncclSuccess, or the first error hit while freeing (later resources
 *         are then left untouched)
 */
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
  // Collective operation list.
  NCCLCHECK(ncclCudaHostFree(channel->collectives));

  // Ring index to rank tables: host copy, then device copy.
  free(channel->ring.userRanks);
  CUDACHECK(hipFree(channel->ring.devUserRanks));

  // Hand each connected side's resources back to its transport.
  for (int peerIdx = 0; peerIdx < nRanks; ++peerIdx) {
    auto& sendSide = channel->peers[peerIdx].send;
    auto& recvSide = channel->peers[peerIdx].recv;
    if (sendSide.transportResources) {
      NCCLCHECK(sendSide.transportComm->free(sendSide.transportResources));
    }
    if (recvSide.transportResources) {
      NCCLCHECK(recvSide.transportComm->free(recvSide.transportResources));
    }
  }

  // Peer structures: device copy, then host copy.
  CUDACHECK(hipFree(channel->devPeers));
  free(channel->peers);
  return ncclSuccess;
}
+19
Просмотреть файл
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/**
 * Public AllGather entry point: packs the call into an ncclInfo descriptor
 * and hands it to ncclEnqueueCheck.
 *
 * The caller supplies no reduction op or root, so ncclSum and 0 are passed
 * as fixed values in those slots.
 */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  struct ncclInfo allGatherInfo = {
    ncclCollAllGather, "AllGather",
    sendbuff, recvbuff, sendcount, datatype,
    ncclSum, 0,
    comm, stream, /* Args */
    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS
  };
  ncclResult_t status = ncclEnqueueCheck(&allGatherInfo);
  return status;
}
-33
Просмотреть файл
@@ -1,33 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
/**
 * Enqueue-side implementation of AllGather.
 *
 * With a single rank the operation is a device-to-device copy (skipped when
 * in place). Otherwise the ring proxies and the kernel are saved for launch,
 * with the payload described as raw bytes (ncclInt8).
 */
ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  const size_t bytesPerRank = count*ncclTypeSize(datatype);
  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);

  // Single-rank communicator: nothing to exchange, just copy if out of place.
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff) {
      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, bytesPerRank, hipMemcpyDeviceToDevice, stream));
    }
    return ncclSuccess;
  }

  // Total gathered size across all ranks.
  const size_t gatheredBytes = bytesPerRank*comm->nRanks;
  NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, gatheredBytes, proxyPatternRing, comm));
  NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, bytesPerRank, ncclInt8, op, root, comm, stream, gatheredBytes, 1));
  return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/**
 * Public AllGather entry point: forwards to ncclEnqueueCheck with
 * ncclAllGatherFunc as the implementation, passing ncclSum and 0 as the
 * fixed op and root values.
 */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  ncclResult_t status = ncclEnqueueCheck(ncclAllGatherFunc, "AllGather",
      sendbuff, recvbuff, sendcount, datatype,
      ncclSum, 0, comm, stream);
  return status;
}
+19
Просмотреть файл
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
/**
 * Public AllReduce entry point: packs the call into an ncclInfo descriptor
 * and hands it to ncclEnqueueCheck.
 *
 * The caller supplies no root, so 0 is passed in that slot.
 */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
  struct ncclInfo allReduceInfo = {
    ncclCollAllReduce, "AllReduce",
    sendbuff, recvbuff, count, datatype,
    op, 0,
    comm, stream, /* Args */
    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS
  };
  ncclResult_t status = ncclEnqueueCheck(&allReduceInfo);
  return status;
}
-33
Просмотреть файл
@@ -1,33 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
/**
 * Enqueue-side implementation of AllReduce.
 *
 * With a single rank the operation is a device-to-device copy (skipped when
 * in place). Otherwise the ring proxies and the kernel are saved for launch.
 */
ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  const size_t totalBytes = count*ncclTypeSize(datatype);
  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);

  // Single-rank communicator: nothing to reduce, just copy if out of place.
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff) {
      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, totalBytes, hipMemcpyDeviceToDevice, stream));
    }
    return ncclSuccess;
  }

  NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, totalBytes, proxyPatternRing, comm));
  NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, totalBytes, comm->nRanks));
  return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
/**
 * Public AllReduce entry point: forwards to ncclEnqueueCheck with
 * ncclAllReduceFunc as the implementation, passing 0 as the fixed root value.
 */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
  ncclResult_t status = ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce",
      sendbuff, recvbuff, count, datatype,
      op, 0, comm, stream);
  return status;
}
+27
Просмотреть файл
@@ -0,0 +1,27 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/**
 * Public Broadcast entry point: packs the call into an ncclInfo descriptor
 * and hands it to ncclEnqueueCheck.
 *
 * The caller supplies no reduction op, so ncclSum is passed in that slot.
 */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
  struct ncclInfo broadcastInfo = {
    ncclCollBroadcast, "Broadcast",
    sendbuff, recvbuff, count, datatype,
    ncclSum, root,
    comm, stream, /* Args */
    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS
  };
  ncclResult_t status = ncclEnqueueCheck(&broadcastInfo);
  return status;
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/**
 * In-place broadcast: delegates to ncclBroadcast with the same buffer used as
 * both source and destination.
 */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
  const void* source = buff;
  void* destination = buff;
  return ncclBroadcast(source, destination, count, datatype, root, comm, stream);
}
-43
Просмотреть файл
@@ -1,43 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
/**
 * Enqueue-side implementation of Broadcast.
 *
 * With a single rank the operation is a device-to-device copy (skipped when
 * in place). Otherwise the proxies for the root-originated pattern and the
 * kernel are saved for launch, with the payload described as raw bytes
 * (ncclInt8).
 */
ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  const size_t totalBytes = count*ncclTypeSize(datatype);
  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);

  // Single-rank communicator: nothing to send, just copy if out of place.
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff) {
      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, totalBytes, hipMemcpyDeviceToDevice, stream));
    }
    return ncclSuccess;
  }

  NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, totalBytes, proxyPatternFrom(root), comm));
  NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, totalBytes, ncclInt8, op, root, comm, stream, totalBytes, 1));
  return ncclSuccess;
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/**
 * In-place broadcast: forwards to ncclEnqueueCheck with ncclBroadcastFunc as
 * the implementation, using `buff` as both source and destination and
 * ncclSum as the fixed op value.
 */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
  ncclResult_t status = ncclEnqueueCheck(ncclBroadcastFunc, "Bcast",
      buff, buff, count, datatype,
      ncclSum, root, comm, stream);
  return status;
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/**
 * Public Broadcast entry point: forwards to ncclEnqueueCheck with
 * ncclBroadcastFunc as the implementation and ncclSum as the fixed op value.
 */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
  ncclResult_t status = ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast",
      sendbuff, recvbuff, count, datatype,
      ncclSum, root, comm, stream);
  return status;
}
+27 -18
Просмотреть файл
@@ -1,5 +1,6 @@
#include "hip/hip_runtime.h"
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -8,9 +9,7 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
@@ -19,13 +18,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
coll##Kernel_##op##_##dtype
/* Declare all collective operations */
#define DECL_COLL4(coll, op, dtype) \
#define DECL_COLL5(coll, op, dtype) \
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
#define DECL_COLL4(coll, op, dtype) \
DECL_COLL5(coll, op, dtype) \
DECL_COLL5(coll##LL, op, dtype)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##LL, op, dtype) \
DECL_COLL4(coll, op, dtype)
DECL_COLL4(coll##Ring, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
@@ -53,15 +55,22 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
DECL_ALL_COLLS
#define ALLREDUCE_SUBSTEPS 2
#define ALLREDUCE_BUFCHUNKS 2
#define ALLGATHER_SUBSTEPS 2
#define ALLGATHER_BUFCHUNKS 2
#define REDUCESCATTER_SUBSTEPS 2
#define REDUCESCATTER_BUFCHUNKS 2
#define BROADCAST_SUBSTEPS 8
#define BROADCAST_BUFCHUNKS 2
#define REDUCE_SUBSTEPS 8
#define REDUCE_BUFCHUNKS 2
// CHUNKSIZE must be a multiple of SLICESIZE
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLREDUCE_SLICESTEPS 4
#define ALLREDUCE_CHUNKSTEPS 4
#define ALLGATHER_SLICESTEPS 4
#define ALLGATHER_CHUNKSTEPS 4
#define REDUCESCATTER_SLICESTEPS 4
#define REDUCESCATTER_CHUNKSTEPS 4
#define BROADCAST_SLICESTEPS 1
#define BROADCAST_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#endif
+13 -28
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
$(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
$(OBJDIR)/functions.o
LIBSRCFILES += functions.cu
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
DEPENDFILES := $(DEPFILES:%.d=%.dep)
DEPENDFILES:= $(DEPFILES:%.d=%.dep)
STATICLIB := $(OBJDIR)/colldevice.a
DEVOBJ := $(OBJDIR)/devlink.o
RULESFILE := $(OBJDIR)/Makefile.rules
NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
@@ -33,6 +28,16 @@ all: $(STATICLIB)
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
all_deps: $(DEPENDFILES)
# Auto-generating the rules per op/reduction/datatype/algorithm
$(RULESFILE) :
@printf "Generating %-35s > %s\n" rules $@
@mkdir -p $(OBJDIR)
@./gen_rules.sh $(OBJDIR) > $@
-include $(RULESFILE)
LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o
-include $(DEPFILES)
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
+1 -2
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,6 +11,4 @@
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
#endif
+46 -182
Просмотреть файл
@@ -1,81 +1,44 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = 0;
int nextdirect = 0;
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->next_hdp_reg);
typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
STORE(ring->recv.conn.opCount, args->opCount);
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
if (prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (LOAD(ptr) == nullptr);
sharedNextOutput = (T*)LOAD(ptr);
STORE(ptr, nullptr);
}
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*chunkSize;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
@@ -83,130 +46,53 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
Prims::Copy(tid, nthreads,
thisInput + chunkOffset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
prims.directSend(thisInput+chunkOffset, offset, nelem);
} else {
Prims::DoubleCopy(tid, nthreads,
thisInput + chunkOffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
}
NEXT_STEP; // Increases step, poffset, noffset
// k-2 steps: copy to next GPU
if (prevdirect) {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
Prims::Copy(tid, nthreads,
NULL,
NULL,
0, 0,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::DoubleCopy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
// Here we need to copy from buffer to this output.
Prims::Copy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
}
if (tid == 0) {
waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
STORE(ring->send.conn.head, 0ULL);
STORE(ring->recv.conn.tail, 0ULL);
__threadfence_system();
STORE(ring->recv.conn.opCount, args->opCount+1);
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
// Final wait/copy.
prims.directRecv(thisOutput+offset, offset, nelem);
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
// Tree-named variant of the AllGather device kernel. The body is empty, so
// invoking it performs no work.
// NOTE(review): presumably a placeholder so that a symbol exists for every
// algorithm variant the declaration macros expect — confirm.
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
typedef LLPrimitives<T, FUNC> LL;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -216,57 +102,35 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LL::ReduceCopy(
thisInput + chunkOffset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LL::ReduceCopy(
thisInput + chunkOffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
POST_SIZE;
NEXT_STEP_LL;
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
maxOffset, pflag, llNthreads);
ACK_PREV;
LLprims.recv(thisOutput+offset, nelem);
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
// Tree-named LL variant of the AllGather device kernel. The body is empty, so
// invoking it performs no work.
// NOTE(review): presumably a placeholder so that a symbol exists for every
// algorithm variant the declaration macros expect — confirm.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+2 -6
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,12 +11,7 @@
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
#endif
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
+177 -228
Просмотреть файл
@@ -1,243 +1,181 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = 0;
int nextdirect = 0;
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->next_hdp_reg);
typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
STORE(ring->recv.conn.opCount, args->opCount);
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
if (prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (LOAD(ptr) == nullptr);
sharedNextOutput = (T*)LOAD(ptr);
STORE(ptr, nullptr);
}
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws, wr;
if (tid == 0) clk = clock64();
#endif
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int maxOffset;
int nelem;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
NEXT_STEP; // Increases step, poffset, noffset
INIT_COUNTER;
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
INIT_COUNTER;
prims.recvReduceSend(thisInput+offset, nelem);
ACCUMULATE_COUNTER(recvReduceSend);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
Prims::ReduceCopy(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
thisOutput + offset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
INIT_COUNTER;
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvReduceCopySend);
// k-2 steps: copy to next GPU
if (prevdirect) {
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
Prims::Copy(tid, nthreads,
NULL,
NULL,
0, 0,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
Prims::DoubleCopy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
Prims::Copy(tid, nthreads,
prevInput + poffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
INIT_COUNTER;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvCopySend);
}
}
if (tid == 0) {
// Wait for next to have consumed all data before we reset the flag
waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
STORE(ring->send.conn.head, 0ULL);
STORE(ring->recv.conn.tail, 0ULL);
__threadfence_system();
STORE(ring->recv.conn.opCount, args->opCount+1);
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
// Final wait/copy.
INIT_COUNTER;
prims.directRecv(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecv);
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
#endif
}
#include "ll_kernel.h"
// Step-advance helper for the low-latency (LL) ring code: shifts the "next"
// offset/flag into the "previous" slot, advances the next offset by one slice
// (wrapping to 0 when it reaches NCCL_LL_BUFF_LINES), and bumps flag and step.
// Requires poffset, noffset, pflag, nflag and step to be in scope where it is
// expanded.
// NOTE(review): this definition originally sat in the middle of
// ncclAllReduceTreeKernel's body, which never uses it; it is kept verbatim and
// only moved above the function so the kernel reads straight through.
#define NEXT_STEP_LL \
  poffset = noffset; \
  pflag = nflag; \
  noffset += NCCL_LL_SLICE_LINES; \
  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
  nflag++; \
  step++;

// Tree-based all-reduce device kernel.
//
// Runs two passes over the data, each with its own ncclPrimitives instance:
//   1. "Up"   - reduce: data flows from tree->down[] towards tree->up.
//   2. "Down" - broadcast: the reduced result flows from tree->up to tree->down[].
//
// The role of the calling rank is decided from the tree links:
//   tree->up == -1       -> only receives in pass 1 and only sends in pass 2
//                           (presumably the root of the tree - confirm);
//   tree->down[0] == -1  -> only sends in pass 1 and only receives in pass 2
//                           (presumably a leaf - confirm);
//   otherwise            -> intermediate node, receives and forwards in both.
//
// Template parameters:
//   UNROLL - unroll factor forwarded to ncclPrimitives.
//   FUNC   - reduction functor forwarded to ncclPrimitives.
//   T      - element type of the buffers.
//
// args supplies the communicator, element count (N), input/output pointers,
// this block's index (bid), the channel count, the op count and the chunk size
// (lastChunkSize).
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
  struct ncclTree* tree = &channel->tree;
  const ssize_t size = args->N;
  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
  const int chunkSize = args->lastChunkSize;
  // Widen before multiplying so the product is formed in ssize_t, matching the
  // ring kernels in this file (nChannels*(ssize_t)chunkSize).
  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;

  do {
    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      // Up
      // Same widening as loopSize: keep the per-block offset in ssize_t.
      ssize_t offset = gridOffset + bid*(ssize_t)chunkSize;
      // May be <= 0 when this block's chunk starts past the end of the data;
      // the value is passed through to the primitive unchanged.
      int nelem = min(chunkSize, size-offset);
      if (tree->up == -1) {
        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
      } else if (tree->down[0] == -1) {
        prims.send(thisInput+offset, nelem);
      } else {
        prims.recvReduceSend(thisInput+offset, nelem);
      }
    }
  } while(0);

  do {
    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      // Down
      ssize_t offset = gridOffset + bid*(ssize_t)chunkSize;
      int nelem = min(chunkSize, size-offset);
      if (tree->up == -1) {
        prims.send(thisOutput+offset, nelem);
      } else if (tree->down[0] == -1) {
        prims.recv(thisOutput+offset, nelem);
      } else {
        prims.recvCopySend(thisOutput+offset, nelem);
      }
    }
  } while(0);
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
typedef LLPrimitives<T, FUNC> LL;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*nranks*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -247,89 +185,100 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int maxOffset;
int nelem;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
nelem = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
nelem = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
nelem = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks - j];
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
nelem = min(chunkSize, size-offset);
WAIT_NEXT;
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
maxOffset = min(chunkSize, size-offset);
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
LL::ReduceCopy(
prevInput + poffset,
thisOutput + offset,
maxOffset, pflag, llNthreads);
ACK_PREV;
LLprims.recv(thisOutput+offset, nelem);
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
// Tree all-reduce using the low-latency (LL) primitives.
//
// Two passes, each in its own scope with a dedicated ncclLLPrimitives object:
// first a reduction towards tree->up, then a broadcast back towards
// tree->down[]. Which primitive a rank calls depends on whether it has an
// "up" link (tree->up != -1) and a first "down" link (tree->down[0] != -1).
//
// UNUSED is accepted for signature parity with the non-LL kernels and is not
// referenced in the body.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
  struct ncclTree* tree = &channel->tree;

  const int threadId = threadIdx.x;
  const int threadCount = args->nThreads;
  const int blockId = args->bid;

  const ssize_t totalElems = args->N;
  // Number of T elements carried by one LL slice.
  ssize_t elemsPerChunk = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
  // Distance between two consecutive chunks handled by the same block.
  const ssize_t gridStride = args->nChannels*elemsPerChunk;

  // User buffers.
  const T * __restrict__ src = (const T*)args->ThisInput;
  T * __restrict__ dst = (T*)args->ThisOutput;

  {
    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> upPrims(threadId, threadCount, tree->down, &tree->up, channel, comm, args->opCount);
    ssize_t base = 0;
    while (base < totalElems) {
      // Up
      ssize_t pos = base + blockId*elemsPerChunk;
      int count = min(elemsPerChunk, totalElems-pos);
      if (tree->up != -1) {
        if (tree->down[0] != -1) {
          // Has links in both directions: combine incoming data and forward it.
          upPrims.recvReduceSend(src+pos, count);
        } else {
          // No first down link: just push local data upwards.
          upPrims.send(src+pos, count);
        }
      } else {
        // No up link: final reduction lands in the output buffer.
        upPrims.recvReduceCopy(src+pos, dst+pos, count);
      }
      base += gridStride;
    }
  }

  {
    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> downPrims(threadId, threadCount, &tree->up, tree->down, channel, comm, args->opCount);
    ssize_t base = 0;
    while (base < totalElems) {
      // Down
      ssize_t pos = base + blockId*elemsPerChunk;
      int count = min(elemsPerChunk, totalElems-pos);
      if (tree->up != -1) {
        if (tree->down[0] != -1) {
          // Has links in both directions: store the result and pass it on.
          downPrims.recvCopySend(dst+pos, count);
        } else {
          // No first down link: only receive the result.
          downPrims.recv(dst+pos, count);
        }
      } else {
        // No up link: start the broadcast from the output buffer.
        downPrims.send(dst+pos, count);
      }
      base += gridStride;
    }
  }
}
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 1
#include "device/all_reduce.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 2
#include "device/all_reduce.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 3
#include "device/all_reduce.cu"
+2 -3
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,6 +11,4 @@
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
#endif
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
+80 -188
Просмотреть файл
@@ -1,184 +1,101 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and boffset for buffer sync
//
// Expands in place, so `step`, `boffset`, `sliceSize` and `buffSize` must all
// be visible at the point of use. `boffset` moves forward by one slice and is
// reset to 0 only when it lands exactly on `buffSize`.
// NOTE(review): the wrap test is an equality, so it presumably relies on
// buffSize being an exact multiple of sliceSize - confirm at the call sites.
// NOTE(review): the body is several statements and is not wrapped in
// do { } while (0); use it only as a stand-alone statement inside braces.
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x;
const int bid = args->bid;
__shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
int prevdirect = 0;
int nextdirect = 0;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws, wr;
if (tid == 0) clk = clock64();
#endif
WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->next_hdp_reg);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
INIT_COUNTER;
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
} else {
INIT_COUNTER;
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
ACCUMULATE_COUNTER(copySend);
}
} else if (nextRank == root) {
INIT_COUNTER;
prims.recv(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recv);
} else {
INIT_COUNTER;
prims.recvCopySend(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recvCopySend);
}
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
#endif
}
// Tree variant of the broadcast kernel. The body is intentionally empty here:
// calling it performs no work.
// NOTE(review): presumably a placeholder so the Ring/Tree function tables have
// matching entries for every collective - confirm it is never selected at
// runtime.
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
if (tid == 0) {
// Update in case we skipped some collectives
STORE(ring->recv.conn.opCount, args->opCount);
if (nextRank != root) {
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
if (rank != root && prevdirect) {
*ring->recv.conn.ptrExchange = args->ThisOutput;
}
if (nextRank != root && nextdirect) {
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
while (LOAD(ptr) == nullptr);
sharedNextOutput = (T*)LOAD(ptr);
STORE(ptr, nullptr);
}
}
__syncthreads();
uint64_t step = 0ULL;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
Prims::Copy(tid, nthreads,
thisInput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
} else {
Prims::DoubleCopy(tid, nthreads,
thisInput + offset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
}
} else if (nextRank == root) {
if (prevdirect) maxOffset = 0; // Only wait for signals
Prims::Copy(tid, nthreads,
prevInput + boffset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
} else {
if (prevdirect) {
Prims::Copy(tid, nthreads,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
} else {
Prims::DoubleCopy(tid, nthreads,
prevInput + boffset,
thisOutput + offset,
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
}
}
NEXT_STEP; // Increases step, boffset
}
if (tid == 0) {
if (nextRank != root) {
// Wait for next to have consumed data before resetting the flag
waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
STORE(ring->send.conn.head, 0ULL);
}
STORE(ring->recv.conn.tail, 0ULL);
__threadfence_system();
STORE(ring->recv.conn.opCount, args->opCount+1);
}
}
#include "ll_kernel.h"
// Advance the low-latency (LL) broadcast state by one step.
//
// Expands in place, so `boffset`, `flag` and `step` must be visible at the
// point of use. `boffset` moves forward by NCCL_LL_SLICE_LINES and is reset to
// 0 when it equals NCCL_LL_BUFF_LINES; `flag` and `step` are each incremented.
// NOTE(review): the body is several statements and is not wrapped in
// do { } while (0); use it only as a stand-alone statement inside braces.
#define NEXT_STEP_LL \
boffset += NCCL_LL_SLICE_LINES; \
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
flag++; \
step++;
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int rank = comm->rank;
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t flag = step + 1;
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -186,46 +103,21 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
}
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
int nelem = min(chunkSize, size-offset);
if (rank == root) {
WAIT_NEXT;
if (thisInput == thisOutput) {
LL::ReduceCopy(
thisInput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
LLprims.send(thisInput+offset, nelem);
} else {
LL::ReduceCopy(
thisInput + offset,
thisOutput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
}
POST_SIZE;
NEXT_STEP_LL;
} else if (nextRank == root) {
LL::ReduceCopy(
prevInput + boffset,
thisOutput + offset,
maxOffset, flag, llNthreads);
NEXT_STEP_LL;
ACK_PREV;
LLprims.recv(thisOutput + offset, nelem);
} else {
WAIT_NEXT;
LL::ReduceCopy(
prevInput + boffset,
thisOutput + offset,
nextOutput + boffset,
maxOffset, flag, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
ACK_PREV;
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
// We need everyone to acknowledge data even if they didn't receive anything
// so that the next collective can start right away.
ACK_PREV;
FIFO_CLEANING_AND_SAVE_STEP(flag);
}
// Low-latency tree variant of the broadcast kernel. The body is intentionally
// empty here: calling it performs no work.
// NOTE(review): presumably a placeholder so the Ring/Tree function tables have
// matching entries for every collective - confirm it is never selected at
// runtime.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 0
#include "device/broadcast.cu"
+76 -47
Просмотреть файл
@@ -1,5 +1,6 @@
#include "hip/hip_runtime.h"
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -8,18 +9,38 @@
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
#include <hip/hip_runtime.h>
#include "../collectives.h"
#include "core.h"
#include "devcomm.h"
#include "nccl.h"
#include <type_traits>
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
#define NCCL_FUNC4(coll, op, dtype) \
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
// Each thread sets a predicate to true if abort == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// HIP build: exitIfAbortBarrier is a statement macro taking two arguments.
//   abort      - non-zero when the calling thread wants to abort.
//   abortCount - pointer to a counter shared by the threads taking part in
//                the barrier.
// Sequence: a thread with abort set atomically increments *abortCount; every
// thread then waits at __syncthreads(); afterwards any thread that observes a
// non-zero counter executes s_endpgm and returns from the enclosing function.
// NOTE(review): the counter is never cleared here - presumably the kernel
// zeroes it before first use; confirm.
// NOTE(review): the expansion is several statements with a bare `return;`
// and is not wrapped in do { } while (0). It is only safe as a stand-alone
// statement inside a function returning void.
#define exitIfAbortBarrier(abort, abortCount) \
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
__syncthreads(); \
if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; }
#else
// Non-HIP build: exitIfAbortBarrier is a device function taking only `abort`.
// It sets a predicate when abort == 1, runs a bar.red.popc reduction on
// barrier 13 to count the set predicates, and executes `exit` when the count
// is non-zero.
// NOTE(review): this branch takes one argument while the HIP macro takes two,
// so a call site written for one branch will not compile against the other.
static inline __device__ void exitIfAbortBarrier(int abort) {
uint32_t popc;
asm ("{");
asm volatile (" .reg .pred barr_pred;");
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
asm ("}");
if (popc) { asm volatile ("exit;"); }
}
#endif
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype) \
NCCL_COLL_NAME(coll##LL, op, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -64,20 +85,13 @@ typedef void(*ncclKern_t)(struct CollectiveArgs* args);
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
using ncclKern_t = void (*)(struct CollectiveArgs*);
using ncclFunc_t = void (*)(struct CollectiveArgs*);
static const __device__ constexpr ncclKern_t ncclFuncs[]{
#if defined(__HIP_DEVICE_COMPILE__)
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
#endif
static const __device__ constexpr ncclFunc_t ncclFuncs[]{
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if __CUDA_ARCH__
#if defined(__HIP_DEVICE_COMPILE__)
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
@@ -88,82 +102,89 @@ static const __device__ constexpr ncclKern_t ncclFuncs[]{
template<unsigned short f, unsigned short l>
struct Caller {
static
__device__ void call(ncclColl* const c) noexcept
static __device__ __host__
void call(ncclColl* const c) noexcept
{
constexpr unsigned short m = f + (l - f) / 2;
return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
}
};
template<unsigned short f>
struct Caller<f, f + 1>{
static
__device__ void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
static __device__ __host__
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
};
inline
__device__
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept
{
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
if (c->funcIndex < 72) {
if (c->funcIndex % 2) ncclBroadcastLL_copy_i8(&c->args);
else ncclBroadcast_copy_i8(&c->args);
if (c->funcIndex % 2) ncclBroadcastRingLL_copy_i8(&c->args);
else ncclBroadcastRing_copy_i8(&c->args);
}
else if (c->funcIndex < 144) Caller<72, 144>::call(c);
else if (c->funcIndex < 216) {
if (c->funcIndex % 2) ncclAllGatherLL_copy_i8(&c->args);
else ncclAllGather_copy_i8(&c->args);
if (c->funcIndex % 2) ncclAllGatherRingLL_copy_i8(&c->args);
else ncclAllGatherRing_copy_i8(&c->args);
}
else Caller<216, 360>::call(c);
}
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
int* d = (int*)dst;
int* s = (int*)src;
__syncthreads();
// When aggregation is effective, if some threads have aborted inside the LL kernel,
// make sure the rest of the threads abort as well
exitIfAbortBarrier(0, abortCount);
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
__syncthreads();
}
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) {
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
if (tid == 0) hostColl->active = 0;
}
/* Functions for aggregation case */
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
}
#if NCCL_OP == 0
/* Kernels with the first operation inlined */
#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
__shared__ uint32_t abortCount; \
if (tid == 0) abortCount = 0; \
__syncthreads(); \
\
struct ncclComm* comm = firstColl.args.comm; \
struct ncclRing* ring = comm->rings+bid; \
struct ncclDevComm* comm = firstColl.args.comm; \
struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
channel->abortCount = &abortCount; \
if (bid == 0) { \
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
c = &firstColl; \
} else { \
c = &localColl; \
load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \
} \
while (1) { \
if (tid < c->nThreads) { \
if (tid < c->args.nThreads) { \
if (c->funcIndex == fIndex) { \
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
} else { \
NCCL_CALL_FUNCTIONS(c); \
} \
} \
int nextIndex = c->nextIndex; \
if (tid == 0) ring->collFifoHead = nextIndex; \
if (tid == 0) channel->collFifoHead = nextIndex; \
\
if (c->active == 2) { \
return; \
@@ -171,15 +192,21 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
load_coll(c, ring->devCollectives+nextIndex, tid); \
load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \
} \
}
#else
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
#endif
// Only generate inline kernels for LL
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0)
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
@@ -192,4 +219,6 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
#define COLL_UNROLL 2
#endif
+113 -106
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -8,25 +8,25 @@
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
#include "core.h"
#include "devcomm.h"
#include <cstdio>
#include <cstdint>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
// min overload for an int compared against an ssize_t; the result is returned as int.
static __device__ int min(int a, ssize_t b) {
  if (a < b) return a;
  return b;
}
typedef uint64_t PackType;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
template<class FUNC, typename T>
struct MULTI {
__device__ PackType operator()(const PackType x, const PackType y) const
{
return FUNC()(x, y);
}
__device__ PackType operator()(const PackType x, const PackType y) const
{
return FUNC()(x, y);
}
};
#else
@@ -205,15 +205,7 @@ struct MULTI<FUNC, int64_t> {
}
};
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
// Round x up to the next multiple of a; the bit-mask form requires a to be a power of two.
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
// Return ptr rounded up to the next address that is a multiple of align bytes
// (ptr itself when it already is one).
template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
  const size_t address = reinterpret_cast<size_t>(ptr);
  const size_t aligned = ALIGNUP(address, align);
  return reinterpret_cast<volatile T*>(aligned);
}
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
@@ -225,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
@@ -251,26 +243,6 @@ void vStore<half>(volatile half* ptr, const half val) {
}
#endif
// Element-wise copy/reduce of N elements, strided across threads.
// Each thread starts at element tid and advances by nthreads. When TWO_INPUTS
// is set the value from src0 is combined with src1 through FUNC; when
// TWO_OUTPUTS is set the result is also written to dest1.
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__attribute__((noinline))
__device__ inline void ReduceCopy(
    const int tid, const int nthreads,
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1,
    volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, const int N) {
  int idx = tid;
  while (idx < N) {
    T val = vFetch(src0+idx);
    if (TWO_INPUTS) {
      val = FUNC()(val, vFetch(src1+idx));
    }
    vStore(dest0+idx, val);
    if (TWO_OUTPUTS) {
      vStore(dest1+idx, val);
    }
    idx += nthreads;
  }
}
typedef ulong2 Pack128;
template<class FUNC, typename T>
@@ -281,8 +253,8 @@ struct MULTI128 {
}
};
inline __device__ void Fetch128(Pack128& v, Pack128* p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
v.x = p->x;
v.y = p->y;
#else
@@ -290,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, Pack128* p) {
#endif
}
inline __device__ void Store128(Pack128* p, Pack128& v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
p->x = v.x;
p->y = v.y;
#else
@@ -298,67 +270,104 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
#endif
}
#define WARP_SIZE 32
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
__attribute__((noinline))
__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
const int N) {
Pack128 t0[UNROLL];
Pack128 t1[UNROLL];
const Pack128* src0_end = src0 + N;
const int inc = nw * UNROLL * WARP_SIZE;
const int offset = w * UNROLL * WARP_SIZE + t;
src0 += offset; if (TWO_INPUTS) src1 += offset;
dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
const int offset, const int N) {
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
T val = vFetch(srcs[0]+idx);
#pragma unroll
for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
#pragma unroll 1
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
while (src0 < src0_end) {
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
Fetch128(t0[u], src0+u*WARP_SIZE);
if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
}
#pragma unroll
for (int u = 0; u < UNROLL; ++u) {
if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
Store128(dest0+u*WARP_SIZE, t0[u]);
if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
}
src0 += inc; if (TWO_INPUTS) src1 += inc;
dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
#pragma unroll
for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
}
}
template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
__attribute__((noinline))
__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
#define WARP_SIZE 64
// Reduce up to MAXSRCS sources into up to MAXDSTS destinations using Pack128
// loads and stores.
//
//   w, nw, t    used to build the per-thread start offset (w*UNROLL*WARP_SIZE + t)
//               and the stride (nw*UNROLL*WARP_SIZE), both counted in Pack128 units.
//               Presumably warp index, warp count and lane - confirm against the caller.
//   nsrcs/ndsts run-time number of sources/destinations; the first MINSRCS/MINDSTS
//               are always processed, the rest only while i < nsrcs / i < ndsts.
//   s, d        element-typed source/destination base pointers.
//   elemOffset  offset, in elements of T, applied to every pointer before it is
//               reinterpreted as Pack128*.
//   Npack       loop bound, compared against the Pack128 offset.
//
// Sources are combined with MULTI128<FUNC, T>. Each iteration handles UNROLL
// packs per thread, spaced WARP_SIZE packs apart.
// NOTE(review): all MAXSRCS/MAXDSTS entries of s[] and d[] are read when the
// local pointer arrays are built, including entries at or beyond nsrcs/ndsts.
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
const Pack128* srcs[MAXSRCS];
for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
Pack128* dsts[MAXDSTS];
for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
while (offset < Npack) {
Pack128 vals[UNROLL];
// Load and reduce
for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
// Sources guaranteed by the template minimum: no run-time count check.
for (int i=1; i<MINSRCS; i++) {
Pack128 vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
// Optional sources: bounded by both the template maximum and nsrcs.
#pragma unroll 1
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
Pack128 vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
// Store
for (int i = 0; i < MINDSTS; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
// Optional destinations: bounded by both the template maximum and ndsts.
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
// Advance every pointer and the loop offset by one full stride.
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
offset += inc;
}
}
// Byte offset of ptr from the previous alignof(Pack128) boundary (0 when aligned).
template <typename T>
__device__ int ptrAlign128(T* ptr) {
  const uint64_t address = (uint64_t)ptr;
  return address % alignof(Pack128);
}
// Try to limit consecutive load/stores to 8.
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
int N) {
int Nrem = N;
if (Nrem <= 0) return;
int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
int alignDiff = 0;
int align = ptrAlign128(srcs[0]);
#pragma unroll
for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
#pragma unroll
for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
// stage 0: check if we'll be able to use the fast, 128-bit aligned path.
// If not, we'll just use the slow preamble path for the whole operation
bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) &&
(!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
(!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble)));
if (!alignable) {
Npreamble = Nrem;
}
int Npreamble = alignDiff ? Nrem :
N < alignof(Pack128) ? N :
(alignof(Pack128) - align) % alignof(Pack128);
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
Nrem -= Npreamble;
if (Nrem == 0) return;
dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
if (Npreamble) {
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
Nrem -= Npreamble;
if (Nrem == 0) return;
}
int offset = Npreamble;
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
@@ -366,35 +375,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads,
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
const int PackFactor = sizeof(Pack128) / sizeof(T);
const int packFactor = sizeof(Pack128) / sizeof(T);
// stage 2a: main loop
int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
* (UNROLL * nthreads); // round down
int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
* (AUTOUNROLL * WARP_SIZE); // round down
int Nelem2a = Npack2a * packFactor;
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
int Ndone2a = Nalign2a * PackFactor;
Nrem -= Ndone2a;
Nrem -= Nelem2a;
if (Nrem == 0) return;
dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; }
offset += Nelem2a;
// stage 2b: slightly less optimized for section when we don't have full
// UNROLLs
// unrolling
int Nalign2b = Nrem / PackFactor;
int Npack2b = Nrem / packFactor;
int Nelem2b = Npack2b * packFactor;
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
int Ndone2b = Nalign2b * PackFactor;
Nrem -= Ndone2b;
Nrem -= Nelem2b;
if (Nrem == 0) return;
dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; }
offset += Nelem2b;
// stage 2c: tail
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
#endif // COMMON_KERNEL_H_
+2 -4
Просмотреть файл
@@ -1,15 +1,13 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "collectives.h"
#include "common.h"
// Workaround for https://reviews.llvm.org/D55580
__device__ void ncclWorkaroundClangD55580() {}
+28
Просмотреть файл
@@ -0,0 +1,28 @@
#!/bin/bash
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Print Makefile rules on stdout: one rule per (collective, reduction op, data
# type) combination, followed by a GENOBJS variable listing every object.
#
# Usage: <this script> <objdir>
#   <objdir>  directory that receives the generated objects; it also prefixes
#             the per-collective .dep prerequisite.
dir=$1
if [ -z "$dir" ]; then
  echo "Usage: $0 <objdir>" >&2
  exit 1
fi

# make only treats a line as a recipe when it begins with a tab character.
# Emit it from a variable so the tab cannot be lost to whitespace normalization.
tab=$'\t'

# The backslashes are doubled twice: once for the double quotes here and once
# for `echo -e` at the end, which leaves a trailing "\" line continuation.
targets="GENOBJS := \\\\\n"

for base in all_reduce all_gather broadcast reduce reduce_scatter; do
  # opn / dtn are the positions of op / dt in their lists; they are passed to
  # the compiler as -DNCCL_OP / -DNCCL_TYPE.
  opn=0
  for op in sum prod min max; do
    dtn=0
    for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do
      obj="${dir}/${base}_${op}_${dt}.o"
      echo "${obj} : ${base}.cu ${dir}/${base}.dep"
      echo "${tab}@printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${obj}"
      echo "${tab}mkdir -p ${dir}"
      echo "${tab}\${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${obj}"
      echo ""
      targets="$targets\t${obj} \\\\\n"
      dtn=$((dtn + 1))
    done
    opn=$((opn + 1))
  done
done
echo -e "$targets"
-186
Просмотреть файл
@@ -1,186 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_LL_KERNEL_H_
#define NCCL_LL_KERNEL_H_
// Poll one LL FIFO line until both of its flag words equal `flag`, then return
// the 64-bit payload built from the two data words (word 0 is the low half,
// word 2 the high half). Words 1 and 3 are the flags.
// Both branches read the whole line with a single four-dword load and repeat
// that load until the flags match; the function does not return before then.
static __device__ __attribute__((noinline)) uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
do {
// Load, wait for the load to complete, then invalidate - NOTE(review): exact
// cache semantics of glc / buffer_wbinvl1_vol should be confirmed against the ISA guide.
asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
"s_waitcnt vmcnt(0)\n"
"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
} while (i4[1] != flag || i4[3] != flag);
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
return val64;
#else
uint32_t data1, flag1, data2, flag2;
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
return val64;
#endif
}
// Write one LL FIFO line with a single four-dword store, laid out as
// { low 32 bits of val, flag, high 32 bits of val, flag }.
static __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = val & 0xffffffff;
i4[1] = flag;
i4[2] = (val >> 32);
i4[3] = flag;
asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
"s_waitcnt vmcnt(0)\n"
"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
#endif
}
// Read a 64-bit value through memcpy so that a misaligned src is handled.
static __device__ uint64_t readAL(uint64_t* src) {
  uint64_t loaded;
  memcpy((char*)&loaded, (char*)src, sizeof(loaded));
  return loaded;
}
// Write a 64-bit value through memcpy so that a misaligned dst is handled.
static __device__ void storeAL(uint64_t* dst, uint64_t val) {
  char* out = (char*)dst;
  memcpy(out, (char*)&val, sizeof(val));
}
// Copy/reduce helpers that move data between plain element buffers (T*) and
// LL FIFO lines (union ncclLLFifoLine*), which carry 64 bits of payload plus flags.
// The public ReduceCopy overloads pick which inputs and outputs are present and
// forward to ReduceCopyGeneric with the matching HAS_* template flags.
template <typename T, class FUNC>
class LLPrimitives {
private:
// Generic worker.
//   src1 / dst1  plain buffers, accessed through readAL / storeAL.
//   src2 / dst2  LL FIFO lines, accessed through readLL (waits for iflag) and
//                storeLL (writes oflag).
//   size         element count (of T); nothing is done when size <= 0.
//   nthreads     stride between the 64-bit words handled by one thread.
// When both sources are present they are combined with MULTI<FUNC, T> in the
// main loop and with FUNC in the tail.
template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
__attribute__((noinline))
static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
if (size <= 0) return;
// Number of complete 64-bit words covered by `size` elements.
size_t size64 = size * sizeof(T) / sizeof(uint64_t);
uint64_t* src1A = (uint64_t*)src1;
uint64_t* dst1A = (uint64_t*)dst1;
int offset = threadIdx.x;
// Do multiples of 64 bits
#pragma unroll 1
for (; offset < size64; offset += nthreads) {
uint64_t val;
if (HAS_SRC1) {
val = readAL(src1A+offset);
if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
} else if (HAS_SRC2) {
val = readLL(src2+offset, iflag);
}
if (HAS_DST1) storeAL(dst1A+offset, val);
if (HAS_DST2) storeLL(dst2+offset, val, oflag);
}
// Finish last word
// Only thread 0 handles the elements that do not fill a whole 64-bit word.
int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
int sizeRem = size - sizeDone;
if (threadIdx.x == 0 && sizeRem) {
const T* src1B = src1 + sizeDone;
T* dst1B = dst1 + sizeDone;
// lastVal is filled element by element through `vals`; only the first
// sizeRem elements are written before it is stored.
uint64_t lastVal;
T* vals = (T*)&lastVal;
if (HAS_SRC2) {
uint64_t lastVal2 = readLL(src2+size64, iflag);
T* src2B = (T*)&lastVal2;
for (int offset = 0; offset < sizeRem; offset++) {
vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
}
} else if (HAS_SRC1) {
for (int offset = 0; offset < sizeRem; offset++) {
vals[offset] = src1B[offset];
}
}
if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
if (HAS_DST1) {
for (int offset = 0; offset < sizeRem; offset++) {
dst1B[offset] = vals[offset];
}
}
}
}
public:
// buffer -> LL
static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
}
// LL -> buffer
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
}
// buffer + LL -> LL
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
}
// buffer + LL -> buffer
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
}
// buffer -> buffer + LL
static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
}
// LL -> buffer + LL
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
}
// buffer + LL -> buffer + LL
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
}
};
// Common macros
// Reduce a step counter modulo NCCL_LL_CHUNKS.
// The argument is parenthesized so that an expression argument such as
// STEP_TO_SLOT(step + 1) is reduced as a whole; without the parentheses `%`
// would bind only to the last operand of the expression.
#define STEP_TO_SLOT(step) \
((step) % NCCL_LL_CHUNKS)
// Statement macros shared by the LL kernels. They are not self-contained: each
// one reads or writes locals of the expanding scope (tid, step, sendHead,
// sendHeadPtr, recvHeadPtr, sizesFifo, maxOffset, llNthreads, T).

// SYNC: block-level barrier. The HIP branch uses __syncthreads(); the other
// branch uses named barrier 1 across llNthreads threads.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define SYNC __syncthreads()
#else
#define SYNC asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads))
#endif
// WAIT_NEXT: thread 0 reloads sendHead from sendHeadPtr until
// sendHead + NCCL_LL_CHUNKS exceeds step, then all threads meet at SYNC.
#define WAIT_NEXT \
if (tid == 0) { \
while (sendHead + NCCL_LL_CHUNKS <= step) { \
sendHead = LOAD(sendHeadPtr); \
} \
} \
SYNC;
// POST_SIZE: when sizesFifo is non-null, thread 0 stores into slot
// step % NCCL_LL_CHUNKS either -1 (maxOffset <= 0) or maxOffset*2*sizeof(T).
#define POST_SIZE \
if (tid == 0 && sizesFifo) { STORE(sizesFifo + step % NCCL_LL_CHUNKS, (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T))); }
// ACK_PREV: barrier, then thread 0 stores step to recvHeadPtr.
// NOTE(review): this expands to two statements, so it is not safe as the sole
// body of an unbraced if/else.
#define ACK_PREV \
SYNC; \
if (tid == 0) STORE(recvHeadPtr,step);
// FIFO_CLEANING_AND_SAVE_STEP(flag): once step has advanced more than
// NCCL_LL_CLEAN_FREQ past ring->send.conn.llLastCleaning, every thread rewrites
// its share of prevInput with lines of the form { 0, flag, 0, flag }, issues a
// system-wide fence, advances step by NCCL_LL_CHUNKS, acknowledges through
// ACK_PREV, spins until the value at sendHeadPtr reaches step, and lets thread 0
// record step as the new llLastCleaning. In every case step is then saved to
// ring->send.conn.llStep.
// Uses locals of the expanding scope: step, ring, prevInput, tid, llNthreads,
// sendHeadPtr (and recvHeadPtr through ACK_PREV).
// NOTE(review): the definition already ends with "while (0);", so a call site
// that adds its own ';' produces an extra empty statement.
#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
if (step > LOAD(&ring->send.conn.llLastCleaning) + NCCL_LL_CLEAN_FREQ) { \
/* Reset all flags */ \
static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
} \
__threadfence_system(); \
/* Restart from the same slot, only make sure sender waits for data to be reset */ \
step += NCCL_LL_CHUNKS; \
ACK_PREV; \
while (LOAD(sendHeadPtr) < step); \
{ if (tid == 0) STORE(&ring->send.conn.llLastCleaning, step); }\
} \
STORE(&ring->send.conn.llStep, step); \
} while (0);
#endif
+608 -202
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -10,229 +10,635 @@
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
#include "common.h"
#define SPINS_BEFORE_CHECK_ABORT 1000000
/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
*
* In order to reduce the repetition of template arguments, the operations
* are bundled as static methods of the Primitives class.
*
* Each primitive operation copies/reduces a contiguous buffer and syncs
* an optional set of flags against a sub-step counter. The sync value is
* based on the step parameter. Sync flags must be of type WaitFlag or
* PostFlag. The primitive routines wait for all WaitFlag args to attain
* at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
* corresponding substep by previous step) before executing the transfer.
* After each substep is transferred, all PostFlag arguments get updated to
* the value SUBSTEPS*step+substep+1.
*/
// Unroll unconditionally the first send/recv since nsend/nrecv should be at
// least 1 if SEND/RECV is set.
// FOR_SEND(func, args...): when SEND is non-zero, call func(i, args...) for
// peers 1..min(NSEND, nsend)-1 first and for peer 0 last. SEND, NSEND and nsend
// are taken from the expanding scope.
#define FOR_SEND(func, ...) do { \
if (SEND) { \
/* Send to far first, then close */ \
for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \
func(0, ##__VA_ARGS__); \
} \
} while (0)
// Flag that a thread spins on until it reaches a target value.
// `shift` is added to the loaded flag value before it is compared.
class WaitFlag {
  volatile uint64_t * const flag;
  const int shift;
 public:
  __device__
  WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }

  // Busy-wait until (flag value + shift) is at least val.
  __device__
  void wait(uint64_t val) {
    for (;;) {
      if ((LOAD(flag) + shift) >= val) return;
      /*SPIN*/
    }
  }
};
// Flag that is written to signal progress, with an optional size FIFO and an
// optional HDP register that is written before the flag.
class PostFlag {
  volatile uint64_t * const flag;
  const int shift;
  volatile int * const fifo;
  const int fifo_size;
  uint32_t * hdp_reg;
 public:
  __device__
  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
    : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }

  // remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
  // Stores (val - shift) to the flag, after writing 0x1 to hdp_reg when one was given.
  __device__
  void post(uint64_t val) {
    if (hdp_reg != NULL) STORE(hdp_reg, 0x1);
    STORE(flag, (val - shift));
  }

  // Records `size` in FIFO slot step % fifo_size; does nothing without a FIFO.
  __device__
  void postSize(uint64_t step, int size) {
    if (fifo == NULL) return;
    STORE(fifo + step%fifo_size, size);
  }
};
// Helper to check if any argument is of type T.
// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
// Only the argument types matter; their values are never used.
// Base case: no arguments left, so no match.
template<typename T> __device__
bool AnyAre() { return false; }
// Recursive case: true when the first argument's type is exactly T
// (std::is_same), otherwise check the remaining arguments.
template<typename T, typename FIRST_T, typename... TAIL_Ts>
__device__
bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
}
// Wait on all WaitFlags, ignore PostFlags
// Overload resolution on the first flag's type walks the argument list:
// a WaitFlag is waited on with `val`, a PostFlag is skipped.
// Base case: no flags left.
__device__
static void WaitOnFlags(uint64_t val) { }
// First flag is a WaitFlag: block on it, then handle the rest.
template <typename... TAIL_Ts> __device__
static void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
flag.wait(val);
WaitOnFlags(val, tail...);
}
// First flag is a PostFlag: nothing to wait on, continue with the rest.
template <typename... TAIL_Ts> __device__
static void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
WaitOnFlags(val, tail...);
}
// Post all PostFlags, ignore WaitFlags
// Overload resolution on the first flag's type walks the argument list:
// a PostFlag is posted with `val`, a WaitFlag is skipped.
// Base case: no flags left.
__device__
static void PostToFlags(uint64_t val) { }
// First flag is a WaitFlag: nothing to post, continue with the rest.
template <typename... TAIL_Ts> __device__
static void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
PostToFlags(val, tail...);
}
// First flag is a PostFlag: post to it, then handle the rest.
template <typename... TAIL_Ts> __device__
static void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
flag.post(val);
PostToFlags(val, tail...);
}
// Post sizes for PostFlags, ignore WaitFlags
// Overload resolution on the first flag's type walks the argument list:
// a PostFlag gets postSize(step, size), a WaitFlag is skipped.
// Base case: no flags left.
__device__
static void PostSizeToFlags(uint64_t step, int size) { }
// First flag is a WaitFlag: nothing to record, continue with the rest.
template <typename... TAIL_Ts> __device__
static void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
PostSizeToFlags(step, size, tail...);
}
// First flag is a PostFlag: record the size, then handle the rest.
template <typename... TAIL_Ts> __device__
static void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
flag.postSize(step, size);
PostSizeToFlags(step, size, tail...);
}
// Create pointer arithmetic syntax that doesn't break for std::nullptr_t
// Generic form: ordinary ptr + i.
template <typename Tptr> __device__
static Tptr ptradd(Tptr ptr, int i) {
return ptr + i;
}
// nullptr form: the offset is ignored and nullptr is returned, so callers can
// pass nullptr for an absent buffer and still apply an offset uniformly.
__device__
static std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
return nullptr;
}
// use different unroll numbers for all primitives for best throughput
#define COPY_UNROLL 4
#define REDUCE_UNROLL 2
#define DOUBLECOPY_UNROLL 2
#define REDUCECOPY_UNROLL 2
// FOR_RECV(func, args...): when RECV is non-zero, call func(i, args...) for
// peer 0 first and then for peers 1..min(NRECV, nrecv)-1. RECV, NRECV and nrecv
// are taken from the expanding scope.
#define FOR_RECV(func, ...) do { \
if (RECV) { \
/* Recv from close first, then far */ \
func(0, ##__VA_ARGS__); \
for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \
} \
} while (0)
// Implementation of primitive types
template <int, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
class Primitives {
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
class ncclPrimitives {
private:
template <int UNROLL,
typename SRC2_T, // either T* or std::nullptr_t
typename DST2_T, // either T* or std::nullptr_t
typename... SYNC_Ts> // either WaitFunc or PostFunc
static __device__ __attribute__((noinline)) void
GenericOp(const int tid, const int nthreads,
const T* src1,
const SRC2_T src2,
T* dst1,
DST2_T dst2,
int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
const int tid;
const int nthreads;
int nrecv = 0;
int nsend = 0;
const int stepSize;
struct ncclConnInfo* recvConn[NRECV];
struct ncclConnInfo* sendConn[NSEND];
volatile uint64_t* waitPtr;
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
uint64_t sendConnHead[NSEND];
const T* recvDirectBuff[NRECV];
T* sendDirectBuff[NSEND];
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
struct ncclDevComm* comm;
uint32_t* abortCount;
enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
"src2 must be of type T* or std::nullptr_t");
static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
"dst2 must be of type T* or std::nullptr_t");
// Element offset of peer i's current step inside its buffer: the step counter
// wraps every NCCL_STEPS steps and each step spans stepSize elements.
__device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
__device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
// Pointer to the current step's position in peer i's receive / send buffer.
__device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
__device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
int sliceSize = len / SUBSTEPS;
int sliceOffset = 0;
#pragma unroll 1
for (int sub=0; sub<SUBSTEPS; ++sub) {
int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
if (tid < nthreads) {
if (AnyAre<WaitFlag>(flags...)) {
if (tid == 0) {
WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
__syncthreads();
__device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
__syncthreads();
#else
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
#endif
}
ReduceOrCopy
<
UNROLL,
OpType,
T,
!std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
!std::is_same<SRC2_T, std::nullptr_t>::value // HAS_SRC1
>
(
tid, nthreads,
ptradd(dst1, sliceOffset),
ptradd(dst2, sliceOffset),
ptradd(src1, sliceOffset),
ptradd(src2, sliceOffset),
realSize
);
if (AnyAre<PostFlag>(flags...)) {
__syncthreads();
if(tid == 0)
PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
__threadfence_system();
if(tid == 0)
PostToFlags(SUBSTEPS*step + sub + 1, flags...);
}
uint32_t mismatch = 0;
const uint64_t opCount;
// Two-stage detection of an operation mismatch with the peer.
// First stage: if remoteOpCount is non-null and the value loaded from it is
// greater than opCount, `mismatch` is incremented. Second stage: a later call
// that finds `mismatch` already set stores ncclDevAssertedMismatch into
// comm->fatalDevError.
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
if (mismatch) {
// In non-LL, we use __threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
STORE(comm->fatalDevError, ncclDevAssertedMismatch);
} else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
mismatch += 1;
}
}
uint32_t spins = 0;
uint32_t abort = 0;
// Called from the spin loops. Counts calls in `spins`; on every
// SPINS_BEFORE_CHECK_ABORT-th call it reloads comm->abortFlag into `abort`,
// runs checkMismatch and restarts the count. Returns the cached `abort` value.
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
  spins++;
  if (spins != SPINS_BEFORE_CHECK_ABORT) return abort;
  abort = LOAD(comm->abortFlag);
  checkMismatch(remoteOpCount);
  spins = 0;
  return abort;
}
// Advance peer i's receive step by SLICESTEPS and, on thread i only, spin until
// the value loaded from waitPtr reaches the new recvStep[i] or checkAbort
// reports an abort. Every calling thread resets spins/mismatch and advances
// recvStep[i]; only thread i waits.
// With ENABLE_PROFILING the cycles spent spinning are added to
// devProf->wait_recv_cycle for this block.
__device__ void waitRecv(int i) {
spins = 0;
mismatch = 0;
recvStep[i] += SLICESTEPS;
if (tid == i) {
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t t0 = clock64();
#endif
while (LOAD(waitPtr) < recvStep[i]) {
if (checkAbort(recvConn[i]->opCountRem)) break;
}
#ifdef ENABLE_PROFILING
__atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
#endif
}
}
// Advance peer i's send step by SLICESTEPS and, on thread WARP_SIZE+i only,
// spin while the cached sendConnHead[i] is more than NCCL_STEPS behind the new
// sendStep[i], refreshing the cache from waitPtr on each iteration; the loop
// also exits when checkAbort reports an abort. Every calling thread resets
// spins/mismatch and advances sendStep[i]; only thread WARP_SIZE+i waits.
// With ENABLE_PROFILING the cycles spent spinning are added to
// devProf->wait_send_cycle for this block.
__device__ void waitSend(int i) {
spins = 0;
mismatch = 0;
sendStep[i] += SLICESTEPS;
if (tid == WARP_SIZE+i) {
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t t0 = clock64();
#endif
while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
sendConnHead[i] = LOAD(waitPtr);
if (checkAbort(sendConn[i]->opCountRem)) break;
}
#ifdef ENABLE_PROFILING
__atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
#endif
}
}
// Publish peer i's current receive step to that connection's head.
inline __device__ void postRecv(int i) {
  struct ncclConnInfo* conn = recvConn[i];
  STORE(conn->head, recvStep[i]);
}
// Publish peer i's current send step to that connection's tail. When the
// connection has a next_hdp_reg, 0x1 is stored to it first.
// NOTE(review): presumably an HDP flush so the data is visible before the tail - confirm.
inline __device__ void postSend(int i) {
  struct ncclConnInfo* conn = sendConn[i];
  if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
  STORE(conn->tail, sendStep[i]);
}
// Record `size` in peer i's size FIFO, in the slot of the step that preceded
// the current sendStep by SLICESTEPS. Does nothing when the connection has no FIFO.
__device__ void postSendSize(int i, int size) {
  if (!sendConn[i]->fifo) return;
  const uint64_t slot = (sendStep[i]-SLICESTEPS)%NCCL_STEPS;
  STORE(sendConn[i]->fifo+slot, size);
}
// Source pointer for peer i: the direct buffer plus directOffset when
// DIRECTRECV is enabled and a direct buffer is set, otherwise the regular
// receive pointer for the current step.
template <int DIRECTRECV>
__device__ const T* directRecvPtr(int i, int directOffset) {
  if (DIRECTRECV && recvDirectBuff[i]) {
    return recvDirectBuff[i]+directOffset;
  }
  return recvPtr(i);
}
// Destination pointer for peer i: the direct buffer plus directOffset when
// DIRECTSEND is enabled and a direct buffer is set, otherwise the regular
// send pointer for the current step.
template <int DIRECTSEND>
__device__ T* directSendPtr(int i, int directOffset) {
  if (DIRECTSEND && sendDirectBuff[i]) {
    return sendDirectBuff[i]+directOffset;
  }
  return sendPtr(i);
}
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
__device__ void
GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
int offset = 0;
int sliceSize = stepSize * SLICESTEPS;
const T* srcs[RECV*NRECV+SRC];
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
if (RECV) {
if (SRC) srcs[1] = recvPtr(0);
for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
}
T* dsts[SEND*NSEND+DST];
dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
if (SEND) {
if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
}
#pragma unroll 1
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
int realSize = max(0, min(sliceSize, nelem-offset));
FOR_SEND(waitSend);
FOR_RECV(waitRecv);
if (realSize > 0) {
barrier();
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
}
} else {
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
}
sliceOffset += sliceSize;
exitIfAbortBarrier(abort, abortCount);
if (tid == 0) FOR_SEND(postSendSize, realSize*sizeof(T));
if (SEND) __threadfence_system();
if (tid == 0) FOR_SEND(postSend);
if (tid == 0) FOR_RECV(postRecv);
}
for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
offset += sliceSize;
}
// Register `conn` as receive connection i and cache its state.
//   conn        connection to receive from.
//   i           slot index; thread i becomes the waiter for this connection.
//   directBuff  optional user buffer offered for direct access; only used when
//               the connection reports `direct`.
// Increments nrecv.
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
recvConn[i] = conn;
recvBuff[i] = (const T*)LOAD(&recvConn[i]->buff);
recvStep[i] = LOAD(&recvConn[i]->step);
// Start on a chunk boundary (a multiple of SLICESPERCHUNK*SLICESTEPS steps).
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
// Return credits in case we rounded up.
if (tid == 0) STORE(recvConn[i]->head, recvStep[i]);
if (tid == i) {
// Thread i polls this connection's tail in waitRecv and publishes the local opCount.
waitPtr = LOAD(&recvConn[i]->tail);
STORE(recvConn[i]->opCountLoc, opCount);
}
recvDirectBuff[i] = NULL;
if (directBuff && recvConn[i]->direct) {
recvDirectBuff[i] = directBuff;
// Hand the buffer address to the peer through ptrExchange.
if (tid == 0) STORE(recvConn[i]->ptrExchange, directBuff);
}
nrecv++;
}
// Registers send connection `conn` in slot i: caches its buffer pointer and step
// counter, and increments nsend.
// directBuff: when non-null and the connection is flagged `direct`, every thread spins
// until a non-NULL pointer appears in ptrExchange and uses it as sendDirectBuff[i].
__device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
sendConn[i] = conn;
sendBuff[i] = (T*)LOAD(&sendConn[i]->buff);
sendStep[i] = LOAD(&sendConn[i]->step);
// Start on a chunk boundary: a chunk spans SLICESPERCHUNK*SLICESTEPS steps.
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
// Thread WARP_SIZE+i is the one that holds this connection's head pointer, caches
// its current value and publishes opCount.
if (tid == WARP_SIZE+i) {
waitPtr = LOAD(&sendConn[i]->head);
sendConnHead[i] = LOAD(waitPtr);
STORE(sendConn[i]->opCountLoc, opCount);
}
sendDirectBuff[i] = NULL;
if (directBuff && sendConn[i]->direct) {
void* volatile* ptr = sendConn[i]->ptrExchange;
// Busy-wait until the exchange slot holds a pointer.
while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
// All threads must have read the pointer before thread 0 clears the slot.
__syncthreads();
if (tid == 0) STORE(ptr, NULL);
}
nsend++;
}
// Writes the local receive step of slot i back to the connection so a later
// operation can resume from it. Only thread i performs the update.
__device__ void saveRecvConn(int i) {
if (tid == i) {
STORE(&recvConn[i]->step, recvStep[i]);
// Fence between the step store and the op-count increment that follows.
__threadfence_system();
__atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
}
}
// Writes the local send step of slot i back to the connection so a later
// operation can resume from it. Only thread WARP_SIZE+i performs the update.
__device__ void saveSendConn(int i) {
if (tid == WARP_SIZE+i) {
STORE(&sendConn[i]->step, sendStep[i]);
// Fence between the step store and the op-count increment that follows.
__threadfence_system();
__atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
}
}
public:
template <typename... SYNC_Ts>
static __device__ void
Copy(const int tid, const int nthreads, const T* src, T* dst,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp<COPY_UNROLL>(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
// Builds the primitives object for one channel.
//   tid, nthreads        : calling thread's index and the number of participating threads
//   recvPeers, sendPeers : peer indices into channel->devPeers; at most NRECV / NSEND
//                          entries are read and scanning stops at the first negative one
//   directBuff           : accepted but not forwarded (see the note in the body)
//   stepSize             : stored as-is; GenericOp multiplies it by SLICESTEPS to get the slice size
//   opCount              : published to each connection's opCountLoc while loading it
__device__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it
abortCount = channel->abortCount;
__syncthreads();
// disable directBuff
// (0 is passed instead of directBuff, so recvDirectBuff/sendDirectBuff stay NULL.)
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
}
template <typename... SYNC_Ts>
static __device__ void
DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp<DOUBLECOPY_UNROLL>(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
// Sends nelem elements read from src to the send connection(s).
// GenericOp flags <DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST>: SEND and SRC only.
__device__ void
send(const T* src, int nelem) {
GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
}
// Like send(), but with DIRECTSEND set: when a direct buffer is registered for a send
// connection the data is written to that buffer at directOffset instead of the
// regular send buffer.
__device__ void
directSend(const T* src, int directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
}
template <typename... SYNC_Ts>
static __device__ void
Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp<REDUCE_UNROLL>(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
// Receives nelem elements from the receive connection(s) into dst.
// GenericOp flags <DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST>: RECV and DST only.
__device__ void
recv(T* dst, int nelem) {
GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
}
// Like recv(), but with DIRECTRECV set: when a direct receive buffer is registered,
// GenericOp skips the local copy because the data is already at its destination.
__device__ void
directRecv(T* dst, int directOffset, int nelem) {
GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
}
template <typename... SYNC_Ts>
static __device__ void
ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
GenericOp<REDUCECOPY_UNROLL>(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
// Copies nelem elements from src to the local dst and also to the send connection(s).
// GenericOp flags <DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST>: SEND, SRC and DST.
__device__ void
copySend(const T* src, T* dst, int nelem) {
GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
}
// Like copySend(), but with DIRECTSEND set so a registered direct send buffer
// (offset by directOffset) is used in place of the regular send buffer.
__device__ void
directCopySend(const T* src, T* dst, int directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
}
// Receives nelem elements, stores them in the local dst and forwards them to the
// send connection(s).
// GenericOp flags <DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST>: RECV, SEND and DST.
__device__ void
recvCopySend(T* dst, int nelem) {
GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
}
// Like recvCopySend(), but with both DIRECTRECV and DIRECTSEND set so registered
// direct buffers (offset by directOffset) are used on either side when available.
__device__ void
directRecvCopySend(T* dst, int directOffset, int nelem) {
GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
}
// Combines nelem received elements with the local src (through ReduceOrCopyMulti with
// FUNC) and stores the result in the local dst.
// GenericOp flags <DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST>: RECV, SRC and DST.
__device__ void
recvReduceCopy(const T* src, T* dst, int nelem) {
GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0);
}
// Combines nelem received elements with the local src (through ReduceOrCopyMulti with
// FUNC) and forwards the result to the send connection(s); nothing is stored locally.
// GenericOp flags <DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST>: RECV, SEND and SRC.
__device__ void
recvReduceSend(const T* src, int nelem) {
GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0);
}
// Combines nelem received elements with the local src (through ReduceOrCopyMulti with
// FUNC), stores the result in the local dst and forwards it to the send connection(s).
// GenericOp flags <DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST>: RECV, SEND, SRC and DST.
__device__ void
recvReduceCopySend(const T* src, T* dst, int nelem) {
GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
}
// Like recvReduceCopySend(), but with DIRECTSEND set so a registered direct send
// buffer (offset by directOffset) is used. DIRECTRECV stays 0 despite the name.
__device__ void
directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
// Direct is only for the send part
GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
}
// Persists the step counters of every loaded connection when the object goes away.
__device__ ~ncclPrimitives() {
// Save steps for next collective. Each save helper gates on one specific thread id
// (i for receive slots, WARP_SIZE+i for send slots), so a single thread per
// connection performs the write-back.
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
}
};
#endif // end include guard
// "LL" flavour of the communication primitives.
// Data travels through FIFO lines (union ncclLLFifoLine): storeLL writes each 64-bit
// payload as four 32-bit words {low data, flag, high data, flag}, and readLL spins
// until both flag words equal the value expected for the current step.
// Template parameters: T is the element type, FUNC the reduction functor handed to
// MULTI<FUNC, T>, NRECV / NSEND the maximum number of receive / send connections.
template <typename T, class FUNC, int NRECV, int NSEND>
class ncclLLPrimitives {
private:
const int tid;
const int nthreads;
// Number of connections actually loaded by the constructor.
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn[NRECV];
struct ncclConnInfo* sendConn[NSEND];
// Per-thread pointers; each is only assigned in the thread that owns a connection
// (see loadRecvConn / loadSendConn).
volatile uint64_t* waitPtr;
volatile uint64_t* postPtr;
volatile int* fifoPtr;
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
// Last value read from the send connection's head, cached by the owning thread.
uint64_t sendConnHead;
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclDevComm* comm;
uint32_t* abortCount;
// Line offset of the current step's slice inside the FIFO (the step wraps modulo NCCL_STEPS).
__device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
__device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
// First FIFO line of the current step's slice.
__device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
__device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
// Flag value that marks a line as belonging to the current step.
__device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
__device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
// (Only compiled on the non-HIP path; it relies on PTX inline assembly.)
__device__ void exitIfAbortLocalBarrier() {
uint32_t popc;
asm ("{");
asm volatile (" .reg .pred barr_pred;");
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
asm ("}");
if (popc) {
// Make sure threads not participating in the operation get the abort and all threads exit
exitIfAbortBarrier(1);
}
}
#endif
// Barrier across the participating threads: __syncthreads() on HIP, a named barrier
// (id 1) over nthreads threads otherwise.
__device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
__syncthreads();
#else
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
#endif
}
// Number of abort checks during the current wait in which the remote op count was
// seen ahead of ours.
uint32_t mismatch = 0;
const uint64_t opCount;
// Called from checkAbort: counts how often the remote op count is ahead of ours and
// reports ncclDevSuspectedMismatch once that has happened more than 20 times.
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
if (mismatch > 20) {
// We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
// Note that we are not using _threadfence_system in LL so the error cannot be asserted
STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
} else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
mismatch += 1;
}
}
// Spin counter used to rate-limit abort checks, and the last abort flag value read.
uint32_t spins = 0;
uint32_t abort = 0;
// Called once per spin iteration. Every SPINS_BEFORE_CHECK_ABORT calls it refreshes
// `abort` from comm->abortFlag and runs the mismatch check. Returns the cached abort value.
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
spins++;
if (spins == SPINS_BEFORE_CHECK_ABORT) {
abort = LOAD(comm->abortFlag);
checkMismatch(remoteOpCount);
spins = 0;
}
return abort;
}
// Executed fully only by thread WARP_SIZE+i: waits until the send FIFO of slot i has
// room for one more step (or an abort is seen), then records the size of the slice
// about to be written in the connection's size fifo when one is present.
__device__ void waitSend(int i, int nbytes) {
spins = 0;
mismatch = 0;
if (tid == WARP_SIZE+i) {
while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
sendConnHead = LOAD(waitPtr);
if (checkAbort(sendConn[i]->opCountRem)) break;
}
if (fifoPtr) {
// On a cleanup step the whole slice is written (see postSend), so the full slice size is reported.
int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size);
}
}
}
// Advances the receive step of slot i in every thread; thread i also publishes the
// new value through postPtr.
__device__ void postRecv(int i) {
recvStep[i]++;
if (tid == i) STORE(postPtr, recvStep[i]);
}
// Advances the send step of slot i. `offset` is the first line index this thread did
// not write during the data loop; on cleanup steps the thread keeps striding from
// there to fill the remainder of the slice.
__device__ void postSend(int i, int offset) {
// LL Cleanup : write all flags in the slice to make sure we don't have
// data corruption when flag loops over.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
// Reads the 64-bit payload of line `offset` in the current receive slice of slot i.
// Spins until both flag words of the line match the expected flag, or until
// checkAbort reports an abort (in which case the returned value is whatever was
// last loaded).
__device__ __attribute__((noinline)) uint64_t readLL(int i, int offset) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
spins = 0;
mismatch = 0;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
do {
asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
"s_waitcnt vmcnt(0)\n"
"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
if (i4[1] == flag && i4[3] == flag) break;
} while (!checkAbort(recvConn[i]->opCountRem));
// Words 0 and 2 carry the low and high halves of the payload.
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
#else
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
if (checkAbort(recvConn[i]->opCountRem)) break;
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
#endif
return val64;
}
// Writes one FIFO line as a single 16-byte store: {low 32 bits of val, flag,
// high 32 bits of val, flag}.
__device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = val & 0xffffffff;
i4[1] = flag;
i4[2] = (val >> 32);
i4[3] = flag;
asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
"s_waitcnt vmcnt(0)\n"
"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
#endif
}
// Using memcpy handles misaligned pointers.
// Note that readAL always copies a full 8 bytes from src.
__device__ uint64_t readAL(uint64_t* src) {
uint64_t val;
memcpy((char*)&val, (char*)src, sizeof(uint64_t));
return val;
}
// Copies the first nbytes bytes of val to dst.
__device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
memcpy((char*)dst, (char*)&val, nbytes);
}
// Generic LL step shared by the public primitives below.
// Template flags (each 0 or 1): RECV / SEND use the receive / send connections,
// SRC / DST use the local srcPtr / dstPtr. A negative nelem is treated as 0.
template <int RECV, int SEND, int SRC, int DST>
__device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
// Each 8-byte payload occupies a 16-byte FIFO line (see storeLL), hence nbytes*2.
FOR_SEND(waitSend, nbytes*2);
barrier();
// Number of 64-bit packs, counting a trailing partial one.
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
uint64_t* srcPack = (uint64_t*)srcPtr;
uint64_t* dstPack = (uint64_t*)dstPtr;
int offset = tid;
// Do multiples of 64 bits
#pragma unroll 1
for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
if (RECV) {
if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
for (int i=1; i<NRECV && i<nrecv; i++) {
val = MULTI<FUNC, T>()(readLL(i, offset), val);
}
}
// Send : inter-node, then intra-node, then local
if (SEND) {
for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
storeLL(sendPtr(0)+offset, val, sendFlag(0));
}
if (DST) {
// True only for the pack that contains byte `nbytes` when nbytes is not a
// multiple of 8, i.e. the trailing partial pack.
if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
// Last incomplete word
storeAL(dstPack+offset, val, nbytes & 0x7);
} else {
storeAL(dstPack+offset, val, sizeof(uint64_t));
}
}
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
exitIfAbortBarrier(abort, abortCount);
#else
exitIfAbortLocalBarrier();
#endif
FOR_RECV(postRecv);
// `offset` is now the first line index this thread did not write.
FOR_SEND(postSend, offset);
}
// Registers receive connection `conn` in slot i and increments nrecv. Thread i keeps
// the head pointer used by postRecv and publishes opCount.
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
recvConn[i] = conn;
recvBuff[i] = recvConn[i]->llBuff;
recvStep[i] = recvConn[i]->step;
if (tid == i) {
postPtr = recvConn[i]->head;
STORE(recvConn[i]->opCountLoc, opCount);
}
nrecv++;
}
// Registers send connection `conn` in slot i and increments nsend. Thread WARP_SIZE+i
// keeps the head and size-fifo pointers used by waitSend, caches the current head
// value and publishes opCount.
__device__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendConn[i] = conn;
sendBuff[i] = sendConn[i]->llBuff;
sendStep[i] = sendConn[i]->step;
if (tid == WARP_SIZE+i) {
waitPtr = sendConn[i]->head;
fifoPtr = sendConn[i]->fifo;
sendConnHead = LOAD(waitPtr);
STORE(sendConn[i]->opCountLoc, opCount);
}
nsend++;
}
// Thread i writes the receive step of slot i back to the connection and increments
// its local op count.
__device__ void saveRecvConn(int i) {
if (tid == i) {
recvConn[i]->step = recvStep[i];
__atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
__threadfence_block();
}
}
// Thread WARP_SIZE+i writes the send step of slot i back to the connection and
// increments its local op count.
__device__ void saveSendConn(int i) {
if (tid == WARP_SIZE+i) {
sendConn[i]->step = sendStep[i];
__atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
__threadfence_block();
}
}
public:
// Builds the LL primitives for one channel. recvPeers / sendPeers are peer indices
// into channel->devPeers; at most NRECV / NSEND entries are read and scanning stops
// at the first negative entry.
__device__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
// Make sure step is updated before we read it.
abortCount = channel->abortCount;
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
}
// Public primitives. Each one forwards to LLGenericOp<RECV, SEND, SRC, DST>.
// send: local src -> send connection(s).
__device__ void send(const T* src, int nelem) {
return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
}
// recv: receive connection -> local dst.
__device__ void recv(T* dst, int nelem) {
return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}
// recvReduceSend: FUNC(received, src) -> send connection(s).
__device__ void recvReduceSend(const T* src, int nelem) {
return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
}
// recvReduceCopy: FUNC(received, src) -> local dst.
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
}
// copySend: local src -> local dst and send connection(s).
__device__ void copySend(const T* src, T* dst, int nelem) {
return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
}
// recvCopySend: received -> local dst and send connection(s).
__device__ void recvCopySend(T* dst, int nelem) {
return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
}
// recvReduceCopySend: FUNC(received, src) -> local dst and send connection(s).
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
}
__device__ ~ncclLLPrimitives() {
// Save steps for the next operation
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
}
};
// Optional per-primitive profiling counters; both macros expand to nothing unless
// ENABLE_PROFILING is defined. They expect tid, devProf, t0, ws and wr (plus nelem
// and T for ACCUMULATE_COUNTER) to be in scope at the expansion site.
#ifdef ENABLE_PROFILING
// INIT_COUNTER: thread 0 records the start clock in t0 and snapshots this block's
// wait_send_cycle / wait_recv_cycle counters into ws / wr.
#define INIT_COUNTER \
if (tid==0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); }
// ACCUMULATE_COUNTER(prim): thread 0 atomically adds to devProf->prim_cycle the clock
// ticks elapsed since INIT_COUNTER minus the growth of the two wait counters over the
// same interval, and adds nelem * sizeof(T) to devProf->prim_byte.
#define ACCUMULATE_COUNTER(prim) \
if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
+ ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \
+ wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \
__ATOMIC_SEQ_CST); \
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
#else
#define INIT_COUNTER
#define ACCUMULATE_COUNTER(prim)
#endif
#endif
+1 -5
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,12 +11,7 @@
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
#endif
+41 -134
Просмотреть файл
@@ -1,153 +1,82 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and boffset for buffer sync
#define NEXT_STEP \
step++; \
boffset += sliceSize; \
if (boffset == buffSize) boffset = 0;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->next_hdp_reg);
typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
if (tid == 0) {
// Update in case we skipped some collectives
STORE(ring->recv.conn.opCount, args->opCount);
if (rank != root) {
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
}
__syncthreads();
uint64_t step = 0ULL;
int boffset = 0;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (prevRank == root) {
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
prims.send(thisInput+offset, nelem);
} else if (rank == root) {
Prims::Reduce(tid, nthreads,
prevInput + boffset,
thisInput + offset,
thisOutput + offset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
Prims::Reduce(tid, nthreads,
prevInput + boffset,
thisInput + offset,
nextOutput + boffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
prims.recvReduceSend(thisInput+offset, nelem);
}
NEXT_STEP; // Increases step, boffset
}
if (tid == 0) {
if (rank != root) {
// Wait for next to have consumed data before resetting the flag
waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
STORE(ring->send.conn.head, 0ULL);
}
STORE(ring->recv.conn.tail, 0ULL);
__threadfence_system();
STORE(ring->recv.conn.opCount, args->opCount+1);
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
boffset += NCCL_LL_SLICE_LINES; \
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
flag++; \
step++;
// Tree-named variant of the reduce kernel. The body is empty: this entry point
// performs no work and only provides the symbol.
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int nranks = comm->nRanks;
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
const int rank = comm->rank;
const int nranks = comm->nRanks;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
typedef LLPrimitives<T, FUNC> LL;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t flag = step + 1;
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -155,39 +84,17 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
}
ssize_t offset = gridOffset + bid*chunkSize;
int maxOffset = min(chunkSize, size-offset);
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + boffset,
maxOffset, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LL::ReduceCopy(
thisInput + offset,
prevInput + boffset,
thisOutput + offset,
maxOffset, flag, llNthreads);
NEXT_STEP_LL;
ACK_PREV;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + boffset,
nextOutput + boffset,
maxOffset, flag, flag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
ACK_PREV;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
// We need everyone to acknowledge data even if they didn't receive anything
// so that the next collective can start right away.
ACK_PREV;
FIFO_CLEANING_AND_SAVE_STEP(flag);
}
// Tree-named LL variant of the reduce kernel. The body is empty: this entry point
// performs no work and only provides the symbol.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 0
#include "device/reduce.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 1
#include "device/reduce.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 2
#include "device/reduce.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 3
#include "device/reduce.cu"
+36 -80
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -19,7 +19,7 @@ struct FuncNull {
}
};
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
//we really don't need any specializations and we don't need
//to break things into uint32_t
@@ -164,30 +164,31 @@ struct FuncMin {
}
};
#define MASK0 0x00ff00ff
#define MASK1 0xff00ff00
// Adds the four 8-bit lanes packed in x and y independently; each lane wraps modulo 256.
// Because only the low 8 bits of each lane sum are kept, the same routine serves
// signed and unsigned 8-bit addition.
static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
  // Bytes 0 and 2: the cleared byte above each lane absorbs the carry, which the
  // final mask then discards.
  const uint32_t evenLanes = ((x & MASK0) + (y & MASK0)) & MASK0;
  // Bytes 1 and 3: same scheme; the carry out of byte 3 falls off the 32-bit word.
  const uint32_t oddLanes = ((x & MASK1) + (y & MASK1)) & MASK1;
  return evenLanes | oddLanes;
}
template<>
struct FuncSum<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x + cy.a.x;
cr.a.y = cx.a.y + cy.a.y;
cr.a.z = cx.a.z + cy.a.z;
cr.a.w = cx.a.w + cy.a.w;
return cr.storage;
return addChar4(x, y);
#endif
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -196,28 +197,16 @@ struct FuncSum<int8_t> {
};
template<>
struct FuncSum<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a.x = cx.a.x + cy.a.x;
cr.a.y = cx.a.y + cy.a.y;
cr.a.z = cx.a.z + cy.a.z;
cr.a.w = cx.a.w + cy.a.w;
return cr.storage;
return addChar4(x, y);
#endif
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -227,22 +216,6 @@ struct FuncSum<uint8_t> {
static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
/* This can be used both for signed and unsigned 8-bit multiplication */
#if (__CUDA_ARCH__ >= 300)
uint32_t rv;
asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
" vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
" vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
" shl.b32 t3, t3, 16;\n\t"
" shl.b32 t2, t2, 16;\n\t"
" vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
" shl.b32 t1, t1, 8;\n\t"
" vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
" and.b32 t1, t1, 0xff00ff00;\n\t"
" and.b32 t0, t0, 0x00ff00ff;\n\t"
" or.b32 %0, t0, t1;\n\t"
"}" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
union converter { uint32_t storage; char4 a; };
converter cx, cy, cr;
cx.storage = x;
@@ -252,7 +225,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
cr.a.z = cx.a.z * cy.a.z;
cr.a.w = cx.a.w * cy.a.w;
return cr.storage;
#endif
}
template<>
@@ -278,17 +250,12 @@ template<>
struct FuncMax<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -298,6 +265,7 @@ struct FuncMax<int8_t> {
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -308,17 +276,12 @@ template<>
struct FuncMax<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -328,6 +291,7 @@ struct FuncMax<uint8_t> {
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -339,17 +303,12 @@ template<>
struct FuncMin<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
"vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -359,6 +318,7 @@ struct FuncMin<int8_t> {
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -369,17 +329,12 @@ template<>
struct FuncMin<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
int32_t rv;
asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
"vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
"vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
"vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -389,6 +344,7 @@ struct FuncMin<uint8_t> {
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -480,6 +436,6 @@ struct FuncMin<half> {
}
};
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#endif // REDUCE_KERNEL_H_
+1 -6
Просмотреть файл
@@ -11,12 +11,7 @@
#define UNROLL 4
#if NCCL_OP == 0
IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
#elif NCCL_OP == 1
IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
#elif NCCL_OP == 2
IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
#elif NCCL_OP == 3
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
#endif
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
+38 -130
Просмотреть файл
@@ -1,166 +1,93 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
// Increase Step and poffset/noffset for buffer sync
#define NEXT_STEP \
step++; \
poffset = noffset; \
noffset += sliceSize; \
if (noffset == buffSize) noffset = 0;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->next_hdp_reg);
typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int buffSize = ring->buffSize / sizeof(T);
const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
if (tid == 0) {
// Update in case we skipped some collectives
STORE(ring->recv.conn.opCount, args->opCount);
// Wait for next to be ready
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
waitOpCountNext.wait(args->opCount);
}
__syncthreads();
uint64_t step = 0ULL;
int poffset, noffset = 0;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*chunkSize;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
Prims::Copy(tid, nthreads,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext,
postReadyToNext);
NEXT_STEP; // Increases step, poffset, noffset
prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
nextOutput + noffset,
sliceSize, maxOffset,
step,
waitDoneFromNext, waitReadyFromPrev,
postReadyToNext, postDoneToPrev);
NEXT_STEP;
prims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
// step k-1: reduce this buffer and data, which will produce the final result
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
Prims::Reduce(tid, nthreads,
prevInput + poffset,
thisInput + offset,
thisOutput + chunkOffset,
sliceSize, maxOffset,
step,
waitReadyFromPrev,
postDoneToPrev);
}
if (tid == 0) {
waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
STORE(ring->send.conn.head, 0ULL);
STORE(ring->recv.conn.tail, 0ULL);
__threadfence_system();
STORE(ring->recv.conn.opCount, args->opCount+1);
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
#include "ll_kernel.h"
#define NEXT_STEP_LL \
poffset = noffset; \
pflag = nflag; \
noffset += NCCL_LL_SLICE_LINES; \
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
nflag++; \
step++;
// Tree variant of the ReduceScatter kernel: intentionally an empty body.
// NOTE(review): presumably kept so that code instantiating both Ring and Tree
// kernel names for every collective still resolves -- confirm against the
// kernel-generation macros.
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int llNthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclRing* ring = comm->rings+blockIdx.x;
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
volatile int * sizesFifo = ring->send.conn.llFifo;
uint64_t sendHead = sendHeadPtr[0];
const int nthreads = args->nThreads;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
typedef LLPrimitives<T, FUNC> LL;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nRings*chunkSize;
uint64_t step = ring->send.conn.llStep;
uint32_t pflag, nflag = step + 1;
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -170,37 +97,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int maxOffset = min(chunkSize, size-chunkOffset);
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
nextOutput + noffset,
maxOffset, nflag, llNthreads);
POST_SIZE;
NEXT_STEP_LL;
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
WAIT_NEXT;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
nextOutput + noffset,
maxOffset, pflag, nflag, llNthreads);
POST_SIZE;
ACK_PREV;
NEXT_STEP_LL;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
@@ -208,13 +119,10 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LL::ReduceCopy(
thisInput + offset,
prevInput + poffset,
thisOutput + chunkOffset,
maxOffset, pflag, llNthreads);
ACK_PREV;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
// Tree + low-latency (LL) variant of the ReduceScatter kernel: intentionally an
// empty body; only the Ring variants in this file do any work.
// NOTE(review): presumably a placeholder so Tree/LL kernel names exist for every
// collective -- confirm against the kernel-generation macros.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 0
#include "device/reduce_scatter.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 1
#include "device/reduce_scatter.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 2
#include "device/reduce_scatter.cu"
-8
Просмотреть файл
@@ -1,8 +0,0 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#define NCCL_OP 3
#include "device/reduce_scatter.cu"
+19
Просмотреть файл
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
// Public entry point for the Reduce collective.
// Packs the caller's arguments, together with the Reduce chunk/slice step
// constants, into an ncclInfo descriptor and hands it to ncclEnqueueCheck(),
// whose result is returned unchanged.
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  struct ncclInfo reduceInfo = {
    ncclCollReduce, "Reduce",
    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS
  };
  return ncclEnqueueCheck(&reduceInfo);
}
-33
Просмотреть файл
@@ -1,33 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
// Enqueue worker for Reduce (pre-refactor implementation).
// Logs the call, then either performs a plain device-to-device copy when the
// communicator has a single rank, or registers proxy operations and saves the
// kernel for launch. Returns ncclSuccess unless one of the checked calls fails.
ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
// Payload size in bytes: element count times the per-element size of datatype.
size_t nbytes = count*ncclTypeSize(datatype);
INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
// Single rank: the result is the input; copy only for out-of-place calls.
if (sendbuff != recvbuff)
CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
} else {
// Multi-rank: set up proxies with the "to root" pattern, then queue the kernel.
NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
}
return ncclSuccess;
}
// Public entry point for Reduce (pre-refactor form): forwards every argument,
// plus the worker function ncclReduceFunc and the operation name "Reduce", to
// ncclEnqueueCheck() and returns its result.
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
op, root, comm, stream);
}
+19
Просмотреть файл
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
// Public entry point for the ReduceScatter collective.
// Builds an ncclInfo descriptor from the caller's arguments (root is fixed to 0)
// plus the ReduceScatter chunk/slice step constants, and returns whatever
// ncclEnqueueCheck() reports for it.
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
  struct ncclInfo scatterInfo = {
    ncclCollReduceScatter, "ReduceScatter",
    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS
  };
  return ncclEnqueueCheck(&scatterInfo);
}
-32
Просмотреть файл
@@ -1,32 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
// Enqueue worker for ReduceScatter (pre-refactor implementation).
// Logs the call, then either copies sendbuff to recvbuff when there is a single
// rank, or registers ring-pattern proxy operations and saves the kernel.
// Returns ncclSuccess unless one of the checked calls fails.
ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
// Size in bytes of `count` elements of `datatype`.
size_t nbytes = count*ncclTypeSize(datatype);
INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
if (comm->nRanks == 1) {
// Single rank: nothing to reduce; copy only for out-of-place calls.
if (sendbuff != recvbuff)
CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
} else {
// Multi-rank: the byte size handed on is nbytes scaled by the number of ranks.
NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
}
return ncclSuccess;
}
// Public entry point for ReduceScatter (pre-refactor form): forwards the
// arguments, the worker ncclReduceScatterFunc, the name "ReduceScatter" and a
// fixed root of 0 to ncclEnqueueCheck() and returns its result.
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
op, 0, comm, stream);
}
+441
Просмотреть файл
@@ -0,0 +1,441 @@
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "checks.h"
#include "param.h"
#include "collectives/collectives.h"
// Kernel lookup table construction.
// The macros below expand, level by level, into the comma-separated initialiser
// list of ncclKerns[]: collective -> reduction op -> data type -> kernel entries.
// Only generate inline kernels for LL
// NCCL_FUNC5 emits TWO entries per (coll, op, dtype), both naming the same
// coll##LL kernel.
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_KERN_NAME(coll##LL, op, dtype), \
NCCL_KERN_NAME(coll##LL, op, dtype)
// NCCL_FUNC4 selects the Ring flavour only, so each (coll, op, dtype) contributes
// exactly the two coll##RingLL entries produced by NCCL_FUNC5.
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
// Variant A: one entry group per data type (nine types, in the order listed).
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
// Variant B: all nine data-type slots point at the i8 kernel (used below for
// the collectives that pass the "copy" op).
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
// Four op slots, every one filled with the "sum" kernels.
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum)
// Four op slots, every one filled with the "copy" kernels.
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// Signature shared by every kernel entry: takes one ncclColl by value.
typedef void(*ncclKern_t)(struct ncclColl);
// Must be consistent with the ncclFuncSet enum
// Five collectives x 4 ops x 9 types x 2 entries, in the collective order below.
// NOTE(review): the index used to read this table comes from FUNC_INDEX(...) in
// computeColl(); confirm that its layout matches the "x 2" sizing used here.
static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
};
/*****************************************************************************/
/* Launch system : synchronization and CUDA kernel launch */
/*****************************************************************************/
// Launch one kernel per device for a group of communicators.
//   paramsList - array of numDevices launch descriptors, one per device
//   cudaDevs   - device ordinal for each entry of paramsList
//   numDevices - number of entries in both arrays
//   cgMode     - bit 0 set: use the single multi-device launch call;
//                otherwise launch each kernel individually
// Returns ncclSuccess, or whatever CUDACHECK returns for a failing HIP call.
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
if (cgMode & 0x01) {
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
// Launch flags argument: 0 (no flags) is passed here. The inherited comment
// described flags meant to reduce the latency of using this API.
// NOTE(review): confirm that passing 0 is intentional.
0));
return ncclSuccess;
}
// Fallback: launch on each device in turn, then restore the caller's device.
int savedDev;
CUDACHECK(hipGetDevice(&savedDev));
for (int i = 0; i < numDevices; i++) {
hipLaunchParams* params = paramsList+i;
// NOTE(review): if CUDACHECK returns early on failure (presumably it does),
// the current device is left switched and savedDev is not restored.
CUDACHECK(hipSetDevice(cudaDevs[i]));
// params->args is read as a pointer to a pointer to the ncclColl, which is
// passed to the kernel by value.
hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
}
CUDACHECK(hipSetDevice(savedDev));
return ncclSuccess;
}
// Finalise the launch descriptor for the collectives queued on `comm`.
// Clamps the grid size to the number of channels, tags the last queued
// operation of each launched channel, copies the first operation of channel 0
// into comm->args and selects the kernel function from ncclKerns.
// Always returns ncclSuccess.
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
// Never launch more blocks than there are channels.
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
// Set active = 2 for the last operation
for (int r=0; r<params->gridDim.x; r++) {
struct ncclChannel* channel = comm->channels+r;
// Index of the last queued op: (collStart + collCount - 1) wrapped to the FIFO size.
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
}
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
memcpy(&comm->args, coll, sizeof(struct ncclColl));
// As we pass that coll directly, we can free it immediately.
STORE(&coll->active, 0);
// funcIndex was computed when the collective was saved (see computeColl).
params->func = (void *)ncclKerns[coll->funcIndex];
return ncclSuccess;
}
// Arrive at the intra-process CPU barrier for the current phase.
// The counter lives at intraBarrier[intraPhase]. Every arrival except the final
// one atomically increments it and gets *isLast = 0. The final arrival
// (counter+1 == intraRanks) leaves the counter untouched, resets the counter of
// the other phase to 0 and gets *isLast = 1; it is expected to complete the
// barrier later through ncclCpuBarrierLast().
// Returns ncclInvalidUsage if the counter is already at or above intraRanks.
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
int val = LOAD(ptr);
bool done = false;
while (done == false) {
if (val >= comm->intraRanks) {
WARN("Trying to launch too many collectives");
return ncclInvalidUsage;
}
if (val+1 == comm->intraRanks) {
// Reset the barrier.
comm->intraBarrier[comm->intraPhase^1] = 0;
*isLast = 1;
return ncclSuccess;
}
// Try to publish our arrival. Whether or not the CAS succeeds, val is advanced
// by one below: on failure the loop retries with val+1 rather than re-reading
// the counter.
done = __sync_bool_compare_and_swap(ptr, val, val+1);
val++;
}
*isLast = 0;
return ncclSuccess;
}
// Final arrival at the intra-process CPU barrier: atomically bump the counter
// of the current phase by one. If the counter changed between the read and the
// compare-and-swap, report ncclInternalError; otherwise return ncclSuccess.
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
  volatile int* barrier = (volatile int*)(comm->intraBarrier + comm->intraPhase);
  const int observed = LOAD(barrier);
  const bool bumped = __sync_bool_compare_and_swap(barrier, observed, observed + 1);
  if (bumped) {
    return ncclSuccess;
  }
  WARN("Trying to launch too many collectives");
  return ncclInternalError;
}
// Leave the intra-process CPU barrier.
// Waits until the counter of the current phase reaches comm->intraRanks,
// yielding the CPU between polls, then flips comm->intraPhase so the next
// barrier uses the other counter. Always returns ncclSuccess.
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
  // sched_yield() is the POSIX call and is what saveKernel() in this file
  // already uses; pthread_yield() is a non-standard, deprecated alias for it.
  while (LOAD(ptr) < comm->intraRanks) sched_yield();
  comm->intraPhase ^= 1;
  return ncclSuccess;
}
// First half of a kernel launch: prepare the launch descriptor, choose the
// stream to launch on, and arrive at the intra-process CPU barrier. The last
// rank to arrive performs the group launch (GROUP mode) and completes the
// barrier. No-op for single-rank communicators.
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
if (comm->nRanks == 1) return ncclSuccess;
hipLaunchParams* params = comm->myParams;
NCCLCHECK(setupLaunch(comm, params));
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
// Enqueue event in user stream
CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
// Create dependency between user stream and internal NCCL stream
CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
params->stream = comm->groupStream;
} else {
// params->stream still holds the stream used by the previous launch here.
if (comm->userStream != params->stream) {
// Stream changed from last call, create dependency against last NCCL kernel launch
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
}
params->stream = comm->userStream;
}
int isLast = 0;
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
if (isLast) {
if (comm->launchMode == ncclComm::GROUP) {
// I'm the last. Launch all operations.
NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
}
// The last arrival did not increment the counter in ncclCpuBarrierIn; do it now.
NCCLCHECK(ncclCpuBarrierLast(comm));
}
return ncclSuccess;
}
// Second half of a kernel launch: wait for the CPU barrier to complete, launch
// this rank's kernel when in PARALLEL mode, reset the per-channel operation
// bookkeeping and start the network proxies. No-op for single-rank communicators.
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
if (comm->nRanks == 1) return ncclSuccess;
// We can't print the CG mode before the first barrier happened.
// Bit 0x10 of *intraCGMode acts as a "not yet printed" marker: rank 0 clears it
// (XOR while it is set) and logs the launch mode once.
if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
*comm->intraCGMode ^= 0x10;
INFO(NCCL_INIT,"Launch mode %s%s%s",
comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
*comm->intraCGMode ? "/CGMD" : "",
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
}
NCCLCHECK(ncclCpuBarrierOut(comm));
hipLaunchParams *params = comm->myParams;
if (comm->launchMode == ncclComm::PARALLEL) {
// PARALLEL mode: every rank launches its own kernel, passing the ncclColl by value.
hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
}
// Start the network proxies as soon as the kernel has been launched. We can't
// perform any CUDA call between the two or having a hipFree between the CUDA
// launch and the transportStartProxy call could cause a deadlock.
// Also, starting the proxies after the CUDA launch seems to be better for
// performance (latency).
for (int r=0; r<params->gridDim.x; r++) {
struct ncclChannel* channel = comm->channels+r;
// Everything queued so far has been launched: the next batch starts at the tail.
channel->collStart = channel->collFifoTail;
channel->collCount = 0;
}
// Reset the launch dimensions so the next batch accumulates them from zero.
params->gridDim.x = params->blockDim.x = 0;
NCCLCHECK(transportStartProxy(comm));
return ncclSuccess;
}
// Record completion of the just-launched kernel and re-synchronise streams.
// The done event is recorded on the stream the kernel was launched on; when the
// launch went through the internal group stream, the user stream is made to
// wait on that event. Finally the "user stream set" flag is cleared.
ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
  hipLaunchParams* launch = comm->myParams;

  // Mark the point right after the NCCL kernel.
  CUDACHECK(hipEventRecord(comm->doneEvent, launch->stream));

  // The internal stream is used for GROUP launches when it was requested or
  // when the user stream is NULL.
  const bool usedInternalStream =
      comm->launchMode == ncclComm::GROUP &&
      (comm->groupCudaStream || comm->userStream == NULL);
  if (usedInternalStream) {
    // Make the user stream depend on the internal stream's work.
    CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
  }

  comm->userStreamSet = false;
  return ncclSuccess;
}
/*****************************************************************************/
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/
// Choose the communication pattern for a collective and store it in
// info->pattern. AllReduce picks the tree pattern for sizes up to the
// communicator's treeThreshold and the double-ring pattern above it.
// Returns ncclInternalError (after a warning) for an unrecognised collective.
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
  switch (info->coll) {
    case ncclCollBroadcast:
      info->pattern = ncclPatternPipelineFrom;
      break;
    case ncclCollReduce:
      info->pattern = ncclPatternPipelineTo;
      break;
    case ncclCollAllGather:
    case ncclCollReduceScatter:
      info->pattern = ncclPatternRing;
      break;
    case ncclCollAllReduce:
      info->pattern = (info->nBytes <= info->comm->treeThreshold)
          ? ncclPatternTreeUpDown
          : ncclPatternRingTwice;
      break;
    default:
      WARN("Unknown collective %d", info->coll);
      return ncclInternalError;
  }
  return ncclSuccess;
}
// Derive the per-loop step and chunk counts from info->pattern:
//   tree / pipeline patterns : 1 step, 1 chunk
//   ring                     : nRanks-1 steps, nRanks chunks
//   ring twice               : 2*(nRanks-1) steps, nRanks chunks
// Returns ncclInternalError (after a warning) for an unrecognised pattern.
static ncclResult_t getLoopInfo(struct ncclInfo* info) {
  switch (info->pattern) {
    case ncclPatternTreeUp:
    case ncclPatternTreeDown:
    case ncclPatternTreeUpDown:
    case ncclPatternPipelineFrom:
    case ncclPatternPipelineTo:
      info->nchunksPerLoop = 1;
      info->nstepsPerLoop = info->nchunksPerLoop;
      return ncclSuccess;
    case ncclPatternRing:
      info->nstepsPerLoop = info->comm->nRanks-1;
      info->nchunksPerLoop = info->comm->nRanks;
      return ncclSuccess;
    case ncclPatternRingTwice:
      info->nstepsPerLoop = 2*(info->comm->nRanks-1);
      info->nchunksPerLoop = info->comm->nRanks;
      return ncclSuccess;
    default:
      break;
  }
  WARN("Unknown pattern %d\n", info->pattern);
  return ncclInternalError;
}
// Decide whether a collective runs in low-latency (LL) mode and with how many
// channels and threads.
//   info      - collective description; nBytes, nchunksPerLoop, pattern and the
//               communicator's thresholds are read
//   nChannels - out: number of channels to use
//   nThreads  - out: number of threads per block
//   llMode    - out: 1 when LL mode is selected, 0 otherwise
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
// Compute thresholds and limits that users can override
ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
// First compute nThreads
// Start at the minimum and double while each thread would still handle more than
// the per-thread threshold and the doubled count stays within the LL maximum.
int nt = NCCL_LL_MIN_NTHREADS;
while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
// Then compute nChannels
// Enough channels to keep each thread at the threshold, clamped to [1, comm->nChannels].
int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
if (nc == 0) nc = 1;
if (nc > info->comm->nChannels) nc = info->comm->nChannels;
// Check if we have a fixed LL threshold, otherwise compute it.
int perThreadThreshold = info->comm->threadThreshold;
// Patterns ordered at or after ncclPatternTreeUp get a 4x larger per-thread threshold.
if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
// A non-negative comm->llThreshold is taken as-is; a negative one means "compute it".
ssize_t llThreshold = info->comm->llThreshold >= 0 ?
info->comm->llThreshold :
nc*nt*info->nchunksPerLoop*perThreadThreshold;
if (info->nBytes <= llThreshold) {
// Small message: LL mode with the reduced channel/thread counts computed above.
*llMode = 1;
*nChannels = nc;
*nThreads = nt;
} else {
// Large message: regular mode using everything the communicator offers.
*llMode = 0;
*nChannels = info->comm->nChannels;
*nThreads = info->comm->nThreads;
}
}
// Translate a collective request into kernel arguments and proxy arguments.
//   info      - in: the request (pattern and loop fields are filled in here)
//   coll      - out: kernel-side arguments and the kernel table index
//   proxyArgs - out: step counts and mode flags for the network proxies
// Returns the error of getPatternInfo/getLoopInfo if either fails, else ncclSuccess.
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
// Plain copies of the request into the kernel argument block.
coll->args.root = info->root;
coll->args.N = info->count;
coll->args.ThisInput = info->sendbuff;
coll->args.ThisOutput = info->recvbuff;
coll->args.comm = info->comm->devComm;
coll->args.opCount = info->comm->opCount;
// Compute llMode, nChannels, nThreads
int llMode;
getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
// Tree mode is any pattern ordered at or after ncclPatternTreeUp.
int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
// Bytes per step: the LL buffer or channel 0's buffer, divided into NCCL_STEPS steps.
int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
// LL and tree modes always use one step per chunk and per slice.
int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
int chunkSize = stepSize*chunkSteps;
// Compute lastChunkSize
if (treeMode == 1 && llMode == 0) {
if (info->pattern == ncclPatternTreeUpDown) {
// Optimize chunkSize / nSteps
// Halve the chunk size while the number of chunks per channel is small relative
// to the tree depth, with progressively lower floors (128 KiB, 64 KiB, 32 KiB).
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (llMode == 1) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
// Bytes left over after all full loops, split across channels and chunks,
// aligned to nThreads*8 bytes and then converted to elements.
coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
}
// Compute nSteps for proxies
// In LL mode the byte count is doubled before computing the number of loops.
// NOTE(review): presumably because LL lines carry flags alongside data -- confirm.
size_t nBytes = llMode ? info->nBytes*2 : info->nBytes;
int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->sliceSteps = sliceSteps;
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->llMode = llMode;
proxyArgs->opCount = info->comm->opCount;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
nLoops, proxyArgs->nsteps, info->comm);
return ncclSuccess;
}
// Queue one collective on the communicator.
// Single-rank communicators are served with a device-to-device copy. Otherwise
// the kernel/proxy arguments are computed and one copy of the operation is
// appended to the FIFO of each channel it uses, together with its proxy ops.
// Returns ncclInvalidUsage when streams are mixed within a group or a channel
// FIFO is full.
static ncclResult_t saveKernel(struct ncclInfo* info) {
if (info->comm->nRanks == 1) {
// Nothing to communicate: copy only for out-of-place calls.
if (info->sendbuff != info->recvbuff)
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
return ncclSuccess;
}
struct ncclColl coll;
struct ncclProxyArgs proxyArgs;
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
// The block size of the eventual launch is the largest nThreads of any queued op.
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
// The first op of a batch fixes the user stream; later ops must use the same one.
if (info->comm->userStreamSet == false) {
info->comm->userStream = info->stream;
info->comm->userStreamSet = true;
} else if (info->stream != info->comm->userStream) {
WARN("Error : mixing different streams within a group call is not supported.");
return ncclInvalidUsage;
}
for (int bid=0; bid<coll.args.nChannels; bid++) {
// Channels are picked round-robin from the running gridDim.x counter.
struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
if (channel->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
return ncclInvalidUsage;
}
// Proxy
proxyArgs.channel = channel;
NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
info->comm->myParams->gridDim.x++;
int opIndex = channel->collFifoTail;
struct ncclColl* c = channel->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
// Wait until the slot's previous occupant has been released (active == 0).
while (LOAD(activePtr) != 0) sched_yield();
memcpy(c, &coll, sizeof(struct ncclColl));
c->args.bid = bid;
STORE(&c->active, 1);
// Advance the FIFO tail (with wrap-around) and link this op to the next slot.
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
channel->collFifoTail = opIndex;
channel->collCount++;
}
/*if (llMode == 0)*/ info->comm->opCount++;
return ncclSuccess;
}
// Common entry point for every collective API call.
// Validates the request and then either records it for a later group launch
// (async mode) or queues and launches it immediately.
// Returns ncclInvalidArgument for a NULL communicator, otherwise the first
// error reported by the checked calls, or ncclSuccess.
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
if (info->comm == NULL) return ncclInvalidArgument;
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
// Launch asynchronously if needed
if (ncclAsyncMode()) {
ncclResult_t ret = ncclSuccess;
// -1 means "device was not switched"; checked under the end: label.
int savedDev = -1;
if (info->comm->checkPointers) {
// Switch to the communicator's device for the argument checks.
CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
}
// Check arguments
NCCLCHECKGOTO(ArgsCheck(info), ret, end);
// Always register comm even in case of error to make sure ncclGroupEnd
// cleans it up.
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
NCCLCHECKGOTO(saveKernel(info), ret, end);
end:
// NOTE(review): if CUDACHECK returns early on failure (presumably it does),
// a failing hipSetDevice here skips ncclAsyncErrCheck(ret) -- confirm.
if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
ncclAsyncErrCheck(ret);
return ret;
} else {
// Synchronous path: check, queue, launch, then record completion events.
NCCLCHECK(ArgsCheck(info));
NCCLCHECK(saveKernel(info));
NCCLCHECK(ncclBarrierEnqueue(info->comm));
NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
NCCLCHECK(ncclEnqueueEvents(info->comm));
return ncclSuccess;
}
}
+54
Просмотреть файл
@@ -0,0 +1,54 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ALLOC_H_
#define NCCL_ALLOC_H_
#include "nccl.h"
#include "checks.h"
#include <sys/mman.h>
// Allocates `size` bytes of mapped host memory with hipHostMalloc, zero-fills
// it, and hands the same address back through both `ptr` and `devPtr`.
// Returns ncclSuccess, or whatever CUDACHECK returns if the allocation fails.
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
  void* const hostMem = *ptr;
  memset(hostMem, 0, size);
  // The device-side pointer is simply the host pointer.
  *devPtr = hostMem;
  return ncclSuccess;
}
// Releases host memory through hipHostFree.
// Returns ncclSuccess, or the error CUDACHECK returns when hipHostFree fails.
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(hipHostFree(ptr));
return ncclSuccess;
}
// Allocates a zero-initialized host array of `nelem` elements of type T.
//   ptr   - receives the new array on success; left untouched on failure.
//   nelem - number of T elements to allocate.
// Returns ncclSuccess, or ncclSystemError when the allocation fails.
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
  // calloc zero-fills the memory and reports failure (instead of silently
  // under-allocating) when nelem*sizeof(T) does not fit in size_t.
  T* p = static_cast<T*>(calloc(nelem, sizeof(T)));
  if (p == NULL) {
    // size_t must be printed with %zu; %ld is wrong where long and size_t differ.
    WARN("Failed to malloc %zu elements of %zu bytes", nelem, sizeof(T));
    return ncclSystemError;
  }
  *ptr = p;
  return ncclSuccess;
}
// Allocates a zero-filled device array of `nelem` elements of type T.
//   ptr         - receives the device pointer on success.
//   nelem       - number of T elements to allocate.
//   isFineGrain - when true, allocate with hipExtMallocWithFlags and
//                 hipDeviceMallocFinegrained instead of plain hipMalloc.
// Returns ncclSuccess, or ncclUnhandledCudaError when a HIP call fails.
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
  const size_t nbytes = nelem*sizeof(T);
  if (isFineGrain) {
    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nbytes, hipDeviceMallocFinegrained));
  } else {
    CUDACHECK(hipMalloc(ptr, nbytes));
  }
  // Zero the buffer. On failure release the fresh allocation so it is not
  // leaked, and clear *ptr so the caller is not left with a dangling pointer.
  hipError_t memsetErr = hipMemset(*ptr, 0, nbytes);
  if (memsetErr != hipSuccess) {
    WARN("Cuda failure '%s'", hipGetErrorString(memsetErr));
    (void)hipFree(*ptr);
    *ptr = NULL;
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Copies `nelem` elements of type T from `src` to `dst` with hipMemcpy.
// hipMemcpyDefault is passed as the copy kind, so the direction is not
// specified here. Returns ncclSuccess, or the error CUDACHECK returns when
// hipMemcpy fails.
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
return ncclSuccess;
}
#endif
+15
Просмотреть файл
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ARGCHECK_H_
#define NCCL_ARGCHECK_H_
#include "core.h"
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
ncclResult_t ArgsCheck(struct ncclInfo* info);
#endif
+4 -1
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,9 +9,12 @@
#include "nccl.h"
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapClose(void* commState);
#endif
+14
Просмотреть файл
@@ -0,0 +1,14 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CHANNEL_H_
#define NCCL_CHANNEL_H_
#include "core.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
#endif
+73
Просмотреть файл
@@ -0,0 +1,73 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CHECKS_H_
#define NCCL_CHECKS_H_
#include "debug.h"
// Check CUDA calls
// CUDACHECK(cmd): evaluates `cmd` once; if the result is not hipSuccess it
// logs the HIP error string through WARN and makes the ENCLOSING function
// return ncclUnhandledCudaError. Only usable where such a return is valid.
// The macro declares a local named `e`, so `cmd` must not refer to a caller
// variable with that name.
#define CUDACHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
// CUDACHECKGOTO(cmd, res, label): like CUDACHECK, but on failure stores
// ncclUnhandledCudaError into `res` and jumps to `label` instead of returning,
// so the caller can run cleanup code. `res` is left untouched on success.
#define CUDACHECKGOTO(cmd, res, label) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
// SYSCHECK(call, name): runs `call` through SYSCHECKVAL using a throwaway
// local `retval`; use it when the call's return value is not needed.
// `name` is spliced into string literals by the macros below, so it must be
// a string literal itself.
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
// SYSCHECKVAL(call, name, retval): runs `call` via SYSCHECKSYNC, leaving its
// result in the caller-provided lvalue `retval`. If the final result is -1,
// logs strerror(errno) through WARN and makes the enclosing function return
// ncclSystemError.
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
// SYSCHECKSYNC(call, name, retval): assigns `call` to `retval` and repeats
// for as long as it returns -1 with errno EINTR, EWOULDBLOCK or EAGAIN,
// logging each retry through INFO. Any other outcome leaves the loop with
// `retval` holding the last result. `call` is re-evaluated on every retry
// and there is no delay or retry limit.
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
// Propagate errors up
// NCCLCHECK(call): evaluates `call` once; if the result is not ncclSuccess it
// logs file, line and result through INFO and makes the enclosing function
// return that result. Declares a local named `res`, so `call` must not refer
// to a caller variable with that name.
// NOTE(review): the definition ends in `while (0);` - the macro already
// carries its own semicolon, so `NCCLCHECK(x);` expands to two statements and
// cannot sit directly in an unbraced if/else. Removing the semicolon would
// break any caller that omits it; audit call sites before changing.
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
// NCCLCHECKGOTO(call, res, label): stores the result of `call` in the
// caller-provided `res` (on success as well as on failure); if it is not
// ncclSuccess, logs file, line and result through INFO and jumps to `label`.
// NOTE(review): like NCCLCHECK, the definition ends in `while (0);`, so the
// expansion already contains a trailing semicolon.
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#endif
+117
Просмотреть файл
@@ -0,0 +1,117 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#define MAXCHANNELS 16
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define CACHE_LINE_SIZE 64
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
// Send-side control block. The outer union with pad3 makes the structure at
// least MEM_ALIGN bytes large; the pad1/pad2 arrays place each following
// field CACHE_LINE_SIZE bytes after the previous one.
struct ncclSendMem {
union {
struct {
uint64_t head;
// Fills the rest of the CACHE_LINE_SIZE-byte slot that starts at `head`.
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
// Fills the rest of the CACHE_LINE_SIZE-byte slot that starts at `ptrExchange`.
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t opCount;
};
// Forces the union (and therefore the struct) to span MEM_ALIGN bytes.
char pad3[MEM_ALIGN];
};
};
// Receive-side control block followed by the data buffers. The leading union
// is at least MEM_ALIGN bytes (pad4); llBuff and buff come after it.
struct ncclRecvMem {
union {
struct {
uint64_t tail;
// Fills the rest of the CACHE_LINE_SIZE-byte slot that starts at `tail`.
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
// Fills the rest of the CACHE_LINE_SIZE-byte slot that starts at `opCount`.
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
// One size entry per step (NCCL_STEPS entries).
int sizesFifo[NCCL_STEPS];
};
// Forces the control union to span MEM_ALIGN bytes.
char pad4[MEM_ALIGN];
};
// Low-latency buffer: NCCL_LL_BUFF_LINES lines of ncclLLFifoLine.
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
char buff[1]; // Actually larger than that
};
// Host-side communicator state: per-channel data, rank/device identity,
// launch bookkeeping, tuning thresholds, error/abort flags, intra-process
// synchronisation data and the proxy thread state.
struct ncclComm {
// Fixed-size channel array; nChannels below gives the number in use
// (presumably <= MAXCHANNELS - verify against the initialisation code).
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
// Opaque bootstrap state handle.
void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
// Stream of the pending operations; userStreamSet records whether
// userStream currently holds a valid value.
hipStream_t userStream;
bool userStreamSet;
hipEvent_t doneEvent;
// When true, enqueue paths switch device and validate user pointers.
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Channels for collectives
int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// Tree algorithm threshold
ssize_t treeThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
hipStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
// Error reported by GPU
volatile ncclDevError_t* fatalDevError;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
// Device side of the communicator
struct ncclDevComm *devComm;
// Host copy of the devComm (to free CUDA allocs)
struct ncclDevComm hostDevComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
hipLaunchParams * intraParams;
hipLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
struct ncclColl* argsptr;
// Global proxy thread
pthread_t proxyThread;
struct ncclProxyState proxyState;
};
#endif
-196
Просмотреть файл
@@ -1,196 +0,0 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_COLL_H_
#define COMMON_COLL_H_
#include "core.h"
#include "enqueue.h"
#include "collectives/collectives.h"
// Checks that `pointer` is known to the HIP runtime and, when it is device
// memory, that it lives on the communicator's device.
//   pointer - address to validate.
//   comm    - communicator whose cudaDev the pointer must match.
//   ptrname - argument name used in the warning text.
//   opname  - operation name used in the warning text.
// Returns ncclSuccess, or ncclInvalidArgument after logging a warning.
// Note the (ptrname, opname) order here is the reverse of PtrCheck's
// (opname, ptrname).
static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
hipPointerAttribute_t attr;
hipError_t err = hipPointerGetAttributes(&attr, pointer);
// Reject when the query fails or reports no device pointer.
if (err != hipSuccess || attr.devicePointer == NULL) {
WARN("%s : %s is not a valid pointer", opname, ptrname);
return ncclInvalidArgument;
}
// Both preprocessor branches open the same `if` block; they differ only in
// which attribute field (type vs. memoryType) holds the memory type.
#if CUDART_VERSION >= 10000
if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
#else
if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
#endif
WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
return ncclInvalidArgument;
}
return ncclSuccess;
}
// Rejects a NULL argument.
//   ptr     - pointer to test.
//   opname  - operation name used in the warning text.
//   ptrname - argument name used in the warning text.
// Returns ncclSuccess for a non-NULL pointer; otherwise logs a warning and
// returns ncclInvalidArgument.
static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
  if (ptr != NULL) return ncclSuccess;
  WARN("%s : %s argument is NULL", opname, ptrname);
  return ncclInvalidArgument;
}
// Validates the arguments of a collective call.
// Checks, in order: comm is non-NULL, root lies in [0, nRanks), the data type
// and the reduction op are within their enum ranges, and - only when
// comm->checkPointers is set - that the buffers pass PointerCheck.
// sendbuff is not checked for "Broadcast" on non-root ranks; recvbuff is not
// checked for "Reduce" on non-root ranks.
// Returns ncclSuccess, or the first failure (ncclInvalidArgument for the
// range checks, otherwise whatever the failing helper returned).
static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
  NCCLCHECK(PtrCheck(comm, opname, "comm"));

  // Cheap range checks first.
  const bool rootInRange = (root >= 0) && (root < comm->nRanks);
  if (!rootInRange) {
    WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
    return ncclInvalidArgument;
  }
  const bool typeInRange = !(type < 0 || type >= ncclNumTypes);
  if (!typeInRange) {
    WARN("%s : invalid type %d", opname, type);
    return ncclInvalidArgument;
  }
  const bool opInRange = !(op < 0 || op >= ncclNumOps);
  if (!opInRange) {
    WARN("%s : invalid reduction operation %d", opname, op);
    return ncclInvalidArgument;
  }

  // Device pointer validation is optional.
  if (!comm->checkPointers) return ncclSuccess;

  const bool isRoot = (comm->rank == root);
  if (isRoot || strcmp(opname, "Broadcast") != 0) {
    NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
  }
  if (isRoot || strcmp(opname, "Reduce") != 0) {
    NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
  }
  return ncclSuccess;
}
// Returns the size in bytes of one element of `type`:
// 1 for the 8-bit integer types, 2 for ncclFloat16, 4 for the 32-bit types,
// 8 for the 64-bit types, and -1 for any other value.
static __inline__ int ncclTypeSize(ncclDataType_t type) {
  int bytes = -1;  // result for unrecognised types
  switch (type) {
    case ncclInt8:
    case ncclUint8:
      bytes = 1;
      break;
    case ncclFloat16:
      bytes = 2;
      break;
    case ncclInt32:
    case ncclUint32:
    case ncclFloat32:
      bytes = 4;
      break;
    case ncclInt64:
    case ncclUint64:
    case ncclFloat64:
      bytes = 8;
      break;
    default:
      break;
  }
  return bytes;
}
// In : comm, nbytes ; Out : nrings, nthreads, ll
// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
// This ensures we don't use a large number of rings with a small number of threads
// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
// we use NCCL_THREAD_THRESHOLD when we reach the max
// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
// NOTE(review): the starting thread count is NCCL_LL_MIN_NTHREADS, not a
// literal 64 as the text above says - confirm the value of that macro.
//   comm     - communicator supplying nThreads, nRings, nRanks and thresholds.
//   nbytes   - payload size in bytes.
//   nrings   - out: number of rings to use.
//   nthreads - out: number of threads to use.
//   ll       - out: 1 to use the low-latency path, 0 otherwise.
static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
*ll = 0;
int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
if (comm->llThreshold >= 0) { /* user sets total LL threshold */
if (nbytes > comm->llThreshold) { /* non-LL */
*nthreads = comm->nThreads;
*nrings = comm->nRings;
return;
} else {
llEnforced = 1; /* user wants to use LL */
}
}
int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
// `nr` is first assigned inside the loop below; if the loop body never runs
// it is only read after the assignment in the fallback section.
size_t nr;
int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
// Doubles nt until LL fits or nt reaches ll_max_nthreads. The loop does not
// execute at all when NCCL_LL_MIN_NTHREADS >= ll_max_nthreads.
while (nt < ll_max_nthreads && *ll == 0) {
nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
if (nr <= maxRings) { /* avoid using few threads but many rings */
// Clamp nr into [1, comm->nRings].
nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
*ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
}
if (*ll == 0) {
nt = nt << 1;
}
}
if (*ll == 1) {
*nthreads = nt;
*nrings = (int)nr;
return; /* we can use smaller number of threads to make LL work, stop here */
}
nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
// If the size still exceeds the per-thread budget, LL is used only when the
// user's total threshold enforced it (llEnforced).
*ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
*nthreads = *ll ? ll_max_nthreads : comm->nThreads;
*nrings = *ll ? (int)nr : comm->nRings;
}
// Records one collective in the per-ring operation FIFOs of `comm` so that it
// can be launched later.
//   coll, op, dtype - combined into the function index via FUNC_INDEX.
//   sendbuff, recvbuff, count, root - copied into the per-block arguments.
//   stream     - must equal the stream of any operation already pending on
//                this communicator.
//   nbytes     - payload size, used to choose rings/threads/LL mode.
//   loopFactor - multiplier used when sizing the last LL chunk.
// Returns ncclSuccess, or ncclInvalidUsage when streams are mixed or a ring
// already holds NCCL_MAX_OPS operations.
static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream, size_t nbytes, int loopFactor) {
int llMode, nBlocks, nThreads;
ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
// The launch block size is the maximum over all operations queued so far.
comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
// The first queued operation fixes the stream; later ones must match it.
if (comm->userStreamSet == false) {
comm->userStream = stream;
comm->userStreamSet = true;
} else if (stream != comm->userStream) {
WARN("Error : mixing different streams within a group call is not supported.");
return ncclInvalidUsage;
}
int lastChunkSize = 0;
// lastChunkSize is only computed for the low-latency path; it stays 0 otherwise.
if (llMode == 1) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
// Remainder of `count` after whole loops, split across nBlocks*loopFactor.
lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
}
for (int bid=0; bid<nBlocks; bid++) {
// Rings are picked round-robin using the running grid size.
struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
if (ring->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
return ncclInvalidUsage;
}
comm->myParams->gridDim.x++;
int opIndex = ring->collFifoTail;
struct ncclColl* c = ring->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
// Wait, yielding the CPU, until the slot's active flag reads 0.
while (LOAD(activePtr) != 0) sched_yield();
struct CollectiveArgs* args = &c->args;
args->root = root;
args->N = count;
args->ThisInput = sendbuff;
args->ThisOutput = recvbuff;
args->comm = comm->devComm;
args->opCount = comm->opCount;
args->bid = bid;
args->nRings = nBlocks;
args->nThreads = nThreads;
args->lastChunkSize = lastChunkSize;
c->nThreads = nThreads;
c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
// Mark the slot active only after its arguments have been written.
// NOTE(review): nextIndex is written after the active flag is set - confirm
// no reader depends on nextIndex as soon as it observes active == 1.
STORE(&c->active, 1);
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
ring->collFifoTail = opIndex;
ring->collCount++;
}
/*if (llMode == 0)*/ comm->opCount++;
return ncclSuccess;
}
extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
#endif
+29 -347
Просмотреть файл
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,313 +7,20 @@
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
#define NCCL_MAX_OPS 2048
#include <pthread.h>
#include <algorithm>
#include "nccl.h"
#include "transport.h"
#include "debug.h"
#include "checks.h"
#include "alloc.h"
#include "transport.h"
#include "devcomm.h"
#include "comm.h"
#include "info.h"
#include "argcheck.h"
#include <cstdio>
#include <algorithm> // std::min/std::max
#include <unistd.h>
#include <stdlib.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#define MAXRINGS 16
#define MAXTHREADS 256
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
// Rings / LL tuning
#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL for Volta and above
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MAX_NTHREADS 256
#define NCCL_LL_MIN_NTHREADS 256
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCount; // Local for recv, remote for send
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
// Low latency mechanism
char *llBuff; // Local for recv, remote for send
uint64_t *llHead; // Local for send, remote for recv
int *llFifo; // LL Size fifo for proxy
uint64_t llStep; // Keep where we are
uint64_t llLastCleaning;
};
struct ncclConnector {
struct transportProxyInfo* proxyInfo;
struct ncclTransport* transport;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
};
#define CACHE_LINE_SIZE 64
#define MEM_ALIGN 4096
#define SIZES_FIFO_SIZE 16
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
#define NCCL_LL_CHUNKS 8
#define NUM_LINES_PER_THREAD 8
#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K
#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
#define NCCL_LL_CLEAN_FREQ 0x10000000
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t llHead;
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[SIZES_FIFO_SIZE];
int llSizesFifo[SIZES_FIFO_SIZE];
};
char pad5[MEM_ALIGN];
};
char llBuff[NCCL_LL_BUFF_SIZE];
char buff[1]; // Actually larger than that
};
struct ncclRing {
union {
struct {
int id;
int nthreads;
// Per ring resources
struct ncclSendMem* devMemSend; // CUDA-size resources
struct ncclRecvMem* devMemRecv; // CUDA-size resources
int buffSize;
int devMemSendSize; // Keep the size for IPCs
int devMemRecvSize; // Keep the size for IPCs
struct ncclConnector send;
struct ncclConnector recv;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
#pragma pack(push) /* push current alignment to stack */
#pragma pack(4) /* set alignment to 4 bytes boundary */
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nRings;
uint16_t nThreads;
int lastChunkSize;
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t nThreads;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
#pragma pack(pop) /* restore original alignment from stack */
struct ncclComm {
struct ncclRing rings[MAXRINGS];
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
enum { GROUP, PARALLEL } launchMode;
hipStream_t userStream;
bool userStreamSet;
hipEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Rings for collectives
int nRings;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
hipStream_t groupStream;
// Device copy of the communicator
struct ncclComm *devComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
hipLaunchParams* intraParams;
hipLaunchParams* myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
struct ncclColl* argsptr;
};
// Convert volatile access to atomic
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
#else
#define LOAD(VAR) *(VAR)
#define STORE(DST, SRC) *(DST) = (SRC)
#endif
// Check CUDA calls
#define CUDACHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
@@ -333,51 +39,27 @@ struct ncclComm {
#endif // end PROFAPI
int ncclCudaCompCap();
ncclResult_t ncclNvlinkGpu(int* nvlink);
int64_t ncclTreeThreshold();
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(hipHostFree(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
return 1;
case ncclFloat16:
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
if (isFineGrain) {
hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained);
if (e != hipSuccess) {
*ptr = 0;
return ncclInvalidUsage;
}
}
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
return ncclSuccess;
}
#endif // end include guard
+61
Просмотреть файл
@@ -0,0 +1,61 @@
/*************************************************************************
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CPUSET_H_
#define NCCL_CPUSET_H_
// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
// Converts one hexadecimal digit character to its value.
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'.
// Returns the digit value in [0, 15], or -1 for any other character.
static int hexToInt(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
  // Upper-case digits were previously rejected; accept them as well.
  if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
  return -1;
}
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
// Parses a comma-separated list of hexadecimal 32-bit groups (most
// significant group first, e.g. "0003ff,f0003fff") into `mask`.
//   str  - NUL-terminated input; parsing stops at the first character that is
//          neither a hex digit nor a comma.
//   mask - destination; only the words covered by the parsed groups are
//          written, starting at word 0 with the last group in the string.
//          Remaining words are left as they were.
// Returns ncclSuccess, or ncclInvalidArgument when the string holds more
// groups than a cpu_set_t has 32-bit words.
ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
  uint32_t cpumasks[CPU_SET_N_U32];
  // Fill the scratch array from the top down so the last group parsed ends up
  // at the lowest used index.
  int m = CPU_SET_N_U32-1;
  cpumasks[m] = 0;
  // Walk to the terminator directly instead of calling strlen every iteration.
  for (const char* p = str; *p != '\0'; p++) {
    if (*p == ',') {
      // Too many groups would index before the start of cpumasks.
      if (m == 0) return ncclInvalidArgument;
      m--;
      cpumasks[m] = 0;
    } else {
      int v = hexToInt(*p);
      if (v == -1) break;
      cpumasks[m] <<= 4;
      cpumasks[m] += v;
    }
  }
  // Copy cpumasks to mask
  for (int a=0; m<(int)CPU_SET_N_U32; a++,m++) {
    memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
  }
  return ncclSuccess;
}
// Formats `mask` as hexadecimal text in `str`, walking its bytes from the
// highest index down to 0. Leading zero bytes are skipped, every byte is
// written as two lower-case hex digits, and a ',' is emitted after each byte
// whose index is a non-zero multiple of 4. An all-zero mask yields "".
// `str` must be large enough for the result; no bound is checked here.
// Always returns ncclSuccess.
ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
  uint8_t* bytes = (uint8_t*)mask;
  int len = 0;  // characters written so far
  int idx = sizeof(cpu_set_t);
  while (idx-- > 0) {
    const bool nothingWrittenYet = (len == 0);
    if (nothingWrittenYet && bytes[idx] == 0) continue;
    sprintf(str+len, "%02x", bytes[idx]);
    len += 2;
    const bool groupBoundary = (idx != 0) && (idx%4 == 0);
    if (groupBoundary) {
      sprintf(str+len, ",");
      len += 1;
    }
  }
  str[len] = '\0';
  return ncclSuccess;
}
#endif
+4 -4
Просмотреть файл
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -25,7 +24,8 @@ extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen);
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
@@ -108,7 +108,7 @@ static inline void initDebug() {
break;
case 'h': // %h = hostname
char hostname[1024];
getHostName(hostname, 1024);
getHostName(hostname, 1024, '.');
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
+259
Просмотреть файл
@@ -0,0 +1,259 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_H_
#define NCCL_DEVICE_H_
#include "nccl.h"
#include <stdint.h>
// Convert volatile access to atomic
// LOAD(VAR) reads through the pointer VAR and STORE(DST, SRC) writes SRC
// through the pointer DST. When any of the HIP/HCC macros below is defined
// they use the __atomic builtins with sequentially-consistent ordering;
// otherwise they are plain dereferences.
// Note: the fallback expansions are not wrapped in parentheses.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
#else
#define LOAD(VAR) *(VAR)
#define STORE(DST, SRC) *(DST) = (SRC)
#endif
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
// Collective operation identifiers. ncclCollCount is the last enumerator, so
// its value equals the number of collectives listed before it.
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
// DIVUP(x, y): x divided by y using integer division, rounded up.
// Both arguments are evaluated more than once.
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
// ROUNDUP(x, y): x rounded up to the next multiple of y.
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
// ALIGN_SIZE(size, align): rounds `size` up to a multiple of `align` IN PLACE,
// so `size` must be a modifiable lvalue. The expansion already ends with ';'
// and `size` is not parenthesised.
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
// One line of the low-latency FIFO. The union gives three views of the same
// storage: interleaved data/flag words, two 64-bit values, and an int4.
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
// v[0] overlays data1/flag1 and v[1] overlays data2/flag2.
uint64_t v[2];
// Whole-line view (presumably 16 bytes - confirm the int4 definition).
int4 i4;
};
// Sizing of the low-latency (LL) buffer.
#define MAXTHREADS 256
// The LL thread cap is defined as the same value as MAXTHREADS.
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NUM_LINES_PER_THREAD 8
// Lines in one slice: NUM_LINES_PER_THREAD for each of NCCL_LL_MAX_NTHREADS threads.
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
// The buffer holds NCCL_STEPS slices.
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
// Buffer size in bytes.
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
// With DEBUG_LL defined, the clean mask is much smaller and NCCL_LL_FLAG
// wraps its argument modulo NCCL_LL_FLAG_MAX; otherwise NCCL_LL_FLAG is a
// plain conversion to uint32_t.
#ifdef DEBUG_LL
#define NCCL_LL_CLEAN_MASK 0x00000ff8
#define NCCL_LL_FLAG_MAX 0x00001000
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
#else
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least NCCL_STEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
// Per-connection pointers and counters. "Local"/"remote" in the field
// comments say on which side of the connection the memory lives, for the
// receive and send directions respectively.
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCountLoc; // opCount of local rank
uint64_t *opCountRem; // opCount of remote rank
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
uint64_t step; // Keep where we are
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
// One endpoint (send or recv) of a connection to a peer: the transport that
// implements it, that transport's host-side resources, and the connection
// info (struct ncclConnInfo) embedded by value.
struct ncclConnector {
int connected; // NOTE(review): presumably non-zero once the connection is set up - confirm
struct ncclProxyArgs *proxyAppend; // Declared elsewhere; usage is not visible in this file
struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
struct ncclComm *comm; // Communicator this connector is associated with
};
// Ring topology as seen from the current rank: the two neighbours plus the
// rank ordering of the whole ring.
struct ncclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
// NOTE(review): presumably the device-accessible copy of userRanks - confirm.
int* devUserRanks;
};
// Maximum number of children a rank can have in the tree.
#define NCCL_MAX_TREE_ARITY 3
// Tree topology as seen from the current rank: one parent link (up) and up
// to NCCL_MAX_TREE_ARITY child links (down).
// NOTE(review): the encoding of "no parent" / "no child" is not visible here.
struct ncclTree {
int depth;
int up;
int down[NCCL_MAX_TREE_ARITY];
};
// The pair of connectors used to talk to one peer: one for sending to it and
// one for receiving from it.
struct ncclPeer {
struct ncclConnector send;
struct ncclConnector recv;
};
struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(4) /* set alignment to 4 bytes boundary */
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
// Arguments of one collective operation. This struct is embedded in
// struct ncclColl and is declared under #pragma pack(4); adding or resizing a
// field changes sizeof(struct ncclColl), which is checked by a static_assert.
struct CollectiveArgs {
struct ncclDevComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
uint16_t nThreads;
int lastChunkSize;
};
// One queued collective: its arguments plus bookkeeping fields, overlaid on
// an array of 0x10 ints so that the element has a fixed, power-of-two size.
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
// Padding view: fixes the size of the union at 0x10 ints.
int data[0x10];
};
};
// Fails to compile if the named fields outgrow the 0x10-int padding array.
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
// Per-channel state: topology (ring and tree), peers, and the list of queued
// collectives. The named fields are overlaid on an array of 0x80 ints so the
// struct has a fixed, power-of-two size.
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree tree;
int id;
int nthreads;
int buffSize;
// Communication structures
struct ncclPeer* peers;
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
uint32_t* abortCount;
};
// Padding view: fixes the size of the union at 0x80 ints.
int data[0x80];
};
};
// Fails to compile if the named fields outgrow the 0x80-int padding array.
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
#pragma pack(pop) /* restore original alignment from stack */
#define MAXCHANNELS 16
#ifdef ENABLE_PROFILING
// Profiling counters, compiled in only when ENABLE_PROFILING is defined.
// For each primitive there is one cycle counter and one byte counter; the
// wait counters are arrays with one entry per channel (MAXCHANNELS entries).
// The counters are overlaid on an array of 0x80 ints.
struct ncclProf {
union {
struct {
uint64_t total_cycle;
uint64_t wait_send_cycle[MAXCHANNELS];
uint64_t wait_recv_cycle[MAXCHANNELS];
// primitive cycles
uint64_t send_cycle;
uint64_t directSend_cycle;
uint64_t recv_cycle;
uint64_t directRecv_cycle;
uint64_t copySend_cycle;
uint64_t directCopySend_cycle;
uint64_t recvCopySend_cycle;
uint64_t directRecvCopySend_cycle;
uint64_t recvReduceCopy_cycle;
uint64_t recvReduceSend_cycle;
uint64_t recvReduceCopySend_cycle;
uint64_t directRecvReduceCopySend_cycle;
// primitive bytes
uint64_t send_byte;
uint64_t directSend_byte;
uint64_t recv_byte;
uint64_t directRecv_byte;
uint64_t copySend_byte;
uint64_t directCopySend_byte;
uint64_t recvCopySend_byte;
uint64_t directRecvCopySend_byte;
uint64_t recvReduceCopy_byte;
uint64_t recvReduceSend_byte;
uint64_t recvReduceCopySend_byte;
uint64_t directRecvReduceCopySend_byte;
};
// Padding view of 0x80 ints.
int data[0x80];
};
};
#endif
// Device-side error status. Enumerators are numbered consecutively from
// zero, with ncclDevSuccess being zero.
typedef enum {
  ncclDevSuccess           = 0,
  ncclDevAssertedMismatch  = 1,
  ncclDevSuspectedMismatch = 2
} ncclDevError_t;
// Communicator state referenced by collective arguments
// (CollectiveArgs::comm points to this type).
struct ncclDevComm {
int rank;
int nRanks;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
// Location holding a device error status (see ncclDevError_t).
volatile ncclDevError_t *fatalDevError;
// Channels, device side
struct ncclChannel* channels;
#ifdef ENABLE_PROFILING
// Profiling counters
struct ncclProf* devProf;
#endif
};
#endif
+8 -6
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -11,12 +11,14 @@
#include "core.h"
#include "group.h"
typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_THREAD_THRESHOLD_VEGA 8 // Per thread size before we switch to non-LL for VEGA
#define NCCL_LL_MIN_NTHREADS 256
ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
+1 -1
Просмотреть файл
@@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+45
Просмотреть файл
@@ -0,0 +1,45 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INFO_H_
#define NCCL_INFO_H_
#include "nccl.h"
// Communication pattern of a collective. Enumerators are numbered
// consecutively from zero in declaration order.
typedef enum {
  ncclPatternRing         = 0,
  ncclPatternRingTwice    = 1,
  ncclPatternPipelineFrom = 2,
  ncclPatternPipelineTo   = 3,
  ncclPatternTreeUp       = 4,
  ncclPatternTreeDown     = 5,
  ncclPatternTreeUpDown   = 6
} ncclPattern_t;
// Used to pass NCCL call information between functions
// The fields are grouped as: which collective is requested, the arguments
// of the call, algorithm parameters, and values that are filled in later.
struct ncclInfo {
ncclColl_t coll;
const char* opName;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
int root;
ncclComm_t comm;
hipStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclPattern_t pattern;
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
};
#endif
+46 -3
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -58,8 +58,51 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v1_t;
typedef ncclNet_v1_t ncclNet_t;
// Version 2 of the network plugin interface: a table of function pointers
// implemented by a network transport. Compared with the v1 table, memory is
// registered explicitly (regMr/deregMr) and isend/irecv/flush take the
// resulting memory handle (mhandle).
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Return the device path in /sys. NCCL will call free on this path.
ncclResult_t (*pciPath)(int dev, char** path);
// Return whether this device supports host pointers and/or CUDA pointers
// as data from the current GPU. Supported types should be composed with
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v2_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
typedef ncclNet_v2_t ncclNet_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
#endif // end include guard
+6 -10
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -13,11 +13,6 @@
extern ncclNet_t* ncclNet;
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
/* Socket Interface Selection type */
typedef enum { findSubnetIf = -1,
dontCareIf = -2
} ncclSocketIfSl_t;
// Translation to external API
// Name of the active network, read from the ncclNet table.
static const char* ncclNetName() {
  return ncclNet->name;
}

// Forwards to ncclNet->devices(); the call is wrapped in NCCLCHECK and
// ncclSuccess is returned when execution continues past it.
static ncclResult_t ncclNetDevices(int* ndev) {
  NCCLCHECK(ncclNet->devices(ndev));
  return ncclSuccess;
}
@@ -26,15 +21,16 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
// Thin forwarding wrappers around the function table of the active network
// (ncclNet). Each one wraps the corresponding table entry in NCCLCHECK and
// returns ncclSuccess when execution continues past the check.

// Forwards to ncclNet->regMr().
static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) {
  NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle));
  return ncclSuccess;
}

// Forwards to ncclNet->deregMr().
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) {
  NCCLCHECK(ncclNet->deregMr(comm, mhandle));
  return ncclSuccess;
}

// Forwards to ncclNet->isend().
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
  NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request));
  return ncclSuccess;
}

// Forwards to ncclNet->irecv().
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
  NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request));
  return ncclSuccess;
}

// Forwards to ncclNet->flush().
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) {
  NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle));
  return ncclSuccess;
}

// Forwards to ncclNet->test().
static ncclResult_t ncclNetTest(void* request, int* done, int* size) {
  NCCLCHECK(ncclNet->test(request, done, size));
  return ncclSuccess;
}

// Forwards to ncclNet->closeSend().
static ncclResult_t ncclNetCloseSend(void* sendComm) {
  NCCLCHECK(ncclNet->closeSend(sendComm));
  return ncclSuccess;
}

// Forwards to ncclNet->closeRecv().
static ncclResult_t ncclNetCloseRecv(void* recvComm) {
  NCCLCHECK(ncclNet->closeRecv(recvComm));
  return ncclSuccess;
}

// Forwards to ncclNet->closeListen().
static ncclResult_t ncclNetCloseListen(void* listenComm) {
  NCCLCHECK(ncclNet->closeListen(listenComm));
  return ncclSuccess;
}
extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
+35 -57
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -19,6 +19,7 @@
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
@@ -26,7 +27,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
char* rPath = realpath(classPath, NULL);
int fd;
SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
if ((fd = open(rPath, O_RDONLY)) == -1) {
// Could not find device. It might be because we're in a VM and
// we don't see the whole machine. This is handled silently so
// we don't want to print an INFO error.
TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
return ncclSystemError;
}
free(rPath);
char pciClass[9];
strncpy(pciClass, "0x000000", 9);
@@ -36,6 +43,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
if (strcmp(pciClass, "0x068000") == 0) {
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
*type = ncclNvLinkDeviceSwitch;
} else if (strcmp(pciClass, "0x068001") == 0) {
// PCI device is of type "Bridge: IBM Device 04ea"
*type = ncclNvLinkDeviceBridge;
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
*type = ncclNvLinkDeviceGpu;
@@ -49,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
/* Get the maximum number of NVLinks based on the GPU generation */
static ncclResult_t getMaxNvlinks(int* maxLinks) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
int ccMajor;
CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
// 6 for Volta, 4 for Pascal
*maxLinks = (ccMajor > 6) ? 6 : 4;
// INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
@@ -68,18 +78,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
if (res != ncclSuccess) return 0;
for(int l=0; l<maxNvLinks; ++l) {
// nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
// report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
// don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
// the POWER CPU case, so it seems best to check this as well.
// Check whether we can use this NVLink for P2P
unsigned canP2P;
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
// nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
// if the links don't exist, or are disabled. So checking for that return
// here would probably make the nvmlDeviceGetNvLinkCapability check above
// redundant. Presumably, we still need to check the P2P capability above,
// since even non-GPUs would possess PCI info.
// Make sure the Nvlink is up. The previous call should have trained the link.
nvmlEnableState_t isActive;
if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
// Try to figure out what's on the other side of the NVLink
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
@@ -90,7 +97,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
p[c] = toupper(p[c]);
}
if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
links++;
} else {
// Make a lower case copy of the bus ID for calling ncclDeviceType
@@ -102,11 +109,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
lowerId[c] = tolower(p[c]);
}
// Determine if the remote side is NVswitch
// Determine if the remote side is NVswitch or a GPU
enum ncclNvLinkDeviceType type;
if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
//TODO: we are making an assumption that all GPUs are connected to this switch
//This assumption may change for future architectures
ncclResult_t ret = ncclDeviceType(lowerId, &type);
if (ret == ncclSuccess) {
if (type == ncclNvLinkDeviceSwitch) {
//TODO: we are making an assumption that all GPUs are connected to this switch
//This assumption may change for future architectures
nvswitch_links++;
} else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
links++;
}
} else {
// The NVLink is up but we couldn't find the PCI device on the other
// side. Assume it's an NVswitch outside a VM.
if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
nvswitch_links++;
}
}
@@ -114,43 +131,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
}
/* Score the NVLink connectivity of the GPU identified by `busId`.
 *
 * Every link index below the per-generation maximum is probed; a link is
 * counted when it reports P2P capability and is in the enabled state. For
 * each counted link the remote PCI device is looked up, and links whose
 * remote side is classified as a switch by ncclDeviceType() are counted
 * separately.
 *
 * Returns CONNECT_NVSWITCH * (number of switch links) when at least one
 * switch link was found, otherwise CONNECT_NVLINK * (number of counted
 * links). Returns 0 when no NVML handle can be obtained for `busId`.
 */
static int getNumNvlinks(const char* busId) {
  nvmlDevice_t nvmlDev;
  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
  if (res != ncclSuccess) return 0;

  int nvlinks = 0, nvswitch_links = 0;
  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
  for(int l=0; l<maxNvLinks; ++l) {
    unsigned canP2P;
    nvmlEnableState_t isActive;
    if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
        wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
      nvlinks++;
    } else {
      continue;
    }

    nvmlPciInfo_t remoteProc;
    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;

    // Make a lower case copy of the bus ID for calling ncclDeviceType
    // PCI system path is in lower case
    char* p = remoteProc.busId;
    // Zero-initialized, and the copy stops one short of the buffer size, so
    // lowerId is always a NUL-terminated string even though the loop exits
    // before copying the source terminator.
    char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE] = {0};
    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE-1; c++) {
      if (p[c] == 0) break;
      // tolower() requires a value representable as unsigned char (or EOF).
      lowerId[c] = (char)tolower((unsigned char)p[c]);
    }

    // Determine if the remote side is NVswitch
    enum ncclNvLinkDeviceType type;
    if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
      //TODO: we are making an assumption that all GPUs are connected to this switch
      //This assumption may change for future architectures
      nvswitch_links++;
    }
  }
  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
}
#endif
+13 -3
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -8,13 +8,23 @@
#ifndef NCCL_NVLINK_H_
#define NCCL_NVLINK_H_
#include <sys/stat.h>
#include <fcntl.h>
#include "nvmlwrap.h"
#include "topo.h"
#define CONNECT_NVLINK 0x10
#define CONNECT_NVSWITCH 0x100
static int getNumNvlinks(const char* busId) {
return 0;
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
static int getNvlinkGpu(const char* busId1, const char* busId2) {
int links = 0;
return CONNECT_NVLINK*links;
}
#endif
+8 -12
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +7,7 @@
#ifndef NCCL_NVMLWRAP_H_
#define NCCL_NVMLWRAP_H_
#include "core.h"
#include "nccl.h"
//#define NVML_DIRECT 1
#ifdef NVML_DIRECT
@@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
@@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
// Direct-call wrapper for nvmlDeviceGetMinorNumber(): the result is written
// to *minorNumber. The call is wrapped in NVMLCHECK (defined elsewhere;
// presumably it returns an error code on NVML failure - confirm), and
// ncclSuccess is returned when execution continues past it.
static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
return ncclSuccess;
}
#else
// Dynamically handle dependencies on NVML
@@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void);
ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
#endif // NVML_DIRECT
#endif // End include guard
+1 -2
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
s++;
strncpy(envValue, line+s, 1024);
setenv(envVar, envValue, 0);
char *str = getenv(envVar);
}
if (line) free(line);
fclose(file);
-14
Просмотреть файл
@@ -1,14 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_RING_H_
#define NCCL_RING_H_
#include "core.h"
ncclResult_t initRing(struct ncclComm* comm, int ringid);
ncclResult_t freeRing(struct ncclRing* ring);
#endif
+4 -5
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -9,14 +9,13 @@
#define NCCL_RINGS_H_
static int getDefaultThreads() {
// On Kepler, rings are doubled later.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
return 256;
#else
#else // On Kepler, rings are doubled later.
return ncclCudaCompCap() == 3 ? 128 : 256;
#endif
}
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
#endif
+1 -1
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
+35 -19
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,8 +18,9 @@
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // sleep interval in usec
#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
return buf;
}
static inline short socketToPort(struct sockaddr *saddr) {
static inline uint16_t socketToPort(struct sockaddr *saddr) {
return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port);
}
@@ -60,9 +61,12 @@ static inline int envSocketFamily(void) {
}
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
#ifdef ENABLE_TRACE
char line[1024];
#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
bool searchExact = prefixList && prefixList[0] == '=';
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
int found = 0;
@@ -89,7 +93,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
}
// check against user specified interfaces
if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
continue;
}
@@ -106,7 +110,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
// Store the IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(addrs+found, interface->ifa_addr, salen);
INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
found++;
}
}
@@ -159,7 +162,10 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
}
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
char line[1024], line_a[1024];
#ifdef ENABLE_TRACE
char line[1024];
#endif
char line_a[1024];
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
@@ -183,7 +189,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
// Store the interface name
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
found++;
if (found == maxIfs) break;
}
@@ -336,8 +342,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif
/* Put the socket in listen mode */
SYSCHECK(listen(sockfd, 128), "listen");
/* Put the socket in listen mode
* NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
*/
SYSCHECK(listen(sockfd, 16384), "listen");
*fd = sockfd;
return ncclSuccess;
}
@@ -367,14 +375,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#endif
int ret;
int retries = 0;
int timedout_retries = 0;
int refused_retries = 0;
retry:
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
if (ret == 0) return ncclSuccess;
if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
usleep(SLEEP_INT);
goto retry;
if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
usleep(SLEEP_INT);
goto retry;
}
}
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
return ncclSystemError;
@@ -382,12 +394,12 @@ retry:
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1
static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) {
int bytes = 0;
char* data = (char*)ptr;
do {
if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
if (op == NCCL_SOCKET_RECV && bytes == 0) {
WARN("Net : Connection closed by remote peer");
return ncclSystemError;
@@ -405,9 +417,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off
return ncclSuccess;
}
// Forwards to socketProgressOpt() with block == 0, i.e. the send/recv calls
// are issued with MSG_DONTWAIT. `op` is NCCL_SOCKET_SEND or NCCL_SOCKET_RECV;
// *offset is the progress counter passed through to socketProgressOpt().
static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
return socketProgressOpt(op, fd, ptr, size, offset, 0);
}
static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) {
while (*offset < size)
NCCLCHECK(socketProgress(op, fd, ptr, size, offset));
NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1));
return ncclSuccess;
}
+22 -66
Просмотреть файл
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,78 +11,35 @@
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <iostream>
#include <fstream>
#include <string>
#include <stdio.h>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
ncclResult_t getCudaPath(int cudaDev, char** path);
// Reports whether /proc/cpuinfo describes an "AuthenticAMD" CPU whose
// "cpu family" value is at least 23. The answer is computed on the first call
// and cached in function-local statics for every later call.
// Returns false when /proc/cpuinfo cannot be read or lacks a usable
// "cpu family" line.
static bool isEPYC() {
  static bool vendor_id = true, cpu_family = false, initialized = false;
  // Check the cache before touching the file so repeated calls do no I/O.
  if (initialized) return (vendor_id && cpu_family);

  std::ifstream cpuinfo("/proc/cpuinfo");
  std::string line;
  int needed = 2;  // one "vendor_id" line and one "cpu family" line
  while (needed > 0 && std::getline(cpuinfo, line)) {
    if (line.compare(0, 9, "vendor_id") == 0) {
      if (line.find("AuthenticAMD") == std::string::npos)
        vendor_id = false;
      needed--;
    } else if (line.compare(0, 10, "cpu family") == 0) {
      // Only parse when the "key: value" separator is present; strtol yields 0
      // for non-numeric text instead of throwing like std::stoi.
      const std::string::size_type sep = line.find(": ");
      if (sep != std::string::npos && strtol(line.c_str() + sep + 2, NULL, 10) >= 23)
        cpu_family = true;
      needed--;
    }
  }
  initialized = true;
  return (vendor_id && cpu_family);
}
static int getNumaId(char *path) {
char npath[PATH_MAX];
snprintf(npath, PATH_MAX, "%s/numa_node", path);
npath[PATH_MAX-1] = '\0';
static ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[BUSID_SIZE];
CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
*path = realpath(busPath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", busPath);
return ncclSystemError;
}
return ncclSuccess;
int numaId = -1;
FILE *file = fopen(npath, "r");
if (file == NULL) return -1;
if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
fclose(file);
return numaId;
}
enum ncclPathDist {
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_SOC = 3
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_NODE = 3,
PATH_SYS = 4,
PATH_ARRAY_SIZE = 5
};
static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
extern const char* pathDists[PATH_ARRAY_SIZE];
// Classifies how far apart two path strings are by counting '/' separators.
//   depth : number of '/' characters in path1
//   score : number of '/' characters inside the prefix shared with path2
// The result is one of the PATH_* distance values:
//   score <= 3          -> PATH_PHB when isEPYC() is true, otherwise PATH_SOC
//   score == 4          -> PATH_PHB
//   score == depth - 1  -> PATH_PIX
//   anything else       -> PATH_PXB
static int pciDistance(char* path1, char* path2) {
  int score = 0;
  int depth = 0;
  int same = 1;
  // Walk path1 up to its terminator (no strlen() per iteration).
  for (int i=0; path1[i] != '\0'; i++) {
    // Stop looking at path2 once the paths have diverged: if path2 is the
    // shorter string its terminator is the first mismatch, so it is never
    // indexed past its end.
    if (same == 1 && path1[i] != path2[i]) same = 0;
    if (path1[i] == '/') {
      depth++;
      if (same == 1) score++;
    }
  }
  if (isEPYC() && score <= 3) return PATH_PHB;
  if (score <= 3) return PATH_SOC;
  if (score == 4) return PATH_PHB;
  if (score == depth-1) return PATH_PIX;
  return PATH_PXB;
}
int pciDistance(char* path1, char* path2);
#endif
+50 -42
Просмотреть файл
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,7 +8,9 @@
#define NCCL_TRANSPORT_H_
#include "nccl.h"
#include "devcomm.h"
#include <stdint.h>
#include "nvmlwrap.h"
#define NTRANSPORTS 3
@@ -19,11 +21,13 @@ struct ncclRing;
struct ncclConnector;
struct ncclComm;
#define RANK_INFO_SIZE 64
typedef char ncclTinfo_t[RANK_INFO_SIZE];
struct ncclInfo {
ncclTinfo_t tinfo[NTRANSPORTS];
struct ncclPeerInfo {
int rank;
int cudaDev;
int nvmlDev;
uint64_t hostHash;
uint64_t pidHash;
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
};
// Used to hold the transport connection values
@@ -34,18 +38,47 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
struct ncclProxyArgs {
struct ncclRing* ring;
int substeps;
proxyProgressFunc_t progress;
struct ncclChannel* channel;
struct ncclConnector* connector;
int sliceSteps;
int chunkSteps;
int nsteps;
uint64_t opCount;
int llMode;
bool needProxy;
int active; // add component before this line -- it is left out during initialization
int state; // add component before this line -- it is left out during initialization
// Internal state
uint64_t head;
uint64_t tail;
uint64_t end;
void* requests[NCCL_STEPS];
int idle;
// Element linking
pthread_mutex_t mutex;
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
};
struct ncclProxyPool;
// Bookkeeping shared by the proxy code: a condition variable and mutex, a stop
// flag, and three list heads.
// NOTE(review): the roles below are inferred from the field names only; the
// code that uses them is not visible here, so confirm before relying on them.
struct ncclProxyState {
pthread_cond_t cond;
pthread_mutex_t mutex;
bool stop; // presumably asks the proxy thread to exit -- TODO confirm
struct ncclProxyArgs* ops; // presumably the list of pending operations (ncclProxyArgs has next/nextPeer links)
struct ncclProxyArgs* pool; // presumably a free list of reusable ncclProxyArgs
struct ncclProxyPool* pools; // opaque here: ncclProxyPool is only forward-declared
};
struct ncclTransportComm {
ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -53,8 +86,7 @@ struct ncclTransportComm {
struct ncclTransport {
const char name[4];
ncclResult_t (*fillInfo)(ncclTinfo_t*, int, uint64_t);
ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
@@ -64,37 +96,17 @@ struct ncclTransport {
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
struct transportProxyInfo {
struct ncclComm* comm;
pthread_t thread;
threadFunc_t func;
volatile int proxyReady;
struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
volatile uint64_t argsFifoHead;
volatile uint64_t argsFifoTail;
pthread_cond_t cond;
pthread_mutex_t mutex;
};
ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
// A proxy "pattern" is a single int that encodes a mode and a root rank:
//   0          -> ring
//   1 + root   -> "from" root (strictly positive)
//   -1 - root  -> "to" root   (strictly negative)
enum proxyMode {
  proxyRing = 0,
  proxyFrom = 1,
  proxyTo = 2
};

// Pattern value for the ring mode.
static int proxyPatternRing = proxyRing;

// Encodes "from root" as a strictly positive pattern.
static inline int proxyPatternFrom(int root) {
  return root + 1;
}

// Encodes "to root" as a strictly negative pattern.
static inline int proxyPatternTo(int root) {
  return -root - 1;
}

// Recovers the mode from the sign of the pattern.
static inline enum proxyMode proxyPatternMode(int pattern) {
  if (pattern == 0) return proxyRing;
  if (pattern > 0) return proxyFrom;
  return proxyTo;
}

// Recovers the root rank from an encoded pattern.
static inline int proxyPatternRoot(int pattern) {
  if (pattern > 0) return pattern - 1;
  return -pattern - 1;
}
ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
ncclResult_t transportStartProxies(struct ncclComm* comm);
ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
ncclResult_t transportStartProxy(struct ncclComm* comm);
ncclResult_t transportCreateProxy(struct ncclComm* comm);
ncclResult_t transportDestroyProxy(struct ncclComm* comm);
#include <unistd.h>
@@ -106,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
}
}
inline void transportProxyIdle(int idle) {
sched_yield();
}
#endif
+13
Просмотреть файл
@@ -0,0 +1,13 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TREES_H_
#define NCCL_TREES_H_
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
#endif
+4 -3
Просмотреть файл
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +11,7 @@
#include "nccl.h"
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen);
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getnHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
@@ -21,6 +22,6 @@ struct netIf {
};
int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
#endif
Разница между файлами не показана из-за своего большого размера Загрузить разницу
-970
Просмотреть файл
@@ -1,970 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
#include "ring.h"
#include "param.h"
#include "nvmlwrap.h"
#include "rings.h"
#include "bootstrap.h"
#include "transport.h"
#include "common_coll.h"
#include "group.h"
#include "utils.h"
#include "net.h"
#include "topo.h"
#include <numa.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <hip/hip_runtime_api.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <dlfcn.h>
#define STR2(v) #v
#define STR(v) STR2(v)
int ncclDebugLevel;
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
pthread_mutex_t ncclDebugOutputLock;
FILE *ncclDebugFile = stdout;
#ifdef ENABLE_TRACE
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
#endif
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
ncclNet_t* ncclNet = NULL;
// Declared weak so that tests can provide their own definition.
#pragma weak ncclCudaCompCap
// Returns the major compute capability of the current device, or 0 when either
// the device query or the attribute query fails.
int ncclCudaCompCap() {
  int device = 0;
  int major = 0;
  const bool ok =
      hipGetDevice(&device) == hipSuccess &&
      hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, device) == hipSuccess;
  return ok ? major : 0;
}
// Returns major*10 + minor compute capability of the current device, or 0 when
// any of the underlying HIP queries fails.
int ncclCudaFullCompCap() {
  int device = 0;
  if (hipGetDevice(&device) != hipSuccess) return 0;

  int major = 0;
  int minor = 0;
  const bool haveMajor =
      hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, device) == hipSuccess;
  if (!haveMajor) return 0;
  const bool haveMinor =
      hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, device) == hipSuccess;
  if (!haveMinor) return 0;

  return 10 * major + minor;
}
// Initializes one network backend and checks that it exposes a device.
// Returns ncclInternalError when init() or devices() fails and ncclSystemError
// when devices() reports no device; callers treat any failure as "ignore this
// network".
ncclResult_t initNet(ncclNet_t* net) {
  int ndev;
  // devices() is only queried once init() has succeeded.
  if (net->init(ncclDebugLog) != ncclSuccess || net->devices(&ndev) != ncclSuccess) {
    return ncclInternalError;
  }
  if (ndev > 0) return ncclSuccess;

  INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name);
  return ncclSystemError;
}
// Tries to load an external network plugin from libnccl-net.so.
// On success *net points at the plugin's ncclNet_t; in every other case *net is
// left untouched. The function always returns ncclSuccess: a missing or
// unusable plugin is not an error.
ncclResult_t initNetPlugin(ncclNet_t** net) {
  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
  if (netPluginLib == NULL) {
    // dlopen is not guaranteed to set errno, but dlerror only yields a string,
    // so errno is consulted on a best-effort basis to pick a nicer message.
    if (errno != ENOENT) {
      INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror());
    } else {
      INFO(NCCL_INIT|NCCL_NET, "No network plugin found.");
    }
    return ncclSuccess;
  }

  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
  if (extNet == NULL) {
    INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
  } else if (initNet(extNet) == ncclSuccess) {
    // The library stays loaded: the returned table lives inside it.
    *net = extNet;
    return ncclSuccess;
  }

  // Symbol missing or plugin failed to initialize: unload the library.
  dlclose(netPluginLib);
  return ncclSuccess;
}
// Selects the network backend stored in the global ncclNet.
// Order of preference: external plugin, then ncclNetIb, then ncclNetSocket.
ncclResult_t initNet() {
  // The socket backend is always initialized because bootstrap relies on it.
  NCCLCHECK(initNet(&ncclNetSocket));

  NCCLCHECK(initNetPlugin(&ncclNet));
  if (ncclNet == NULL) {
    // No plugin was picked up: fall back to the built-in backends.
    ncclNet = (initNet(&ncclNetIb) == ncclSuccess) ? &ncclNetIb : &ncclNetSocket;
    INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName());
  } else {
    INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName());
  }
  return ncclSuccess;
}
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
// Returns the thread threshold to use.
// A value supplied through the THREAD_THRESHOLD parameter is returned as is.
// Otherwise (the parameter still holds its -2 default) the threshold is picked
// from minCompCap and doubled when multiNode is non-zero.
int ncclThreadThreshold(int minCompCap, int multiNode) {
  const int userValue = ncclParamThreadThreshold();
  if (userValue != -2) return userValue;

  int threshold = NCCL_THREAD_THRESHOLD;
  if (minCompCap <= 6) threshold = NCCL_THREAD_THRESHOLD_PREVOLTA;
  return multiNode ? threshold * 2 : threshold;
}
bool useFineGrainVramPcie = false;
// Reads HSA_FORCE_FINE_GRAIN_PCIE from the environment and stores it in the
// global useFineGrainVramPcie. Only the values 0 and 1 are accepted; an unset
// or empty variable leaves the global untouched, and anything else (including
// non-numeric text or trailing characters) is reported and ignored.
void parseHsaForceFineGrainVramPcie() {
  const char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
  if (str == NULL || strlen(str) == 0) return;

  errno = 0;
  char* end = NULL;
  int64_t v = strtoll(str, &end, 0);
  // end == str: no digits were consumed; *end != '\0': trailing garbage.
  const bool parsed = (errno == 0) && (end != str) && (*end == '\0');
  if (!parsed || (v != 0 && v != 1)) {
    INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", (unsigned)useFineGrainVramPcie);
  } else {
    useFineGrainVramPcie = (v == 1);
    INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", (unsigned)useFineGrainVramPcie);
  }
}
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
// One-time library initialization: runs initEnv(), initDebug(), initNet() and
// parseHsaForceFineGrainVramPcie() under initLock, then sets `initialized` so
// later calls return immediately. Always returns ncclSuccess.
// NOTE(review): the first read of `initialized` happens outside the lock and
// the flag is a plain bool -- confirm this is acceptable for the threading
// model in use.
// NOTE(review): the result of initNet() is discarded, so a failure to set up
// the network is not reported to the caller.
static ncclResult_t ncclInit() {
if (initialized) return ncclSuccess;
pthread_mutex_lock(&initLock);
// Re-check under the lock: another thread may have finished in the meantime.
if (!initialized) {
initEnv();
initDebug();
initNet();
// Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
parseHsaForceFineGrainVramPcie();
initialized = true;
}
pthread_mutex_unlock(&initLock);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
// Writes NCCL_VERSION_CODE to *version.
// Returns ncclInvalidArgument when version is NULL, ncclSuccess otherwise.
ncclResult_t ncclGetVersion(int* version) {
  if (version != NULL) {
    *version = NCCL_VERSION_CODE;
    return ncclSuccess;
  }
  return ncclInvalidArgument;
}
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
// Makes sure the library is initialized, validates `out`, then lets the
// bootstrap layer fill in the unique id. Any failing step's error code is
// returned to the caller.
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
  NCCLCHECK(ncclInit());
  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
  const ncclResult_t status = bootstrapGetUniqueId(out);
  return status;
}
// Releases everything owned by a communicator: the device-side copy, each
// ring, the done event, the group stream (GROUP launch mode only) and finally
// the host structure itself. A NULL comm is accepted and is a no-op.
// NOTE(review): CUDACHECK/NCCLCHECK appear to return early on failure; if so,
// the later resources and `comm` itself are not released on those paths.
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
CUDACHECK(hipFree(comm->devComm));
for (int ring=0; ring<comm->nRings; ring++)
NCCLCHECK(freeRing(comm->rings+ring));
if (comm->doneEvent != NULL)
CUDACHECK(hipEventDestroy(comm->doneEvent));
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(hipStreamDestroy(comm->groupStream));
}
// Last rank frees shared resources between threads
int isLast;
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
if (isLast) {
// These intra* arrays are the ones ncclCommSetIntra allocates on intra-rank 0
// and shares with the other ranks of the same process.
free(comm->intraBarrier);
free(comm->intraParams);
free(comm->intraCudaDevs);
free(comm->intraCGMode);
free(comm->intraCC);
}
free(comm);
return ncclSuccess;
}
// Allocates and pre-fills a host-side communicator for `rank` out of `ndev`.
// Returns ncclInvalidArgument for ndev < 1 or a rank outside [0, ndev);
// otherwise *comret receives the new communicator.
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  if (ndev < 1) {
    WARN("invalid device count (%d) requested", ndev);
    return ncclInvalidArgument;
  }
  if (rank < 0 || rank >= ndev) {
    WARN("rank %d exceeds ndev=%d", rank, ndev);
    return ncclInvalidArgument;
  }

  // Create a device object first: if something is wrong with the current
  // device (failure cause #1) it is better to find out right away.
  hipEvent_t doneEvent;
  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));

  struct ncclComm* comm;
  NCCLCHECK(ncclCalloc(&comm, 1));
  INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);

  comm->rank = rank;
  comm->nRanks = ndev;
  hipGetDevice(&comm->cudaDev);
  comm->doneEvent = doneEvent;
  comm->llThreshold = ncclParamLlThreshold();
  comm->checkPointers = (ncclParamCheckPointers() == 1);
#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
  comm->groupCudaStream = ncclParamGroupCudaStream();
#else
  // Older CUDA builds: the user is not allowed to override the default.
  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
#endif
  comm->argsptr = &comm->args;

  *comret = comm;
  return ncclSuccess;
}
// Mirrors the communicator on the device: allocates comm->devComm, copies the
// host structure into it, then copies each ring's userRanks array into that
// ring's devUserRanks.
static ncclResult_t devCommSetup(ncclComm_t comm) {
  // Device-side duplicate of the whole communicator.
  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
  NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));

  // Per-ring rank tables, nRanks entries each.
  int r = 0;
  while (r < comm->nRings) {
    struct ncclRing* ring = comm->rings + r;
    NCCLCHECK(ncclCudaMemcpy(ring->devUserRanks, ring->userRanks, comm->nRanks));
    r++;
  }
  return ncclSuccess;
}
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
#else
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
#endif
// Prints VERSION_STRING to stdout once, provided the debug level is at least
// NCCL_LOG_VERSION. When debug output goes to a file other than stdout the
// version is logged there as well.
static void showVersion() {
  static int shown = 0;
  if (shown != 0 || ncclDebugLevel < NCCL_LOG_VERSION) return;

  printf("%s\n", VERSION_STRING);
  fflush(stdout);
  if (ncclDebugFile != stdout) {
    INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
  }
  shown = 1;
}
// Asks every transport to fill in its own slot of info->tinfo for this rank.
// Stops at, and returns, the first transport error.
static ncclResult_t fillInfo(struct ncclInfo* info, int rank, uint64_t commHash) {
  int t = 0;
  while (t < NTRANSPORTS) {
    NCCLCHECK(ncclTransports[t].fillInfo(&info->tinfo[t], rank, commHash));
    t++;
  }
  return ncclSuccess;
}
bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice);
// Picks the first transport (in ncclTransports order) whose canConnect()
// reports a positive value for the given pair of ranks and runs its setup().
//   type == 1 selects the transport's send side, any other value its recv side.
// During setup() the calling thread is moved close to the current device via
// SetCpuAffinity(); its previous affinity mask is put back afterwards, whether
// setup() succeeded or not.
// On success *transportRet is the chosen transport. If no transport can
// connect, *transportRet is set to NULL and ncclInternalError is returned.
template <int type>
static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
  for (int t=0; t<NTRANSPORTS; t++) {
    struct ncclTransport *transport = ncclTransports+t;
    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
    ncclTvalue_t ret = 0;
    NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
    if (!(ret > 0)) continue;

    cpu_set_t affinitySave;
    nvmlDevice_t nvmlDevice;
    int cudaDev;
    CUDACHECK(hipGetDevice(&cudaDev));
    // Remember whether the mask was actually captured so that an
    // uninitialized mask is never written back.
    const bool affinitySaved = (sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave) == 0);
    SetCpuAffinity(cudaDev, &nvmlDevice);

    // Run setup() without an early return so the affinity is restored even
    // when it fails.
    ncclResult_t setupStatus = transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring);
    if (affinitySaved) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
    NCCLCHECK(setupStatus);

    *transportRet = transport;
    return ncclSuccess;
  }
  WARN("No transport found !");
  *transportRet = NULL;
  return ncclInternalError;
}
// Initializes ring `ringid` of the communicator: stores the ring order rotated
// so that this rank comes first, selects a recv transport towards the previous
// rank and a send transport towards the next rank, then creates both proxies.
// connect[0] receives the recv-side connection data, connect[1] the send side.
static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
  NCCLCHECK(initRing(comm, ringid));
  struct ncclRing* ring = comm->rings+ringid;

  // Locate this rank in the ring order.
  int shift = 0;
  while (shift < nranks && ringRanks[shift] != rank) shift++;

  // Rotate the order so that userRanks[0] is this rank.
  for (int i=0; i<nranks; i++) {
    const int src = (i+shift)%nranks;
    ring->userRanks[i] = ringRanks[src];
  }

  const int prev = ring->userRanks[nranks-1];
  const int next = ring->userRanks[1];
  struct ncclInfo* self = allInfo+rank;
  NCCLCHECK(selectTransport<0>(self, allInfo+prev, connect+0, &ring->recv.transport, ring));
  NCCLCHECK(selectTransport<1>(self, allInfo+next, connect+1, &ring->send.transport, ring));
  NCCLCHECK(transportCreateProxy(0, ring, comm));
  NCCLCHECK(transportCreateProxy(1, ring, comm));
  return ncclSuccess;
}
// For every peer r, records in connectTransport[r] the index of the first
// transport whose canConnect() leaves a positive value in connectValue[r], or
// -1 when no transport does. connectValue[r] keeps whatever the last queried
// transport wrote.
static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
  for (int r=0; r<nranks; r++) {
    connectTransport[r] = -1;
    int t = 0;
    while (t < NTRANSPORTS) {
      NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
      if (connectValue[r] > 0) {
        connectTransport[r] = t;
        break;
      }
      t++;
    }
  }
  return ncclSuccess;
}
// Exchanges the contents of two non-overlapping memory regions of `size`
// bytes. The swap is done byte by byte so no temporary buffer (and no
// variable-length array, which is not standard C++) is needed; a size of zero
// or less is a no-op.
static void swap(void* mem1, void* mem2, int size) {
  char* a = (char*)mem1;
  char* b = (char*)mem2;
  for (int i = 0; i < size; i++) {
    const char tmp = a[i];
    a[i] = b[i];
    b[i] = tmp;
  }
}
#define MAXWIDTH 64
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
// Logs an nranks x nranks int matrix as text: one header line with the column
// indices followed by one line per row. At most MAXWIDTH columns are printed
// per line; every row is printed.
void dumpMatrix(int* connectMatrix, int nranks) {
  const int cols = (nranks < MAXWIDTH) ? nranks : MAXWIDTH;
  char line[STRLENGTH+1];

  // Header: blank row label, then the column numbers in 4-character cells.
  memset(line, ' ', STRLENGTH);
  line[STRLENGTH] = '\0';
  for (int j=0; j<cols; j++) {
    char* cell = line + 4 + 4*j;
    sprintf(cell, " %3d", j);
  }
  INFO(NCCL_INIT,"%s", line);

  for (int i=0; i<nranks; i++) {
    const int* row = connectMatrix + i*nranks;
    memset(line, ' ', STRLENGTH);
    sprintf(line, "%3d ", i);
    for (int j=0; j<cols; j++) {
      char* cell = line + 4 + 4*j;
      sprintf(cell, " %3d", row[j]);
    }
    INFO(NCCL_INIT,"%s", line);
  }
}
// Same layout as dumpMatrix but for ncclTvalue_t entries: cells are five
// characters wide and each value is truncated to int and printed in octal.
void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
  const int cols = (nranks < MAXWIDTH) ? nranks : MAXWIDTH;
  char line[STRLENGTH+1];

  // Header: blank row label, then the column numbers.
  memset(line, ' ', STRLENGTH);
  line[STRLENGTH] = '\0';
  for (int j=0; j<cols; j++) {
    char* cell = line + 4 + 5*j;
    sprintf(cell, " %4d", j);
  }
  INFO(NCCL_INIT,"%s", line);

  for (int i=0; i<nranks; i++) {
    const ncclTvalue_t* row = connectMatrix + i*nranks;
    memset(line, ' ', STRLENGTH);
    sprintf(line, "%3d ", i);
    for (int j=0; j<cols; j++) {
      char* cell = line + 4 + 5*j;
      sprintf(cell, " %4o", (int)row[j]);
    }
    INFO(NCCL_INIT,"%s", line);
  }
}
// Logs `prefix` followed by up to MAXWIDTH values from `values`, each in a
// 4-character cell. At most PREFIXLEN characters of the prefix are copied,
// while the first cell is placed at strlen(prefix).
void dumpLine(int* values, int nranks, const char* prefix) {
  char line[STRLENGTH+1];
  memset(line, ' ', STRLENGTH);
  line[STRLENGTH] = '\0';
  strncpy(line, prefix, PREFIXLEN);

  char* cursor = line + strlen(prefix);
  const int count = (nranks < MAXWIDTH) ? nranks : MAXWIDTH;
  for (int i=0; i<count; i++, cursor += 4) {
    sprintf(cursor, " %3d", values[i]);
  }
  INFO(NCCL_INIT,"%s", line);
}
// Expands the per-ring `next` tables into explicit rank sequences.
// For each ring r, rings[r*nranks .. r*nranks+nranks-1] receives the order
// obtained by starting at `rank` and following next[] nranks times. Rank 0
// logs every ring. Returns ncclInternalError if a ring does not loop back to
// `rank` or does not contain every rank in [0, nranks).
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
  for (int r=0; r<nrings; r++) {
    int* ring = rings + r*nranks;
    const int* ringNext = next + r*nranks;

    // Walk the ring starting from this rank.
    int current = rank;
    for (int i=0; i<nranks; i++) {
      ring[i] = current;
      current = ringNext[current];
    }

    char prefix[30];
    sprintf(prefix, "Ring %02d : ", r);
    if (rank == 0) dumpLine(ring, nranks, prefix);

    if (current != rank) {
      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
      return ncclInternalError;
    }

    // Every rank must appear somewhere in the ring.
    for (int wanted=0; wanted<nranks; wanted++) {
      int pos = 0;
      while (pos < nranks && ring[pos] != wanted) pos++;
      if (pos == nranks) {
        WARN("Error : ring %d does not contain rank %d", r, wanted);
        return ncclInternalError;
      }
    }
  }
  return ncclSuccess;
}
// Treats `p` as the address of a pointer slot and yields the CPU until the
// slot holds a non-NULL value, which is then returned.
void* waitForNonNullPtr(void* p) {
  volatile void** ptr = (volatile void**) p;
  for (;;) {
    if (LOAD(ptr) != NULL) break;
    sched_yield();
  }
  return (void*)LOAD(ptr);
}
// Points comm->myParams at this rank's slot of the shared intraParams array and
// resets that slot: args refer to comm->argsptr, no stream, no shared memory,
// and block/grid dimensions of (0, 1, 1).
ncclResult_t initParams(struct ncclComm* comm) {
  comm->myParams = comm->intraParams + comm->intraRank;
  hipLaunchParams* params = comm->myParams;

  params->args = (void **)&comm->argsptr;
  params->stream = NULL;
  params->sharedMem = 0;

  params->blockDim.x = 0;
  params->blockDim.z = 1;
  params->blockDim.y = params->blockDim.z;

  params->gridDim.x = 0;
  params->gridDim.z = 1;
  params->gridDim.y = params->gridDim.z;
  return ncclSuccess;
}
// Allocate/Set Intra Process Structures and set CG options
//   comm  : communicator being set up
//   rank  : index of this communicator among those in the same process
//   ranks : number of communicators in the same process
//   comm0 : communicator of intra-rank 0, which owns the shared allocations
// Intra-rank 0 allocates the shared arrays; the other ranks spin in
// waitForNonNullPtr() until rank 0 has stored each pointer in comm0.
ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
comm->intraRank = rank;
comm->intraRanks = ranks;
comm->intraPhase = 0;
// Alloc shared structures
if (rank == 0) {
assert(comm == comm0);
int* bar;
NCCLCHECK(ncclCalloc(&bar, 2));
bar[0] = bar[1] = 0;
comm->intraBarrier = bar;
NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
int* CGMode;
NCCLCHECK(ncclCalloc(&CGMode, 1));
// NOTE(review): meaning of the 0x11 / 0x10 encodings is not visible here --
// confirm against the code that reads intraCGMode.
*CGMode = 0x11;
comm->intraCGMode = CGMode;
int* CC;
NCCLCHECK(ncclCalloc(&CC, 1));
*CC = ncclCudaFullCompCap();
comm->intraCC = CC;
} else {
// Each field is published by rank 0 in the branch above; wait for all of them.
comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
}
comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
NCCLCHECK(initParams(comm));
int cgMdLaunch = 1;
// Set CG Mode
// GROUP is the default; PARALLEL is used when this is the only rank in the
// process or when NCCL_LAUNCH_MODE=PARALLEL is set in the environment.
comm->launchMode = ncclComm::GROUP;
char* str = getenv("NCCL_LAUNCH_MODE");
if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
comm->launchMode = ncclComm::PARALLEL;
}
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
#if CUDART_VERSION >= 9000
if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
// Check whether the GPU supports Cooperative Group Multi Device Launch
(void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
}
#endif
}
// Disable cgMdLaunch if any rank does not support it
if (cgMdLaunch == 0) {
*comm->intraCGMode = 0x10;
}
return ncclSuccess;
}
// Performs the multi-rank part of communicator initialization for one rank:
//   1. bootstrap with the other ranks and all-gather per-rank transport info,
//   2. all-gather the connectivity matrix and compute the rings,
//   3. agree on thread count, minimum compute capability and ring count,
//   4. set up and connect every ring with its previous/next rank,
//   5. work out which ranks share this process and set the intra-process state.
// Every bootstrapAllGather() call is a collective step, so the sequence of
// calls below has to stay identical on all ranks.
// NOTE(review): NCCLCHECK appears to return early on failure; if so, the
// temporary arrays allocated here are not released on those paths.
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
int rank = comm->rank;
int nranks = comm->nRanks;
void* commState;
uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
// Gather the transport info of every rank.
struct ncclInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
NCCLCHECK(fillInfo(allInfo+rank, rank, commHash));
NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
// Each rank fills its own row of the nranks x nranks connectivity matrices,
// then the rows are exchanged.
int* connectTransport;
ncclTvalue_t* connectValue;
NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
//if (rank == 0) dumpMatrix(connectTransport, nranks);
//if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
// Get my rings
int nrings;
int* prev, *next;
NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
comm->nThreads = getDefaultThreads();
NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
free(connectTransport);
free(connectValue);
// Find max nThreads
// allData is a scratch array reused for the three int all-gathers below.
int allData[nranks];
allData[rank] = comm->nThreads;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
comm->nThreads = std::max(allData[i], comm->nThreads);
if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
// Determine the minimum CUDA Compute capability of all GPUs
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
allData[rank] = myCompCap;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
minCompCap = std::min(allData[i], minCompCap);
if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
// Find min nrings across ranks
allData[rank] = nrings;
NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
nrings = std::min(allData[i], nrings);
// Exchange data with others to build complete rings
comm->nRings = nrings;
for (int r=0; r<nrings; r++) {
NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
}
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
free(prev);
free(next);
// Connect with prev/next for each ring
// connectData holds two entries per rank: index 2*r is rank r's recv-side data
// and 2*r+1 its send-side data (see setupRing, which fills connect+0 / connect+1).
struct ncclConnect *connectData;
NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
for (int r=0; r<nrings; r++) {
int* ringRanks = rings+r*nranks;
struct ncclRing *ring = comm->rings+r;
NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
// We send to `next`, so we need next's recv-side data (even slot); we receive
// from `prev`, so we need prev's send-side data (odd slot).
int prev_offset = ring->userRanks[nranks-1]*2+1;
int next_offset = ring->userRanks[1]*2;
NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
}
free(connectData);
free(rings);
free(allInfo);
// Intra-process barrier setup
struct rankInfo {
uint64_t hostHash;
uint64_t pidHash;
struct ncclComm* comm;
} rankInfos[nranks];
rankInfos[rank].hostHash = getHostHash();
rankInfos[rank].pidHash = getPidHash();
rankInfos[rank].comm = comm;
NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
// Compute intra ranks
// Ranks with the same host and pid hash live in this process; any rank with a
// different host hash marks the job as multi-node.
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
int multiNode = 0;
for (int r=0; r<nranks; r++) {
if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
(rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
if (intraRanks == 0) intraRank0 = r;
if (r == rank) intraRank = intraRanks;
intraRanks++;
} else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
multiNode = 1;
}
}
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
// Determine thread threshold across all GPUs
comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
// Barrier
bootstrapClose(commState);
return ncclSuccess;
}
// Moves the calling thread (and its preferred memory node) close to device
// `cudaDev`.
//   HIP builds : reads <device sysfs path>/numa_node and applies that node with
//                numa_run_on_node() / numa_set_preferred(). nvmlDevice is unused.
//   other      : resolves the device through NVML by PCI bus id, stores the
//                handle in *nvmlDevice and asks NVML to set the CPU affinity.
// Returns true on success and false on any failure; no error code escapes as a
// (truthy) bool and no temporary buffer is leaked on the failure paths.
bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
  if (numa_available() < 0) {
    WARN("System does not support NUMA API!");
    return false;
  }

  char* cudaPath = NULL;
  if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess || cudaPath == NULL) return false;

  // Build "<cudaPath>/numa_node" in a buffer of our own: the buffer handed
  // back by getCudaPath is not guaranteed to have room for a suffix.
  static const char suffix[] = "/numa_node";
  char* numaPath = (char*)malloc(strlen(cudaPath) + sizeof(suffix));
  if (numaPath == NULL) {
    free(cudaPath);
    return false;
  }
  strcpy(numaPath, cudaPath);
  strcat(numaPath, suffix);
  free(cudaPath);

  int fd;
  do {
    fd = open(numaPath, O_RDONLY);
  } while (fd == -1 && errno == EINTR);
  free(numaPath);
  if (fd == -1) {
    WARN("Call to open failed : %s", strerror(errno));
    return false;
  }

  char numa_node[5];
  ssize_t len;
  do {
    len = read(fd, numa_node, sizeof(numa_node)-1);
  } while (len == -1 && errno == EINTR);
  const int readErrno = errno;
  close(fd);
  if (len <= 0) {
    WARN("Call to read failed : %s", len < 0 ? strerror(readErrno) : "no data");
    return false;
  }
  // read() does not terminate the buffer; strtol needs a C string.
  numa_node[len] = '\0';

  errno = 0;
  long node = strtol(numa_node, NULL, 10);
  if (errno == ERANGE || errno == EINVAL) {
    INFO(NCCL_ALL,"%s: Call to strtol returned %s", __func__, strerror(errno));
    return false;
  }
  numa_run_on_node(node);
  numa_set_preferred(node);
  return true;
#else
  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
  if (hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != hipSuccess) return false;
  if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
  if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
    WARN("Failed to set CPU affinity");
    return false;
  }
  return true;
#endif
}
// Synchronous communicator creation for one rank: saves the caller's CPU
// affinity, moves the thread close to the current device, allocates the
// communicator, runs the transport setup, copies it to the device, then
// restores the affinity. On failure *newcomm is set to NULL and the failing
// step's error code is returned.
// NOTE(review): the early NCCLCHECKs (wrapNvmlSymbols / wrapNvmlInit) and the
// CUDACHECK return before the cleanup label exists, which is harmless only
// because the affinity has not been changed yet at that point.
// NOTE(review): the cleanup path nulls *newcomm without calling commFree(), so
// a communicator allocated by commAlloc() appears to be leaked when a later
// step fails -- confirm.
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
cpu_set_t affinitySave;
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECK(wrapNvmlSymbols());
NCCLCHECK(wrapNvmlInit());
// Make sure all host memory allocation are close to the GPU
int cudaDev;
nvmlDevice_t nvmlDevice;
CUDACHECK(hipGetDevice(&cudaDev));
SetCpuAffinity(cudaDev, &nvmlDevice);
ncclResult_t res;
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
// Success path: put the caller's affinity back before shutting NVML down.
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
return ncclSuccess;
cleanup:
*newcomm = NULL;
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
/* Public entry point creating the communicator of a single rank.
 *
 * Initialises the library, validates the arguments and then either runs
 * ncclCommInitRankSync directly or, when ncclAsyncMode() reports true, hands
 * it to ncclAsyncInit together with the current device.
 */
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
  const bool isRankZero = (myrank == 0);

  // When NCCL_COMM_ID is set, rank 0 creates the bootstrap root itself.
  const char* commIdEnv = getenv("NCCL_COMM_ID");
  if (commIdEnv != NULL && isRankZero) {
    NCCLCHECK(bootstrapCreateRoot(&commId, true));
  }

  NCCLCHECK(ncclInit());
  if (isRankZero) {
    showVersion();
  }
  INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);

  // Make sure the CUDA runtime is initialized.
  CUDACHECK(hipFree(NULL));

  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
  const bool rankInRange = (nranks >= 1) && (myrank >= 0) && (myrank < nranks);
  if (!rankInRange) {
    WARN("Invalid rank requested : %d/%d", myrank, nranks);
    return ncclInvalidArgument;
  }

  if (!ncclAsyncMode()) {
    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
  }

  int cudaDev;
  CUDACHECK(hipGetDevice(&cudaDev));
  return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
}
/* Set up rings and transport connections for all communicators of one process.
 *
 * comms  : nranks communicators, one per device, already allocated.
 * devs   : device index for each rank.
 * nranks : number of ranks (== number of devices).
 *
 * Every temporary buffer is released through the single `cleanup` exit, so a
 * failure in any step no longer leaks the buffers allocated before it.
 */
static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
  // All locals that need an initial value are declared before the first
  // goto so that no jump to `cleanup` crosses an initialization.
  ncclResult_t res = ncclSuccess;
  struct ncclInfo* allInfo = NULL;
  int* connectTransport = NULL;
  ncclTvalue_t* connectValue = NULL;
  int* prev = NULL;
  int* prevFinal = NULL;
  int* next = NULL;
  int* nextFinal = NULL;
  int* rings = NULL;
  int nrings = MAXRINGS;
  int nthreads = 0;
  int myCompCap = 0;
  int minCompCap = 0;
  int threadThreshold = 0;

  NCCLCHECKGOTO(ncclCalloc(&allInfo, nranks), res, cleanup);
  for (int rank=0; rank<nranks; rank++) {
    CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
    NCCLCHECKGOTO(fillInfo(allInfo+rank, rank, 0), res, cleanup);
  }

  // nranks x nranks connectivity tables, one row per rank.
  NCCLCHECKGOTO(ncclCalloc(&connectTransport, nranks*nranks), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&connectValue, nranks*nranks), res, cleanup);
  for (int rank=0; rank<nranks; rank++) {
    NCCLCHECKGOTO(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank), res, cleanup);
  }

  NCCLCHECKGOTO(ncclCalloc(&prev, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&prevFinal, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&next, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&nextFinal, nranks*MAXRINGS), res, cleanup);

  myCompCap = ncclCudaCompCap();
  minCompCap = myCompCap;
  for (int rank=0; rank<nranks; rank++) {
    CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
    int nringsRank;
    int nthreadsRank = getDefaultThreads();
    myCompCap = ncclCudaCompCap();
    NCCLCHECKGOTO(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next), res, cleanup);
    // Keep the most restrictive ring count and capability, the largest thread count.
    nrings = std::min(nrings, nringsRank);
    nthreads = std::max(nthreads, nthreadsRank);
    minCompCap = std::min(minCompCap, myCompCap);
    for (int ring=0; ring<nrings; ring++) {
      int index = ring*nranks+rank;
      prevFinal[index] = prev[index];
      nextFinal[index] = next[index];
    }
  }

  INFO(NCCL_INIT,"Using %d threads", nthreads);
  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);

  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal), res, cleanup);

  // Determine thread threshold across all GPUs
  threadThreshold = ncclThreadThreshold(minCompCap, 0);
  for (int rank=0; rank<nranks; rank++) {
    comms[rank]->nRings = nrings;
    comms[rank]->nThreads = nthreads;
    comms[rank]->threadThreshold = threadThreshold;
  }

  for (int r=0; r<nrings; r++) {
    // Two slots per rank: [2*rank] is the recv side, [2*rank+1] the send side.
    struct ncclConnect connect[2*nranks];
    int* ringRanks = rings+r*nranks;
    for (int rank=0; rank<nranks; rank++) {
      CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
      NCCLCHECKGOTO(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank), res, cleanup);
    }
    // RingExchange connect information
    for (int rank=0; rank<nranks; rank++) {
      // Swap rank->prev and prevRank->next
      struct ncclRing *ring = comms[rank]->rings+r;
      int prevRank = ring->userRanks[nranks-1];
      struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
      struct ncclConnect* rankPrevConnect = connect+2*rank;
      swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
    }
    for (int rank=0; rank<nranks; rank++) {
      CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
      struct ncclRing *ring = comms[rank]->rings+r;
      NCCLCHECKGOTO(ring->send.transport->send.connect(connect+2*rank+1, &ring->send), res, cleanup);
      NCCLCHECKGOTO(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv), res, cleanup);
    }
  }
  res = ncclSuccess;

cleanup:
  // free(NULL) is a no-op, so buffers that were never allocated are fine here.
  free(connectTransport);
  free(connectValue);
  free(prev);
  free(next);
  free(prevFinal);
  free(nextFinal);
  free(rings);
  free(allInfo);
  return res;
}
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
/* Create one communicator per device inside a single process.
 *
 * comms   : array of ndev entries receiving the communicators. On failure
 *           every entry is NULL, so callers never see freed pointers.
 * ndev    : number of devices; must be >= 1.
 * devlist : device index for each rank, or NULL to use 0..ndev-1.
 *
 * The caller's current device and CPU affinity are restored before
 * returning, and NVML is shut down on every exit taken after wrapNvmlInit()
 * succeeded.
 */
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  NCCLCHECK(ncclInit());
  NCCLCHECK(wrapNvmlSymbols());
  NCCLCHECK(wrapNvmlInit());
  showVersion();
  INFO(NCCL_INIT,"nranks %d", ndev);

  // Validate the arguments without an early-return macro: NVML has been
  // initialised above and must be shut down again before bailing out.
  ncclResult_t res = PtrCheck(comms, "CommInitAll", "comms");
  if (res == ncclSuccess && ndev < 1) {
    WARN("Invalid device count requested : %d", ndev);
    res = ncclInvalidArgument;
  }
  if (res != ncclSuccess) {
    if(wrapNvmlShutdown() != ncclSuccess)
      INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
    return res;
  }

  int savedDevice = -1;
  int rank, cudaDev;
  ncclComm_t comm = NULL;
  nvmlDevice_t nvmlDevice;
  int ncclDevList[ndev];
  for (int i=0; i<ndev; i++) {
    ncclDevList[i] = devlist ? devlist[i] : i;
  }

  // Remember the caller's device; -1 means it could not be queried and
  // therefore must not be restored at the end.
  if (hipGetDevice(&savedDevice) != hipSuccess) savedDevice = -1;

  for(rank=0; rank<ndev; ++rank)
    comms[rank] = NULL;

  cpu_set_t affinitySave;
  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);

  for (rank=0; rank<ndev; ++rank) {
    cudaDev = ncclDevList[rank];
    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);

    SetCpuAffinity(cudaDev, &nvmlDevice);

    NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
    comms[rank] = comm;

    NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
  }

  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);

  NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);

  for(rank=0; rank<ndev; ++rank) {
    cudaDev = ncclDevList[rank];
    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
  }

  res = ncclSuccess;
  goto final;

cleanup:
  for(rank=0; rank<ndev; ++rank) {
    if(comms[rank] != NULL) {
      commFree(comms[rank]);
      // Do not hand a freed communicator back to the caller.
      comms[rank] = NULL;
    }
  }

final:
  if(wrapNvmlShutdown() != ncclSuccess)
    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
  if (savedDevice >= 0)
    hipSetDevice(savedDevice);
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return res;
}
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
/* Free a communicator.
 *
 * A NULL communicator is accepted and reported as success. The communicator
 * is freed with its own device selected; the caller's device is restored
 * afterwards whether or not commFree succeeded, so a failure no longer
 * leaves the calling thread on a different device.
 */
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

  int savedDevice;
  CUDACHECK(hipGetDevice(&savedDevice));
  int commDevice = comm->cudaDev;

  if (savedDevice != commDevice) {
    CUDACHECK(hipSetDevice(commDevice));
  }

  ncclResult_t res;
  NCCLCHECKGOTO(commFree(comm), res, restore);

restore:
  // Reached on both success and failure of commFree.
  if (savedDevice != commDevice)
    CUDACHECK(hipSetDevice(savedDevice));

  return res;
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
/* Map a result code to a static, human-readable description.
 * Codes not listed below yield "unknown result code".
 */
const char* ncclGetErrorString(ncclResult_t code) {
  if (code == ncclSuccess)            return "no error";
  if (code == ncclUnhandledCudaError) return "unhandled cuda error";
  if (code == ncclSystemError)        return "unhandled system error";
  if (code == ncclInternalError)      return "internal error";
  if (code == ncclInvalidArgument)    return "invalid argument";
  if (code == ncclInvalidUsage)       return "invalid usage";
  return "unknown result code";
}
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
/* Store the number of ranks of `comm` in `*count`.
 * Both pointers are checked for NULL (comm first) before being used.
 */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
  NCCLCHECK(PtrCheck(count, "CommCount", "count"));

  const int rankCount = comm->nRanks;
  *count = rankCount;
  return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
/* Store the device index recorded in `comm` in `*devid`.
 * Both pointers are checked for NULL (comm first) before being used.
 */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));

  const int device = comm->cudaDev;
  *devid = device;
  return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
/* Store the rank recorded in `comm` in `*rank`.
 * Both pointers are checked for NULL (comm first) before being used.
 */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));

  const int userRank = comm->rank;
  *rank = userRank;
  return ncclSuccess;
}
+69
Просмотреть файл
@@ -0,0 +1,69 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "argcheck.h"
/* Validate a buffer pointer passed to a collective.
 *
 * pointer : address to inspect.
 * comm    : communicator whose device the buffer has to match.
 * ptrname : argument name used in the warning text.
 * opname  : operation name used in the warning text.
 *
 * Returns ncclInvalidArgument when the attribute query fails, when the
 * pointer has no device address, or when it is device memory that belongs to
 * a device other than comm->cudaDev; ncclSuccess otherwise.
 */
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  hipPointerAttribute_t attr;
  const hipError_t queryStatus = hipPointerGetAttributes(&attr, pointer);
  const bool hasDeviceAddress = (queryStatus == hipSuccess) && (attr.devicePointer != NULL);
  if (!hasDeviceAddress) {
    WARN("%s : %s is not a valid pointer", opname, ptrname);
    return ncclInvalidArgument;
  }

  // The attribute holding the memory kind is selected at compile time.
#if CUDART_VERSION >= 10000
  const bool isDeviceMemory = (attr.type == hipMemoryTypeDevice);
#else
  const bool isDeviceMemory = (attr.memoryType == hipMemoryTypeDevice);
#endif

  if (isDeviceMemory && attr.device != comm->cudaDev) {
    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}
/* Reject NULL arguments.
 *
 * ptr     : pointer to test.
 * opname  : operation name used in the warning text.
 * ptrname : argument name used in the warning text.
 *
 * Returns ncclSuccess for a non-NULL pointer; otherwise emits a warning and
 * returns ncclInvalidArgument.
 */
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
  if (ptr != NULL) return ncclSuccess;

  WARN("%s : %s argument is NULL", opname, ptrname);
  return ncclInvalidArgument;
}
/* Validate the arguments of a collective call and derive its byte count.
 *
 * info : description of the call. Besides being checked, it is updated:
 *        - nBytes is set to count * size of the datatype;
 *        - for AllGather and Broadcast, count becomes that byte count and
 *          datatype becomes ncclInt8;
 *        - for AllGather and ReduceScatter, nBytes is then multiplied by the
 *          number of ranks.
 *
 * Returns ncclInvalidArgument for a NULL communicator, an out-of-range root,
 * datatype or reduction operation, or (when comm->checkPointers is set) a
 * buffer rejected by CudaPtrCheck; ncclSuccess otherwise.
 */
ncclResult_t ArgsCheck(struct ncclInfo* info) {
  NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
  // First, the easy ones
  if (info->root < 0 || info->root >= info->comm->nRanks) {
    // The last valid root is nRanks-1; report that as the upper bound.
    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks-1);
    return ncclInvalidArgument;
  }
  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
    WARN("%s : invalid type %d", info->opName, info->datatype);
    return ncclInvalidArgument;
  }
  // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
  info->nBytes = info->count * ncclTypeSize(info->datatype);
  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
    info->count = info->nBytes;
    info->datatype = ncclInt8;
  }
  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank

  if (info->op < 0 || info->op >= ncclNumOps) {
    WARN("%s : invalid reduction operation %d", info->opName, info->op);
    return ncclInvalidArgument;
  }

  if (info->comm->checkPointers) {
    // Check CUDA device pointers
    // sendbuff is skipped only for Broadcast on ranks other than the root.
    if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
      NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
    }
    // recvbuff is skipped only for Reduce on ranks other than the root.
    if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
      NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
    }
  }
  return ncclSuccess;
}

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше